Skip to content

Casting

pl.Expr.cast serves as the primary function for type conversion in Polars. It includes a keyword argument, strict, which, by default, is set to True and will raise an exception if a conversion error occurs. Alternatively, you can set strict=False. In this case, if a conversion error occurs, the values will be set to null.

Setup

import pandas as pd
import polars as pl

# Sample data shared by every example: one column per casting scenario.
data = {
    "integers": [1, 2, 3, 4, 5],
    # Several values here are far outside the Int8 range (used by the overflow examples).
    "big_integers": [1, 10000002, 3, 10000004, 10000005],
    "floats": [4.0, 5.0, 6.0, 7.0, 8.0],
    "floats_with_decimal": [4.532, 5.5, 6.5, 7.5, 8.5],
    # Every entry is a string that parses as a float.
    "floats_as_string": ["4.0", "5.0", "6.0", "7.0", "8.0"],
    # The second entry cannot be parsed as a number.
    "strings_not_float": ["4.0", "not_a_number", "6.0", "7.0", "8.0"],
    "bools": [True, False, True, False, True],
}

# Build the Polars frame used by the Polars examples.
df_pl = pl.DataFrame(data)
print(df_pl)

shape: (5, 7)
┌──────────┬──────────────┬────────┬─────────────────────┬──────────────────┬───────────────────┬───────┐
│ integers ┆ big_integers ┆ floats ┆ floats_with_decimal ┆ floats_as_string ┆ strings_not_float ┆ bools │
│ ---      ┆ ---          ┆ ---    ┆ ---                 ┆ ---              ┆ ---               ┆ ---   │
│ i64      ┆ i64          ┆ f64    ┆ f64                 ┆ str              ┆ str               ┆ bool  │
╞══════════╪══════════════╪════════╪═════════════════════╪══════════════════╪═══════════════════╪═══════╡
│ 1        ┆ 1            ┆ 4.0    ┆ 4.532               ┆ 4.0              ┆ 4.0               ┆ true  │
│ 2        ┆ 10000002     ┆ 5.0    ┆ 5.5                 ┆ 5.0              ┆ not_a_number      ┆ false │
│ 3        ┆ 3            ┆ 6.0    ┆ 6.5                 ┆ 6.0              ┆ 6.0               ┆ true  │
│ 4        ┆ 10000004     ┆ 7.0    ┆ 7.5                 ┆ 7.0              ┆ 7.0               ┆ false │
│ 5        ┆ 10000005     ┆ 8.0    ┆ 8.5                 ┆ 8.0              ┆ 8.0               ┆ true  │
└──────────┴──────────────┴────────┴─────────────────────┴──────────────────┴───────────────────┴───────┘

# Build the pandas frame from the same data for a side-by-side comparison.
df_pd = pd.DataFrame(data)
# Print the dtypes followed by a blank line, then the frame itself.
print(df_pd.dtypes, end="\n" * 2)
print(df_pd)

integers                 int64
big_integers             int64
floats                 float64
floats_with_decimal    float64
floats_as_string        object
strings_not_float       object
bools                     bool
dtype: object

   integers  big_integers  floats  floats_with_decimal floats_as_string strings_not_float  bools
0         1             1     4.0                4.532              4.0               4.0   True
1         2      10000002     5.0                5.500              5.0      not_a_number  False
2         3             3     6.0                6.500              6.0               6.0   True
3         4      10000004     7.0                7.500              7.0               7.0  False
4         5      10000005     8.0                8.500              8.0               8.0   True

Numerics

# Convert between integer and float dtypes, naming each result column via alias.
out_pl = df_pl.select(
    pl.col("integers").cast(pl.Float32).alias("integers_as_floats"),
    pl.col("floats").cast(pl.Int32).alias("floats_as_integers"),
    # Fractional parts are dropped by this cast (e.g. 4.532 becomes 4 in the printed output).
    pl.col("floats_with_decimal")
    .cast(pl.Int32)
    .alias("floats_with_decimal_as_integers"),
)
print(out_pl)

shape: (5, 3)
┌────────────────────┬────────────────────┬─────────────────────────────────┐
│ integers_as_floats ┆ floats_as_integers ┆ floats_with_decimal_as_integers │
│ ---                ┆ ---                ┆ ---                             │
│ f32                ┆ i32                ┆ i32                             │
╞════════════════════╪════════════════════╪═════════════════════════════════╡
│ 1.0                ┆ 4                  ┆ 4                               │
│ 2.0                ┆ 5                  ┆ 5                               │
│ 3.0                ┆ 6                  ┆ 6                               │
│ 4.0                ┆ 7                  ┆ 7                               │
│ 5.0                ┆ 8                  ┆ 8                               │
└────────────────────┴────────────────────┴─────────────────────────────────┘

# Build the result directly from the converted source columns, so the frame
# contains only the three new columns, in the order listed here.
out_pd = pd.DataFrame(
    {
        "integers_as_floats": df_pd["integers"].astype("float32"),
        "floats_as_integers": df_pd["floats"].astype("int32"),
        "floats_with_decimal_as_integers": df_pd["floats_with_decimal"].astype("int32"),
    }
)
# Print the dtypes followed by a blank line, then the frame itself.
print(out_pd.dtypes, end="\n" * 2)
print(out_pd)

integers_as_floats                 float32
floats_as_integers                   int32
floats_with_decimal_as_integers      int32
dtype: object

   integers_as_floats  floats_as_integers  floats_with_decimal_as_integers
0                 1.0                   4                                4
1                 2.0                   5                                5
2                 3.0                   6                                6
3                 4.0                   7                                7
4                 5.0                   8                                8

Downcast

# Cast to narrower dtypes (Int16, Float32); the printed output shows the same values as the source columns.
out_pl = df_pl.select(
    pl.col("integers").cast(pl.Int16).alias("integers_smallfootprint"),
    pl.col("floats").cast(pl.Float32).alias("floats_smallfootprint"),
)
print(out_pl)

shape: (5, 2)
┌─────────────────────────┬───────────────────────┐
│ integers_smallfootprint ┆ floats_smallfootprint │
│ ---                     ┆ ---                   │
│ i16                     ┆ f32                   │
╞═════════════════════════╪═══════════════════════╡
│ 1                       ┆ 4.0                   │
│ 2                       ┆ 5.0                   │
│ 3                       ┆ 6.0                   │
│ 4                       ┆ 7.0                   │
│ 5                       ┆ 8.0                   │
└─────────────────────────┴───────────────────────┘

# Build the result directly from the narrowed source columns, so the frame
# contains only the two new columns.
out_pd = pd.DataFrame(
    {
        "integers_smallfootprint": df_pd["integers"].astype("int16"),
        "floats_smallfootprint": df_pd["floats"].astype("float32"),
    }
)
# Print the dtypes followed by a blank line, then the frame itself.
print(out_pd.dtypes, end="\n" * 2)
print(out_pd)

integers_smallfootprint      int16
floats_smallfootprint      float32
dtype: object

   integers_smallfootprint  floats_smallfootprint
0                        1                    4.0
1                        2                    5.0
2                        3                    6.0
3                        4                    7.0
4                        5                    8.0

Overflow

strict=True

# strict=True is the default; it is spelled out here so the snippet matches
# its heading and mirrors the strict=False variant. The values that do not
# fit in Int8 make the cast raise.
try:
    out_pl = df_pl.select(pl.col("big_integers").cast(pl.Int8, strict=True))
    print(out_pl)
except Exception as e:
    # Caught broadly only so the tutorial can display the error message.
    print(e)

strict conversion from `i64` to `i8` failed for column: big_integers, value(s) [10000002, 10000004, 10000005]; if you were trying to cast Utf8 to temporal dtypes, consider using `strptime`

strict=False

# With strict=False the values that do not fit in Int8 become null instead of raising.
out_pl = df_pl.select(pl.col("big_integers").cast(pl.Int8, strict=False))
print(out_pl)

shape: (5, 1)
┌──────────────┐
│ big_integers │
│ ---          │
│ i8           │
╞══════════════╡
│ 1            │
│ null         │
│ 3            │
│ null         │
│ null         │
└──────────────┘

pd.Series.astype(..)

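pd.Series.astype does not check for overflow: values that do not fit in int8 silently wrap around (for example, 10000002 becomes -126), which may not be what you expect.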

# Cast the single column directly and wrap it back into a one-column frame.
out_pd = df_pd["big_integers"].astype("int8").to_frame()

# Print the dtypes followed by a blank line, then the frame itself.
print(out_pd.dtypes, end="\n" * 2)
print(out_pd)

big_integers    int8
dtype: object

   big_integers
0             1
1          -126
2             3
3          -124
4          -123

pd.to_numeric(.., downcast=..)

Alternatively, pd.to_numeric can downcast the result to the smallest numerical dtype, of the kind requested by the downcast parameter, that can still hold every value.

# Downcast the single column and wrap it back into a one-column frame.
out_pd = pd.to_numeric(df_pd["big_integers"], downcast="integer").to_frame()

# Print the dtypes followed by a blank line, then the frame itself.
print(out_pd.dtypes, end="\n" * 2)
print(out_pd)

big_integers    int32
dtype: object

   big_integers
0             1
1      10000002
2             3
3      10000004
4      10000005
The type of big_integers is converted from int64 to int32.

Strings

Numeric values

# Cast numbers to strings (pl.Utf8) and numeric-looking strings to Float64.
out_pl = df_pl.select(
    pl.col("integers").cast(pl.Utf8),
    pl.col("floats").cast(pl.Utf8),
    pl.col("floats_as_string").cast(pl.Float64),
)
print(out_pl)

shape: (5, 3)
┌──────────┬────────┬──────────────────┐
│ integers ┆ floats ┆ floats_as_string │
│ ---      ┆ ---    ┆ ---              │
│ str      ┆ str    ┆ f64              │
╞══════════╪════════╪══════════════════╡
│ 1        ┆ 4.0    ┆ 4.0              │
│ 2        ┆ 5.0    ┆ 5.0              │
│ 3        ┆ 6.0    ┆ 6.0              │
│ 4        ┆ 7.0    ┆ 7.0              │
│ 5        ┆ 8.0    ┆ 8.0              │
└──────────┴────────┴──────────────────┘

# Select the three columns of interest, then convert each one with a
# per-column dtype mapping.
out_pd = df_pd[["integers", "floats", "floats_as_string"]].astype(
    {"integers": str, "floats": str, "floats_as_string": "float64"}
)
# Print the dtypes followed by a blank line, then the frame itself.
print(out_pd.dtypes, end="\n" * 2)
print(out_pd)

integers             object
floats               object
floats_as_string    float64
dtype: object

  integers floats  floats_as_string
0        1    4.0               4.0
1        2    5.0               5.0
2        3    6.0               6.0
3        4    7.0               7.0
4        5    8.0               8.0

Non-numeric values

  • In this example, pl.col().cast(.., strict=True) in Polars behaves similarly to pd.to_numeric(.., errors="raise") in Pandas.

  • In this example, pl.col().cast(.., strict=False) in Polars behaves similarly to pd.to_numeric(.., errors="coerce") in Pandas.

strict=True

# strict=True is the default; it is spelled out here so the snippet matches
# its heading and mirrors the strict=False variant. The unparsable entry
# makes the cast raise.
try:
    out_pl = df_pl.select(pl.col("strings_not_float").cast(pl.Float64, strict=True))
    print(out_pl)
except Exception as e:
    # Caught broadly only so the tutorial can display the error message.
    print(e)

strict conversion from `str` to `f64` failed for column: strings_not_float, value(s) ["not_a_number"]; if you were trying to cast Utf8 to temporal dtypes, consider using `strptime`

strict=False

# With strict=False the unparsable entry becomes null instead of raising.
out_pl = df_pl.select(pl.col("strings_not_float").cast(pl.Float64, strict=False))
print(out_pl)

shape: (5, 1)
┌───────────────────┐
│ strings_not_float │
│ ---               │
│ f64               │
╞═══════════════════╡
│ 4.0               │
│ null              │
│ 6.0               │
│ 7.0               │
│ 8.0               │
└───────────────────┘

errors="raise"

# errors="raise" is the default of pd.to_numeric; it is spelled out here so
# the snippet matches its heading and mirrors the errors="coerce" variant.
try:
    out_pd = df_pd.assign(
        strings_not_float=lambda df_: pd.to_numeric(
            df_.strings_not_float, errors="raise"
        )
    ).drop(columns=df_pd.columns.drop(["strings_not_float"]))
    print(out_pd)
except Exception as e:
    # Caught broadly only so the tutorial can display the error message.
    print(e)

Unable to parse string "not_a_number" at position 1

errors="coerce"

# Convert the single column, letting the unparsable entry become NaN, and
# wrap the result back into a one-column frame.
out_pd = pd.to_numeric(df_pd["strings_not_float"], errors="coerce").to_frame()
# Print the dtypes followed by a blank line, then the frame itself.
print(out_pd.dtypes, end="\n" * 2)
print(out_pd)

strings_not_float    float64
dtype: object

   strings_not_float
0                4.0
1                NaN
2                6.0
3                7.0
4                8.0

Booleans

# Cast numeric columns to Boolean; every value in both columns is non-zero
# and each one comes out as true.
out_pl = df_pl.select(
    pl.col("integers").cast(pl.Boolean), pl.col("floats").cast(pl.Boolean)
)
print(out_pl)

shape: (5, 2)
┌──────────┬────────┐
│ integers ┆ floats │
│ ---      ┆ ---    │
│ bool     ┆ bool   │
╞══════════╪════════╡
│ true     ┆ true   │
│ true     ┆ true   │
│ true     ┆ true   │
│ true     ┆ true   │
│ true     ┆ true   │
└──────────┴────────┘

# Select the two numeric columns and convert both to bool in one call.
out_pd = df_pd[["integers", "floats"]].astype(bool)
print(out_pd)

   integers  floats
0      True    True
1      True    True
2      True    True
3      True    True
4      True    True

Reference

The examples in this section have been adapted from the Polars user guide.