Missing data

In Polars, missing data is consistently represented as a null value. In addition, Polars permits Not a Number (NaN) values in float columns. It is important not to conflate these two concepts.
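
As a quick, self-contained illustration of the distinction (a minimal sketch, not part of the original setup): in a float column, a Python None becomes a null, while float("nan") stays a NaN, and is_null and is_nan flag them separately.

import polars as pl

s = pl.Series("x", [1.0, None, float("nan")])
print(s.is_null())  # true only for the None entry (the null)
print(s.is_nan())   # true only for the float("nan") entry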

Setup

import numpy as np
import pandas as pd
import polars as pl

data = {"col1": [1, 2, 3], "col2": [1, None, 9]}

df_pl = pl.DataFrame(data)
print(df_pl)

shape: (3, 2)
┌──────┬──────┐
│ col1 ┆ col2 │
│ ---  ┆ ---  │
│ i64  ┆ i64  │
╞══════╪══════╡
│ 1    ┆ 1    │
│ 2    ┆ null │
│ 3    ┆ 9    │
└──────┴──────┘

Please note that in Pandas, col2 is of data type float64: the missing value is stored as NaN, which forces the column to a float dtype (whereas Polars keeps col2 as i64 and stores the missing value as null).

df_pd = pd.DataFrame(data)
print(df_pd.dtypes, end="\n" * 2)
print(df_pd)

col1      int64
col2    float64
dtype: object

   col1  col2
0     1   1.0
1     2   NaN
2     3   9.0
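
If keeping an integer dtype in Pandas matters, one option (an aside, not part of the original example) is the nullable Int64 extension dtype, which stores the missing value as pd.NA instead of NaN:

df_pd_nullable = df_pd.astype({"col2": "Int64"})
print(df_pd_nullable.dtypes, end="\n" * 2)
print(df_pd_nullable)  # col2 is Int64 and the missing value prints as <NA>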

Missing data metadata

Is a missing value

is_null_df_pl = df_pl.select(pl.all().is_null())
print(is_null_df_pl)

shape: (3, 2)
┌───────┬───────┐
│ col1  ┆ col2  │
│ ---   ┆ ---   │
│ bool  ┆ bool  │
╞═══════╪═══════╡
│ false ┆ false │
│ false ┆ true  │
│ false ┆ false │
└───────┴───────┘

is_nan_df_pd = df_pd.isna()
print(is_nan_df_pd)

    col1   col2
0  False  False
1  False   True
2  False  False

Count the missing values

null_count_df_pl = df_pl.null_count()
print(null_count_df_pl)

shape: (1, 2)
┌──────┬──────┐
│ col1 ┆ col2 │
│ ---  ┆ ---  │
│ u32  ┆ u32  │
╞══════╪══════╡
│ 0    ┆ 1    │
└──────┴──────┘

nan_count_df_pd = df_pd.isna().sum().to_frame().T
print(nan_count_df_pd)

   col1  col2
0     0     1

Filling missing data

Fill with specified literal value

fill_literal_df_pl = df_pl.with_columns(pl.col("col2").fill_null(pl.lit(2)))
print(fill_literal_df_pl)

shape: (3, 2)
┌──────┬──────┐
│ col1 ┆ col2 │
│ ---  ┆ ---  │
│ i64  ┆ i64  │
╞══════╪══════╡
│ 1    ┆ 1    │
│ 2    ┆ 2    │
│ 3    ┆ 9    │
└──────┴──────┘

fill_literal_df_pd = df_pd.assign(col2=lambda df_: df_.col2.fillna(2))
print(fill_literal_df_pd)

   col1  col2
0     1   1.0
1     2   2.0
2     3   9.0

Fill with a strategy

fill_forward_df_pl = df_pl.with_columns(pl.col("col2").fill_null(strategy="forward"))
print(fill_forward_df_pl)

shape: (3, 2)
┌──────┬──────┐
│ col1 ┆ col2 │
│ ---  ┆ ---  │
│ i64  ┆ i64  │
╞══════╪══════╡
│ 1    ┆ 1    │
│ 2    ┆ 1    │
│ 3    ┆ 9    │
└──────┴──────┘

fill_forward_df_pd = df_pd.assign(col2=lambda df_: df_.col2.ffill())
print(fill_forward_df_pd)

   col1  col2
0     1   1.0
1     2   1.0
2     3   9.0
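
fill_null also accepts other strategies, such as "backward", "min", "max", "mean", "zero", and "one". A minimal sketch using the backward strategy (the Pandas counterpart being bfill; the variable names here simply follow the pattern above):

fill_backward_df_pl = df_pl.with_columns(pl.col("col2").fill_null(strategy="backward"))
print(fill_backward_df_pl)  # the null in col2 is filled with the next non-null value, 9

fill_backward_df_pd = df_pd.assign(col2=lambda df_: df_.col2.bfill())
print(fill_backward_df_pd)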

Fill with an expression

fill_median_df_pl = df_pl.with_columns(pl.col("col2").fill_null(pl.median("col2")))
print(fill_median_df_pl)

shape: (3, 2)
┌──────┬──────┐
│ col1 ┆ col2 │
│ ---  ┆ ---  │
│ i64  ┆ f64  │
╞══════╪══════╡
│ 1    ┆ 1.0  │
│ 2    ┆ 5.0  │
│ 3    ┆ 9.0  │
└──────┴──────┘

fill_median_df_pd = df_pd.assign(col2=lambda df_: df_.col2.fillna(df_.col2.median()))
print(fill_median_df_pd)

   col1  col2
0     1   1.0
1     2   5.0
2     3   9.0

Fill with interpolation

fill_interpolation_df_pl = df_pl.with_columns(pl.col("col2").interpolate())
print(fill_interpolation_df_pl)

shape: (3, 2)
┌──────┬──────┐
│ col1 ┆ col2 │
│ ---  ┆ ---  │
│ i64  ┆ f64  │
╞══════╪══════╡
│ 1    ┆ 1.0  │
│ 2    ┆ 5.0  │
│ 3    ┆ 9.0  │
└──────┴──────┘

fill_interpolation_df_pd = df_pd.assign(col2=lambda df_: df_.col2.interpolate())
print(fill_interpolation_df_pd)

   col1  col2
0     1   1.0
1     2   5.0
2     3   9.0

NaN values

Similar to is_null and fill_null for null values, Polars provides is_nan and fill_nan to work with NaN values. Note, however, that there is no nan_count in Polars; counting NaN values requires the small workaround shown below.

These NaN values can be created from NumPy's np.nan or the native Python float("nan").

nan_df_pl = pl.DataFrame({"value": [1.0, np.nan, float("nan"), 3.0]})
print(nan_df_pl)
shape: (4, 1)
┌───────┐
│ value │
│ ---   │
│ f64   │
╞═══════╡
│ 1.0   │
│ NaN   │
│ NaN   │
│ 3.0   │
└───────┘

Is a NaN value

is_nan_df_pl = nan_df_pl.select(pl.all().is_nan())
print(is_nan_df_pl)
shape: (4, 1)
┌───────┐
│ value │
│ ---   │
│ bool  │
╞═══════╡
│ false │
│ true  │
│ true  │
│ false │
└───────┘

Count the NaN values

nan_count_df_pl = nan_df_pl.select(pl.all().is_nan()).sum()
print(nan_count_df_pl)
shape: (1, 1)
┌───────┐
│ value │
│ ---   │
│ u32   │
╞═══════╡
│ 2     │
└───────┘

Filling NaN

fill_literal_nan_df_pl = nan_df_pl.with_columns(pl.col("value").fill_nan(pl.lit(2)))
print(fill_literal_nan_df_pl)
shape: (4, 1)
┌───────┐
│ value │
│ ---   │
│ f64   │
╞═══════╡
│ 1.0   │
│ 2.0   │
│ 2.0   │
│ 3.0   │
└───────┘

Calculating the mean and median values

When calculating the mean or median of a column that contains NaN values, the result will be NaN:

mean_nan_df_pl = nan_df_pl.mean()
print(mean_nan_df_pl)

shape: (1, 1)
┌───────┐
│ value │
│ ---   │
│ f64   │
╞═══════╡
│ NaN   │
└───────┘

One trick to change this behaviour is to replace the NaN values with null values; null values are excluded when the mean or median of a column is calculated.

mean_nan_fill_null_df_pl = nan_df_pl.with_columns(pl.col("value").fill_nan(None)).mean()
print(mean_nan_fill_null_df_pl)

shape: (1, 1)
┌───────┐
│ value │
│ ---   │
│ f64   │
╞═══════╡
│ 2.0   │
└───────┘

pd.NaT

It's worth noting that Pandas has a special pd.NaT value, which serves as the missing-value marker for datetime-like data, the temporal equivalent of NaN.

df_pd_nat = pd.DataFrame([pd.Timestamp("2023"), np.nan], columns=["col"])
print(df_pd_nat.dtypes, end="\n" * 2)
print(df_pd_nat)
col    datetime64[ns]
dtype: object

         col
0 2023-01-01
1        NaT
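
Polars, by contrast, has no separate temporal sentinel: a missing datetime is simply a null. A minimal sketch for comparison:

from datetime import datetime

df_pl_nat = pl.DataFrame({"col": [datetime(2023, 1, 1), None]})
print(df_pl_nat)  # the column is a Datetime dtype and the missing entry prints as null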

More about filling with interpolation

While Polars provides only the linear and nearest interpolation methods, Pandas offers a broader range.

data2 = {"col1": np.random.rand(10)}

df_pl2 = (
    pl.DataFrame(data2)
    .with_row_count("row_nr")
    .select(pl.when(~pl.col("row_nr").is_in([4, 5, 6])).then(pl.col("col1")))
)
print(df_pl2)

shape: (10, 1)
┌──────────┐
│ col1     │
│ ---      │
│ f64      │
╞══════════╡
│ 0.155995 │
│ 0.058084 │
│ 0.866176 │
│ 0.601115 │
│ …        │
│ null     │
│ 0.832443 │
│ 0.212339 │
│ 0.181825 │
└──────────┘

df_pd2 = pd.DataFrame(data2).assign(
    col1=lambda df_: df_.col1.where(~df_.index.isin([4, 5, 6]), np.nan)
)
print(df_pd2)

       col1
0  0.155995
1  0.058084
2  0.866176
3  0.601115
4       NaN
5       NaN
6       NaN
7  0.832443
8  0.212339
9  0.181825

Several interpolation methods in Pandas' df.interpolate, such as "nearest", "quadratic", "polynomial", and "spline", are adopted from the SciPy package and require SciPy to be installed.

out_pl = df_pl2.with_columns(
    linear=pl.col("col1").interpolate(method="linear"),
    nearest=pl.col("col1").interpolate(method="nearest"),
)
print(out_pl)

shape: (10, 3)
┌──────────┬──────────┬──────────┐
│ col1     ┆ linear   ┆ nearest  │
│ ---      ┆ ---      ┆ ---      │
│ f64      ┆ f64      ┆ f64      │
╞══════════╪══════════╪══════════╡
│ 0.155995 ┆ 0.155995 ┆ 0.155995 │
│ 0.058084 ┆ 0.058084 ┆ 0.058084 │
│ 0.866176 ┆ 0.866176 ┆ 0.866176 │
│ 0.601115 ┆ 0.601115 ┆ 0.601115 │
│ …        ┆ …        ┆ …        │
│ null     ┆ 0.774611 ┆ 0.832443 │
│ 0.832443 ┆ 0.832443 ┆ 0.832443 │
│ 0.212339 ┆ 0.212339 ┆ 0.212339 │
│ 0.181825 ┆ 0.181825 ┆ 0.181825 │
└──────────┴──────────┴──────────┘

out_pd = df_pd2.assign(
    linear=lambda df_: df_.col1.interpolate(method="linear"),
    nearest=lambda df_: df_.col1.interpolate(method="nearest"),
    quadratic=lambda df_: df_.col1.interpolate(method="quadratic"),
    poly_order3=lambda df_: df_.col1.interpolate(method="polynomial", order=3),
    spline_order5=lambda df_: df_.col1.interpolate(method="spline", order=5),
)
print(out_pd)

       col1    linear   nearest  quadratic  poly_order3  spline_order5
0  0.155995  0.155995  0.155995   0.155995     0.155995       0.155995
1  0.058084  0.058084  0.058084   0.058084     0.058084       0.058084
2  0.866176  0.866176  0.866176   0.866176     0.866176       0.866176
3  0.601115  0.601115  0.601115   0.601115     0.601115       0.601115
4       NaN  0.658947  0.601115   0.494938     0.517777       1.084757
5       NaN  0.716779  0.601115   0.815400     0.767544       1.214026
6       NaN  0.774611  0.832443   1.086551     0.991928       1.100660
7  0.832443  0.832443  0.832443   0.832443     0.832443       0.832443
8  0.212339  0.212339  0.212339   0.212339     0.212339       0.212339
9  0.181825  0.181825  0.181825   0.181825     0.181825       0.181825

Reference

The examples in this section have been adapted from the Polars user guide.