Skip to content

Time series

Setup

It is worth noting that:

  • The datapoint for 2023-11-05 is intentionally excluded in both df_pl and df_pd.
  • Some datapoints are deliberately set as null in Polars and as NaN in Pandas.
  • The results of quantiles are not consistent between Polars and Pandas. This discrepancy arises from the default interpolation method for quantiles, which is nearest in Polars and linear in Pandas.
from datetime import date, datetime, timedelta
from functools import lru_cache

import numpy as np
import pandas as pd
import polars as pl
from zoneinfo import ZoneInfo

# Seed NumPy's global RNG so the random `nrs` values are reproducible.
np.random.seed(42)
# Polars display settings used when printing frames (string length, row count).
pl.Config.set_fmt_str_lengths(50)
pl.Config.set_tbl_rows(50)
# Time zone name (handed to Polars/Pandas as a string) and the matching tzinfo
# object (used to build timezone-aware `datetime` values).
_tzinfo = "Asia/Taipei"
tzinfo = ZoneInfo(_tzinfo)


@lru_cache
def get_nrs(n):
    """Return `n` random floats drawn from NumPy's global RNG, scaled by 100.

    The result is memoized per `n`, so repeated calls with the same `n` hand
    back the very same array. This lets the Polars and Pandas frames be filled
    with identical values.
    """
    unit_samples = np.random.rand(n)
    return unit_samples * 100

# Polars frame: one timezone-aware row every 8 hours from 2023-11-01
# (inclusive) to 2023-11-08 (exclusive).
df_pl = (
    pl.datetime_range(
        date(2023, 11, 1),
        date(2023, 11, 8),
        timedelta(hours=8),
        closed="left",
        time_unit="ms",
        time_zone=_tzinfo,
        eager=True,
    )
    # Name the series explicitly: the expressions below select it as
    # "datetime", and the implicit name of an eager range may differ across
    # Polars versions.
    .alias("datetime")
    .to_frame()
    # Drop every datapoint on the 5th so the series has a gap.
    .filter(~pl.col("datetime").dt.day().is_in([5]))
    .pipe(
        lambda df_: df_.with_columns(
            # No `otherwise` branch: rows on the 6th end up as null.
            nrs=pl.when(~pl.col("datetime").dt.day().is_in([6])).then(
                pl.lit(get_nrs(df_.height))
            )
        )
    )
)

print(df_pl)

shape: (18, 2)
┌───────────────────────────┬───────────┐
│ datetime                  ┆ nrs       │
│ ---                       ┆ ---       │
│ datetime[ms, Asia/Taipei] ┆ f64       │
╞═══════════════════════════╪═══════════╡
│ 2023-11-01 00:00:00 CST   ┆ 37.454012 │
│ 2023-11-01 08:00:00 CST   ┆ 95.071431 │
│ 2023-11-01 16:00:00 CST   ┆ 73.199394 │
│ 2023-11-02 00:00:00 CST   ┆ 59.865848 │
│ 2023-11-02 08:00:00 CST   ┆ 15.601864 │
│ 2023-11-02 16:00:00 CST   ┆ 15.599452 │
│ 2023-11-03 00:00:00 CST   ┆ 5.808361  │
│ 2023-11-03 08:00:00 CST   ┆ 86.617615 │
│ 2023-11-03 16:00:00 CST   ┆ 60.111501 │
│ 2023-11-04 00:00:00 CST   ┆ 70.807258 │
│ 2023-11-04 08:00:00 CST   ┆ 2.058449  │
│ 2023-11-04 16:00:00 CST   ┆ 96.990985 │
│ 2023-11-06 00:00:00 CST   ┆ null      │
│ 2023-11-06 08:00:00 CST   ┆ null      │
│ 2023-11-06 16:00:00 CST   ┆ null      │
│ 2023-11-07 00:00:00 CST   ┆ 18.340451 │
│ 2023-11-07 08:00:00 CST   ┆ 30.424224 │
│ 2023-11-07 16:00:00 CST   ┆ 52.475643 │
└───────────────────────────┴───────────┘

# Pandas counterpart of the Polars frame: the same 8-hourly, timezone-aware
# grid, indexed by "datetime".
df_pd = (
    pd.DataFrame(
        {
            "datetime": pd.date_range(
                start="2023-11-01",
                end="2023-11-08",
                freq="8h",
                inclusive="left",
                tz=_tzinfo,
            )
        }
    )
    # Drop every datapoint on the 5th so the series has a gap.
    .loc[lambda df_: ~df_["datetime"].dt.day.isin([5])]
    .assign(
        # Rows on the 6th become NaN; all others take the cached random values.
        nrs=lambda df_: np.where(
            df_["datetime"].dt.day.isin([6]), np.nan, get_nrs(len(df_))
        )
    )
    .set_index("datetime")
)
print(df_pd.shape)
print(df_pd)

(18, 1)
                                 nrs
datetime                            
2023-11-01 00:00:00+08:00  37.454012
2023-11-01 08:00:00+08:00  95.071431
2023-11-01 16:00:00+08:00  73.199394
2023-11-02 00:00:00+08:00  59.865848
2023-11-02 08:00:00+08:00  15.601864
2023-11-02 16:00:00+08:00  15.599452
2023-11-03 00:00:00+08:00   5.808361
2023-11-03 08:00:00+08:00  86.617615
2023-11-03 16:00:00+08:00  60.111501
2023-11-04 00:00:00+08:00  70.807258
2023-11-04 08:00:00+08:00   2.058449
2023-11-04 16:00:00+08:00  96.990985
2023-11-06 00:00:00+08:00        NaN
2023-11-06 08:00:00+08:00        NaN
2023-11-06 16:00:00+08:00        NaN
2023-11-07 00:00:00+08:00  18.340451
2023-11-07 08:00:00+08:00  30.424224
2023-11-07 16:00:00+08:00  52.475643

# Summary statistics for the Polars frame; null entries are reported in their
# own `null_count` row.
print(df_pl.describe())

shape: (9, 3)
┌────────────┬───────────────────────────┬───────────┐
│ describe   ┆ datetime                  ┆ nrs       │
│ ---        ┆ ---                       ┆ ---       │
│ str        ┆ str                       ┆ f64       │
╞════════════╪═══════════════════════════╪═══════════╡
│ count      ┆ 18                        ┆ 18.0      │
│ null_count ┆ 0                         ┆ 3.0       │
│ mean       ┆ null                      ┆ 48.028433 │
│ std        ┆ null                      ┆ 32.579411 │
│ min        ┆ 2023-11-01 00:00:00+08:00 ┆ 2.058449  │
│ 25%        ┆ null                      ┆ 15.601864 │
│ 50%        ┆ null                      ┆ 52.475643 │
│ 75%        ┆ null                      ┆ 73.199394 │
│ max        ┆ 2023-11-07 16:00:00+08:00 ┆ 96.990985 │
└────────────┴───────────────────────────┴───────────┘

# Move the DatetimeIndex back into a regular column so that describe() also
# summarizes it; include="all" covers every column regardless of dtype.
print(df_pd.reset_index().describe(include="all"))

                        datetime        nrs
count                         18  15.000000
mean   2023-11-04 04:00:00+08:00  48.028433
min    2023-11-01 00:00:00+08:00   2.058449
25%    2023-11-02 10:00:00+08:00  16.971158
50%    2023-11-03 20:00:00+08:00  52.475643
75%    2023-11-06 06:00:00+08:00  72.003326
max    2023-11-07 16:00:00+08:00  96.990985
std                          NaN  32.579411

Slice time series

Polars relies on expressions to slice time series data, while Pandas can utilize the DatetimeIndex.

# Half-open window [2023-11-01, 2023-11-04): closed="left" keeps the lower
# bound and drops the upper bound.
t_start = datetime(year=2023, month=11, day=1, tzinfo=tzinfo)
t_end = datetime(year=2023, month=11, day=4, tzinfo=tzinfo)
out_pl = df_pl.filter(
    pl.col("datetime").is_between(
        lower_bound=t_start,
        upper_bound=t_end,
        closed="left",
    )
)
print(out_pl)

shape: (9, 2)
┌───────────────────────────┬───────────┐
│ datetime                  ┆ nrs       │
│ ---                       ┆ ---       │
│ datetime[ms, Asia/Taipei] ┆ f64       │
╞═══════════════════════════╪═══════════╡
│ 2023-11-01 00:00:00 CST   ┆ 37.454012 │
│ 2023-11-01 08:00:00 CST   ┆ 95.071431 │
│ 2023-11-01 16:00:00 CST   ┆ 73.199394 │
│ 2023-11-02 00:00:00 CST   ┆ 59.865848 │
│ 2023-11-02 08:00:00 CST   ┆ 15.601864 │
│ 2023-11-02 16:00:00 CST   ┆ 15.599452 │
│ 2023-11-03 00:00:00 CST   ┆ 5.808361  │
│ 2023-11-03 08:00:00 CST   ┆ 86.617615 │
│ 2023-11-03 16:00:00 CST   ┆ 60.111501 │
└───────────────────────────┴───────────┘

# Label-based slicing on the DatetimeIndex: both end labels are included, so
# every datapoint on 2023-11-03 is kept.
out_pd = df_pd.loc[slice("2023-11-01", "2023-11-03"), :]
print(out_pd.shape)
print(out_pd)

(9, 1)
                                 nrs
datetime                            
2023-11-01 00:00:00+08:00  37.454012
2023-11-01 08:00:00+08:00  95.071431
2023-11-01 16:00:00+08:00  73.199394
2023-11-02 00:00:00+08:00  59.865848
2023-11-02 08:00:00+08:00  15.601864
2023-11-02 16:00:00+08:00  15.599452
2023-11-03 00:00:00+08:00   5.808361
2023-11-03 08:00:00+08:00  86.617615
2023-11-03 16:00:00+08:00  60.111501

Resampling data by day

Polars employs df.group_by_dynamic for time series resampling, while Pandas provides the df.resample method. If you need continuous datetime datapoints, similar to Pandas, you may find df.upsample in Polars to be useful.

# Daily mean; only days that actually contain datapoints produce a row.
out_pl = (
    df_pl
    .group_by_dynamic(index_column="datetime", every="1d")
    .agg(pl.col("nrs").mean())
)
print(out_pl)

shape: (6, 2)
┌───────────────────────────┬───────────┐
│ datetime                  ┆ nrs       │
│ ---                       ┆ ---       │
│ datetime[ms, Asia/Taipei] ┆ f64       │
╞═══════════════════════════╪═══════════╡
│ 2023-11-01 00:00:00 CST   ┆ 68.574946 │
│ 2023-11-02 00:00:00 CST   ┆ 30.355721 │
│ 2023-11-03 00:00:00 CST   ┆ 50.845826 │
│ 2023-11-04 00:00:00 CST   ┆ 56.618897 │
│ 2023-11-06 00:00:00 CST   ┆ null      │
│ 2023-11-07 00:00:00 CST   ┆ 33.746773 │
└───────────────────────────┴───────────┘

# Daily mean, then upsample to a continuous daily grid so the missing day
# (2023-11-05) shows up as a null row.
out_pl = (
    df_pl
    .group_by_dynamic(index_column="datetime", every="1d")
    .agg(pl.col("nrs").mean())
    .upsample(time_column="datetime", every="1d")
)
print(out_pl)

shape: (7, 2)
┌───────────────────────────┬───────────┐
│ datetime                  ┆ nrs       │
│ ---                       ┆ ---       │
│ datetime[ms, Asia/Taipei] ┆ f64       │
╞═══════════════════════════╪═══════════╡
│ 2023-11-01 00:00:00 CST   ┆ 68.574946 │
│ 2023-11-02 00:00:00 CST   ┆ 30.355721 │
│ 2023-11-03 00:00:00 CST   ┆ 50.845826 │
│ 2023-11-04 00:00:00 CST   ┆ 56.618897 │
│ 2023-11-05 00:00:00 CST   ┆ null      │
│ 2023-11-06 00:00:00 CST   ┆ null      │
│ 2023-11-07 00:00:00 CST   ┆ 33.746773 │
└───────────────────────────┴───────────┘

# Daily mean on the DatetimeIndex; days without data appear as NaN rows.
out_pd = (
    df_pd
    .resample(rule="D")
    .mean()
)
print(out_pd.shape)
print(out_pd)

(7, 1)
                                 nrs
datetime                            
2023-11-01 00:00:00+08:00  68.574946
2023-11-02 00:00:00+08:00  30.355721
2023-11-03 00:00:00+08:00  50.845826
2023-11-04 00:00:00+08:00  56.618897
2023-11-05 00:00:00+08:00        NaN
2023-11-06 00:00:00+08:00        NaN
2023-11-07 00:00:00+08:00  33.746773

Resampling data by custom time interval

At the time of writing, a complex combination of time units, such as 3d12h4m25s, will fail. In addition, to achieve behavior similar to Pandas, you might need to tweak the offset or start_by parameters of df.group_by_dynamic in Polars.

# 16h30m windows; `offset` shifts the window boundaries so that they line up
# with the first datapoint, then upsample fills the empty windows with nulls.
out_pl = (
    df_pl
    .group_by_dynamic(index_column="datetime", every="16h30m", offset="4h30m")
    .agg(pl.col("nrs").mean())
    .upsample(time_column="datetime", every="16h30m")
)
print(out_pl)

shape: (10, 2)
┌───────────────────────────┬───────────┐
│ datetime                  ┆ nrs       │
│ ---                       ┆ ---       │
│ datetime[ms, Asia/Taipei] ┆ f64       │
╞═══════════════════════════╪═══════════╡
│ 2023-11-01 00:00:00 CST   ┆ 68.574946 │
│ 2023-11-01 16:30:00 CST   ┆ 37.733856 │
│ 2023-11-02 09:00:00 CST   ┆ 10.703907 │
│ 2023-11-03 01:30:00 CST   ┆ 73.364558 │
│ 2023-11-03 18:00:00 CST   ┆ 36.432854 │
│ 2023-11-04 10:30:00 CST   ┆ 96.990985 │
│ 2023-11-05 03:00:00 CST   ┆ null      │
│ 2023-11-05 19:30:00 CST   ┆ null      │
│ 2023-11-06 12:00:00 CST   ┆ 18.340451 │
│ 2023-11-07 04:30:00 CST   ┆ 41.449934 │
└───────────────────────────┴───────────┘

# Same 16h30m windows, but anchored with start_by="datapoint" instead of a
# hand-computed offset; upsample again fills the empty windows with nulls.
out_pl = (
    df_pl
    .group_by_dynamic(
        index_column="datetime",
        every="16h30m",
        start_by="datapoint",
    )
    .agg(pl.col("nrs").mean())
    .upsample(time_column="datetime", every="16h30m")
)
print(out_pl)

shape: (10, 2)
┌───────────────────────────┬───────────┐
│ datetime                  ┆ nrs       │
│ ---                       ┆ ---       │
│ datetime[ms, Asia/Taipei] ┆ f64       │
╞═══════════════════════════╪═══════════╡
│ 2023-11-01 00:00:00 CST   ┆ 68.574946 │
│ 2023-11-01 16:30:00 CST   ┆ 37.733856 │
│ 2023-11-02 09:00:00 CST   ┆ 10.703907 │
│ 2023-11-03 01:30:00 CST   ┆ 73.364558 │
│ 2023-11-03 18:00:00 CST   ┆ 36.432854 │
│ 2023-11-04 10:30:00 CST   ┆ 96.990985 │
│ 2023-11-05 03:00:00 CST   ┆ null      │
│ 2023-11-05 19:30:00 CST   ┆ null      │
│ 2023-11-06 12:00:00 CST   ┆ 18.340451 │
│ 2023-11-07 04:30:00 CST   ┆ 41.449934 │
└───────────────────────────┴───────────┘

# Mean over 16h30min bins. The hour alias is written in lowercase ("h"), the
# spelling already used for "8h" and "12h" in this file; the uppercase "H"
# spelling is deprecated in recent pandas releases.
out_pd = df_pd.resample("16h30min").mean()
print(out_pd.shape)
print(out_pd)

(10, 1)
                                 nrs
datetime                            
2023-11-01 00:00:00+08:00  68.574946
2023-11-01 16:30:00+08:00  37.733856
2023-11-02 09:00:00+08:00  10.703907
2023-11-03 01:30:00+08:00  73.364558
2023-11-03 18:00:00+08:00  36.432854
2023-11-04 10:30:00+08:00  96.990985
2023-11-05 03:00:00+08:00        NaN
2023-11-05 19:30:00+08:00        NaN
2023-11-06 12:00:00+08:00  18.340451
2023-11-07 04:30:00+08:00  41.449934

Resampling data by custom ending

We created new dataframes with more data points for Polars and Pandas, respectively, to provide a clearer illustration.

# Larger Polars frame: one timezone-aware row every 12 hours covering
# 2023-01-01 (inclusive) to 2025-01-01 (exclusive).
df_pl2 = (
    pl.datetime_range(
        date(2023, 1, 1),
        date(2025, 1, 1),
        timedelta(hours=12),
        closed="left",
        time_unit="ms",
        time_zone=_tzinfo,
        eager=True,
    )
    # Name the series explicitly: later snippets refer to the column as
    # "datetime", and the implicit name of an eager range may differ across
    # Polars versions.
    .alias("datetime")
    .to_frame()
    .pipe(lambda df_: df_.with_columns(nrs=pl.lit(get_nrs(df_.height))))
)
# Temporarily limit the printed table to 10 rows for this large frame.
with pl.Config(tbl_rows=10):
    print(df_pl2)

shape: (1_462, 2)
┌───────────────────────────┬───────────┐
│ datetime                  ┆ nrs       │
│ ---                       ┆ ---       │
│ datetime[ms, Asia/Taipei] ┆ f64       │
╞═══════════════════════════╪═══════════╡
│ 2023-01-01 00:00:00 CST   ┆ 43.194502 │
│ 2023-01-01 12:00:00 CST   ┆ 29.122914 │
│ 2023-01-02 00:00:00 CST   ┆ 61.185289 │
│ 2023-01-02 12:00:00 CST   ┆ 13.949386 │
│ 2023-01-03 00:00:00 CST   ┆ 29.214465 │
│ …                         ┆ …         │
│ 2024-12-29 12:00:00 CST   ┆ 66.759339 │
│ 2024-12-30 00:00:00 CST   ┆ 80.784594 │
│ 2024-12-30 12:00:00 CST   ┆ 76.228513 │
│ 2024-12-31 00:00:00 CST   ┆ 79.781365 │
│ 2024-12-31 12:00:00 CST   ┆ 43.558331 │
└───────────────────────────┴───────────┘

# Larger Pandas frame on the same 12-hourly, timezone-aware grid, filled with
# the cached random values and indexed by "datetime".
df_pd2 = (
    pd.DataFrame(
        {
            "datetime": pd.date_range(
                start="2023-01-01",
                end="2025-01-01",
                freq="12h",
                inclusive="left",
                tz=_tzinfo,
            )
        }
    )
    .assign(nrs=lambda df_: get_nrs(len(df_)))
    .set_index("datetime")
)
print(df_pd2.shape)
print(df_pd2)

(1462, 1)
                                 nrs
datetime                            
2023-01-01 00:00:00+08:00  43.194502
2023-01-01 12:00:00+08:00  29.122914
2023-01-02 00:00:00+08:00  61.185289
2023-01-02 12:00:00+08:00  13.949386
2023-01-03 00:00:00+08:00  29.214465
...                              ...
2024-12-29 12:00:00+08:00  66.759339
2024-12-30 00:00:00+08:00  80.784594
2024-12-30 12:00:00+08:00  76.228513
2024-12-31 00:00:00+08:00  79.781365
2024-12-31 12:00:00+08:00  43.558331

[1462 rows x 1 columns]

In Pandas, this can be achieved with anchored offset aliases. To replicate this behavior in Polars, you might need to make some adjustments.

# Quarterly windows shifted back by two months, labelled by their right edge;
# stepping the label back one day turns it into the last day of the quarter
# (e.g. 2023-01-31), matching the Pandas output.
out_pl = (
    df_pl2
    .group_by_dynamic(
        index_column="datetime",
        every="1q",
        offset="-2mo",
        label="right",
    )
    .agg(pl.col("nrs").mean())
    .with_columns(pl.col("datetime").dt.offset_by(by="-1d"))
)
print(out_pl)

shape: (9, 2)
┌───────────────────────────┬───────────┐
│ datetime                  ┆ nrs       │
│ ---                       ┆ ---       │
│ datetime[ms, Asia/Taipei] ┆ f64       │
╞═══════════════════════════╪═══════════╡
│ 2023-01-31 00:00:00 CST   ┆ 46.516676 │
│ 2023-04-30 00:00:00 CST   ┆ 50.302621 │
│ 2023-07-31 00:00:00 CST   ┆ 49.712105 │
│ 2023-10-31 00:00:00 CST   ┆ 51.120126 │
│ 2024-01-31 00:00:00 CST   ┆ 49.375678 │
│ 2024-04-30 00:00:00 CST   ┆ 45.447058 │
│ 2024-07-31 00:00:00 CST   ┆ 54.128707 │
│ 2024-10-31 00:00:00 CST   ┆ 49.911911 │
│ 2025-01-31 00:00:00 CST   ┆ 52.447939 │
└───────────────────────────┴───────────┘

# Quarter-end bins anchored so that the fiscal year ends in January.
# NOTE(review): newer pandas releases spell this alias "QE-JAN" and deprecate
# "Q-JAN" -- confirm against the pandas version this page targets.
out_pd = (
    df_pd2
    .resample(rule="Q-JAN")
    .mean()
)
print(out_pd.shape)
print(out_pd)

(9, 1)
                                 nrs
datetime                            
2023-01-31 00:00:00+08:00  46.516676
2023-04-30 00:00:00+08:00  50.302621
2023-07-31 00:00:00+08:00  49.712105
2023-10-31 00:00:00+08:00  51.120126
2024-01-31 00:00:00+08:00  49.375678
2024-04-30 00:00:00+08:00  45.447058
2024-07-31 00:00:00+08:00  54.128707
2024-10-31 00:00:00+08:00  49.911911
2025-01-31 00:00:00+08:00  52.447939