Time series
Setup
It is worth noting that:
- The datapoint for 2023-11-05 is intentionally excluded in both `df_pl` and `df_pd`.
- Some datapoints are deliberately set as `null` in Polars and as `NaN` in Pandas.
- The results of quantiles are not consistent between Polars and Pandas. This discrepancy arises from the default interpolation method for quantiles, which is `nearest` in Polars and `linear` in Pandas.
from datetime import date, datetime, timedelta
from functools import lru_cache
import numpy as np
import pandas as pd
import polars as pl
from zoneinfo import ZoneInfo
np.random.seed(42)
pl.Config.set_fmt_str_lengths(50)
pl.Config.set_tbl_rows(50)
_tzinfo = "Asia/Taipei"
tzinfo = ZoneInfo(_tzinfo)
@lru_cache
def get_nrs(n):
return np.random.rand(n) * 100
df_pl = (
pl.datetime_range(
date(2023, 11, 1),
date(2023, 11, 8),
timedelta(hours=8),
closed="left",
time_unit="ms",
time_zone=_tzinfo,
eager=True,
)
.to_frame()
.filter(~pl.col("datetime").dt.day().is_in([5]))
.pipe(
lambda df_: df_.with_columns(
nrs=pl.when(~pl.col("datetime").dt.day().is_in([6])).then(
pl.lit(get_nrs(df_.height))
)
)
)
)
print(df_pl)
shape: (18, 2)
┌───────────────────────────┬───────────┐
│ datetime ┆ nrs │
│ --- ┆ --- │
│ datetime[ms, Asia/Taipei] ┆ f64 │
╞═══════════════════════════╪═══════════╡
│ 2023-11-01 00:00:00 CST ┆ 37.454012 │
│ 2023-11-01 08:00:00 CST ┆ 95.071431 │
│ 2023-11-01 16:00:00 CST ┆ 73.199394 │
│ 2023-11-02 00:00:00 CST ┆ 59.865848 │
│ 2023-11-02 08:00:00 CST ┆ 15.601864 │
│ 2023-11-02 16:00:00 CST ┆ 15.599452 │
│ 2023-11-03 00:00:00 CST ┆ 5.808361 │
│ 2023-11-03 08:00:00 CST ┆ 86.617615 │
│ 2023-11-03 16:00:00 CST ┆ 60.111501 │
│ 2023-11-04 00:00:00 CST ┆ 70.807258 │
│ 2023-11-04 08:00:00 CST ┆ 2.058449 │
│ 2023-11-04 16:00:00 CST ┆ 96.990985 │
│ 2023-11-06 00:00:00 CST ┆ null │
│ 2023-11-06 08:00:00 CST ┆ null │
│ 2023-11-06 16:00:00 CST ┆ null │
│ 2023-11-07 00:00:00 CST ┆ 18.340451 │
│ 2023-11-07 08:00:00 CST ┆ 30.424224 │
│ 2023-11-07 16:00:00 CST ┆ 52.475643 │
└───────────────────────────┴───────────┘
df_pd = (
pd.DataFrame(
{
"datetime": pd.date_range(
"2023-11-01", "2023-11-08", freq="8h", inclusive="left", tz=_tzinfo
)
}
)
.query("~datetime.dt.day.isin([5])")
.assign(
nrs=lambda df_: np.where(
~df_.datetime.dt.day.isin([6]), get_nrs(df_.shape[0]), np.nan
)
)
.set_index("datetime")
)
print(df_pd.shape)
print(df_pd)
(18, 1)
nrs
datetime
2023-11-01 00:00:00+08:00 37.454012
2023-11-01 08:00:00+08:00 95.071431
2023-11-01 16:00:00+08:00 73.199394
2023-11-02 00:00:00+08:00 59.865848
2023-11-02 08:00:00+08:00 15.601864
2023-11-02 16:00:00+08:00 15.599452
2023-11-03 00:00:00+08:00 5.808361
2023-11-03 08:00:00+08:00 86.617615
2023-11-03 16:00:00+08:00 60.111501
2023-11-04 00:00:00+08:00 70.807258
2023-11-04 08:00:00+08:00 2.058449
2023-11-04 16:00:00+08:00 96.990985
2023-11-06 00:00:00+08:00 NaN
2023-11-06 08:00:00+08:00 NaN
2023-11-06 16:00:00+08:00 NaN
2023-11-07 00:00:00+08:00 18.340451
2023-11-07 08:00:00+08:00 30.424224
2023-11-07 16:00:00+08:00 52.475643
shape: (9, 3)
┌────────────┬───────────────────────────┬───────────┐
│ describe ┆ datetime ┆ nrs │
│ --- ┆ --- ┆ --- │
│ str ┆ str ┆ f64 │
╞════════════╪═══════════════════════════╪═══════════╡
│ count ┆ 18 ┆ 18.0 │
│ null_count ┆ 0 ┆ 3.0 │
│ mean ┆ null ┆ 48.028433 │
│ std ┆ null ┆ 32.579411 │
│ min ┆ 2023-11-01 00:00:00+08:00 ┆ 2.058449 │
│ 25% ┆ null ┆ 15.601864 │
│ 50% ┆ null ┆ 52.475643 │
│ 75% ┆ null ┆ 73.199394 │
│ max ┆ 2023-11-07 16:00:00+08:00 ┆ 96.990985 │
└────────────┴───────────────────────────┴───────────┘
Slice time series
Polars relies on expressions to slice time series data, while Pandas can utilize the DatetimeIndex.
t_start = datetime(2023, 11, 1, tzinfo=tzinfo)
t_end = datetime(2023, 11, 4, tzinfo=tzinfo)
out_pl = df_pl.filter(pl.col("datetime").is_between(t_start, t_end, closed="left"))
print(out_pl)
shape: (9, 2)
┌───────────────────────────┬───────────┐
│ datetime ┆ nrs │
│ --- ┆ --- │
│ datetime[ms, Asia/Taipei] ┆ f64 │
╞═══════════════════════════╪═══════════╡
│ 2023-11-01 00:00:00 CST ┆ 37.454012 │
│ 2023-11-01 08:00:00 CST ┆ 95.071431 │
│ 2023-11-01 16:00:00 CST ┆ 73.199394 │
│ 2023-11-02 00:00:00 CST ┆ 59.865848 │
│ 2023-11-02 08:00:00 CST ┆ 15.601864 │
│ 2023-11-02 16:00:00 CST ┆ 15.599452 │
│ 2023-11-03 00:00:00 CST ┆ 5.808361 │
│ 2023-11-03 08:00:00 CST ┆ 86.617615 │
│ 2023-11-03 16:00:00 CST ┆ 60.111501 │
└───────────────────────────┴───────────┘
(9, 1)
nrs
datetime
2023-11-01 00:00:00+08:00 37.454012
2023-11-01 08:00:00+08:00 95.071431
2023-11-01 16:00:00+08:00 73.199394
2023-11-02 00:00:00+08:00 59.865848
2023-11-02 08:00:00+08:00 15.601864
2023-11-02 16:00:00+08:00 15.599452
2023-11-03 00:00:00+08:00 5.808361
2023-11-03 08:00:00+08:00 86.617615
2023-11-03 16:00:00+08:00 60.111501
Resampling data by day
Polars employs df.group_by_dynamic for time series resampling, while Pandas provides the df.resample method. If you need continuous datetime datapoints, similar to Pandas, you may find df.upsample in Polars to be useful.
shape: (6, 2)
┌───────────────────────────┬───────────┐
│ datetime ┆ nrs │
│ --- ┆ --- │
│ datetime[ms, Asia/Taipei] ┆ f64 │
╞═══════════════════════════╪═══════════╡
│ 2023-11-01 00:00:00 CST ┆ 68.574946 │
│ 2023-11-02 00:00:00 CST ┆ 30.355721 │
│ 2023-11-03 00:00:00 CST ┆ 50.845826 │
│ 2023-11-04 00:00:00 CST ┆ 56.618897 │
│ 2023-11-06 00:00:00 CST ┆ null │
│ 2023-11-07 00:00:00 CST ┆ 33.746773 │
└───────────────────────────┴───────────┘
out_pl = (
df_pl.group_by_dynamic("datetime", every="1d")
.agg(pl.col("nrs").mean())
.upsample("datetime", every="1d")
)
print(out_pl)
shape: (7, 2)
┌───────────────────────────┬───────────┐
│ datetime ┆ nrs │
│ --- ┆ --- │
│ datetime[ms, Asia/Taipei] ┆ f64 │
╞═══════════════════════════╪═══════════╡
│ 2023-11-01 00:00:00 CST ┆ 68.574946 │
│ 2023-11-02 00:00:00 CST ┆ 30.355721 │
│ 2023-11-03 00:00:00 CST ┆ 50.845826 │
│ 2023-11-04 00:00:00 CST ┆ 56.618897 │
│ 2023-11-05 00:00:00 CST ┆ null │
│ 2023-11-06 00:00:00 CST ┆ null │
│ 2023-11-07 00:00:00 CST ┆ 33.746773 │
└───────────────────────────┴───────────┘
Resampling data by custom time interval
At the time of writing, a compound duration string such as 3d12h4m25s will fail. In addition, to achieve behavior similar to Pandas, you might need to tweak the offset or start_by parameters of df.group_by_dynamic in Polars.
out_pl = (
df_pl.group_by_dynamic("datetime", every="16h30m", offset="4h30m")
.agg(pl.col("nrs").mean())
.upsample("datetime", every="16h30m")
)
print(out_pl)
shape: (10, 2)
┌───────────────────────────┬───────────┐
│ datetime ┆ nrs │
│ --- ┆ --- │
│ datetime[ms, Asia/Taipei] ┆ f64 │
╞═══════════════════════════╪═══════════╡
│ 2023-11-01 00:00:00 CST ┆ 68.574946 │
│ 2023-11-01 16:30:00 CST ┆ 37.733856 │
│ 2023-11-02 09:00:00 CST ┆ 10.703907 │
│ 2023-11-03 01:30:00 CST ┆ 73.364558 │
│ 2023-11-03 18:00:00 CST ┆ 36.432854 │
│ 2023-11-04 10:30:00 CST ┆ 96.990985 │
│ 2023-11-05 03:00:00 CST ┆ null │
│ 2023-11-05 19:30:00 CST ┆ null │
│ 2023-11-06 12:00:00 CST ┆ 18.340451 │
│ 2023-11-07 04:30:00 CST ┆ 41.449934 │
└───────────────────────────┴───────────┘
out_pl = (
df_pl.group_by_dynamic("datetime", every="16h30m", start_by="datapoint")
.agg(pl.col("nrs").mean())
.upsample("datetime", every="16h30m")
)
print(out_pl)
shape: (10, 2)
┌───────────────────────────┬───────────┐
│ datetime ┆ nrs │
│ --- ┆ --- │
│ datetime[ms, Asia/Taipei] ┆ f64 │
╞═══════════════════════════╪═══════════╡
│ 2023-11-01 00:00:00 CST ┆ 68.574946 │
│ 2023-11-01 16:30:00 CST ┆ 37.733856 │
│ 2023-11-02 09:00:00 CST ┆ 10.703907 │
│ 2023-11-03 01:30:00 CST ┆ 73.364558 │
│ 2023-11-03 18:00:00 CST ┆ 36.432854 │
│ 2023-11-04 10:30:00 CST ┆ 96.990985 │
│ 2023-11-05 03:00:00 CST ┆ null │
│ 2023-11-05 19:30:00 CST ┆ null │
│ 2023-11-06 12:00:00 CST ┆ 18.340451 │
│ 2023-11-07 04:30:00 CST ┆ 41.449934 │
└───────────────────────────┴───────────┘
(10, 1)
nrs
datetime
2023-11-01 00:00:00+08:00 68.574946
2023-11-01 16:30:00+08:00 37.733856
2023-11-02 09:00:00+08:00 10.703907
2023-11-03 01:30:00+08:00 73.364558
2023-11-03 18:00:00+08:00 36.432854
2023-11-04 10:30:00+08:00 96.990985
2023-11-05 03:00:00+08:00 NaN
2023-11-05 19:30:00+08:00 NaN
2023-11-06 12:00:00+08:00 18.340451
2023-11-07 04:30:00+08:00 41.449934
Resampling data by custom ending
We created new dataframes with more data points for Polars and Pandas, respectively, to provide a clearer illustration.
df_pl2 = (
pl.datetime_range(
date(2023, 1, 1),
date(2025, 1, 1),
timedelta(hours=12),
closed="left",
time_unit="ms",
time_zone=_tzinfo,
eager=True,
)
.to_frame()
.pipe(lambda df_: df_.with_columns(nrs=pl.lit(get_nrs(df_.height))))
)
with pl.Config(tbl_rows=10):
print(df_pl2)
shape: (1_462, 2)
┌───────────────────────────┬───────────┐
│ datetime ┆ nrs │
│ --- ┆ --- │
│ datetime[ms, Asia/Taipei] ┆ f64 │
╞═══════════════════════════╪═══════════╡
│ 2023-01-01 00:00:00 CST ┆ 43.194502 │
│ 2023-01-01 12:00:00 CST ┆ 29.122914 │
│ 2023-01-02 00:00:00 CST ┆ 61.185289 │
│ 2023-01-02 12:00:00 CST ┆ 13.949386 │
│ 2023-01-03 00:00:00 CST ┆ 29.214465 │
│ … ┆ … │
│ 2024-12-29 12:00:00 CST ┆ 66.759339 │
│ 2024-12-30 00:00:00 CST ┆ 80.784594 │
│ 2024-12-30 12:00:00 CST ┆ 76.228513 │
│ 2024-12-31 00:00:00 CST ┆ 79.781365 │
│ 2024-12-31 12:00:00 CST ┆ 43.558331 │
└───────────────────────────┴───────────┘
df_pd2 = (
pd.DataFrame(
{
"datetime": pd.date_range(
"2023-01-01", "2025-01-01", freq="12h", inclusive="left", tz=_tzinfo
)
}
)
.assign(nrs=lambda df_: get_nrs(df_.shape[0]))
.set_index("datetime")
)
print(df_pd2.shape)
print(df_pd2)
(1462, 1)
nrs
datetime
2023-01-01 00:00:00+08:00 43.194502
2023-01-01 12:00:00+08:00 29.122914
2023-01-02 00:00:00+08:00 61.185289
2023-01-02 12:00:00+08:00 13.949386
2023-01-03 00:00:00+08:00 29.214465
... ...
2024-12-29 12:00:00+08:00 66.759339
2024-12-30 00:00:00+08:00 80.784594
2024-12-30 12:00:00+08:00 76.228513
2024-12-31 00:00:00+08:00 79.781365
2024-12-31 12:00:00+08:00 43.558331
[1462 rows x 1 columns]
Achieving this can be done using anchored offset aliases in Pandas. To replicate this behavior in Polars, you might need to make some adjustments.
out_pl = (
df_pl2.group_by_dynamic("datetime", every="1q", offset="-2mo", label="right")
.agg(pl.col("nrs").mean())
.with_columns(pl.col("datetime").dt.offset_by("-1d"))
)
print(out_pl)
shape: (9, 2)
┌───────────────────────────┬───────────┐
│ datetime ┆ nrs │
│ --- ┆ --- │
│ datetime[ms, Asia/Taipei] ┆ f64 │
╞═══════════════════════════╪═══════════╡
│ 2023-01-31 00:00:00 CST ┆ 46.516676 │
│ 2023-04-30 00:00:00 CST ┆ 50.302621 │
│ 2023-07-31 00:00:00 CST ┆ 49.712105 │
│ 2023-10-31 00:00:00 CST ┆ 51.120126 │
│ 2024-01-31 00:00:00 CST ┆ 49.375678 │
│ 2024-04-30 00:00:00 CST ┆ 45.447058 │
│ 2024-07-31 00:00:00 CST ┆ 54.128707 │
│ 2024-10-31 00:00:00 CST ┆ 49.911911 │
│ 2025-01-31 00:00:00 CST ┆ 52.447939 │
└───────────────────────────┴───────────┘
(9, 1)
nrs
datetime
2023-01-31 00:00:00+08:00 46.516676
2023-04-30 00:00:00+08:00 50.302621
2023-07-31 00:00:00+08:00 49.712105
2023-10-31 00:00:00+08:00 51.120126
2024-01-31 00:00:00+08:00 49.375678
2024-04-30 00:00:00+08:00 45.447058
2024-07-31 00:00:00+08:00 54.128707
2024-10-31 00:00:00+08:00 49.911911
2025-01-31 00:00:00+08:00 52.447939