Time series
Setup
It is worth noting that:
- The datapoint for 2023-11-05 is intentionally excluded in both `df_pl` and `df_pd`.
- Some datapoints are deliberately set as null in Polars and as NaN in Pandas.
- The results of quantiles are not consistent between Polars and Pandas. This discrepancy arises from the default interpolation method for quantiles, which is "nearest" in Polars and "linear" in Pandas.
from datetime import date, datetime, timedelta
from functools import lru_cache
import numpy as np
import pandas as pd
import polars as pl
from zoneinfo import ZoneInfo
# Reproducible data plus a wider and taller Polars repr for the tables below.
np.random.seed(42)
pl.Config.set_tbl_rows(50)
pl.Config.set_fmt_str_lengths(50)

# A single fixed IANA time zone shared by every frame in this comparison.
_tzinfo = "Asia/Taipei"
tzinfo = ZoneInfo(_tzinfo)
@lru_cache
def get_nrs(n):
    """Return ``n`` uniform random floats in the half-open range [0, 100).

    Calls are memoized per ``n`` so the Polars and the Pandas frames built
    below receive exactly the same underlying data.
    """
    return 100 * np.random.rand(n)
# 8-hourly, tz-aware timestamps covering 2023-11-01 .. 2023-11-07
# (left-closed), with every row falling on Nov 5 dropped on purpose.
_base_pl = (
    pl.datetime_range(
        date(2023, 11, 1),
        date(2023, 11, 8),
        timedelta(hours=8),
        closed="left",
        time_unit="ms",
        time_zone=_tzinfo,
        eager=True,
    )
    .to_frame()
    .filter(pl.col("datetime").dt.day().is_in([5]).not_())
)
# Rows on Nov 6 stay null (no `otherwise`); all other rows get the cached data.
df_pl = _base_pl.with_columns(
    nrs=pl.when(pl.col("datetime").dt.day().is_in([6]).not_()).then(
        pl.lit(get_nrs(_base_pl.height))
    )
)
print(df_pl)
shape: (18, 2)
┌───────────────────────────┬───────────┐
│ datetime ┆ nrs │
│ --- ┆ --- │
│ datetime[ms, Asia/Taipei] ┆ f64 │
╞═══════════════════════════╪═══════════╡
│ 2023-11-01 00:00:00 CST ┆ 37.454012 │
│ 2023-11-01 08:00:00 CST ┆ 95.071431 │
│ 2023-11-01 16:00:00 CST ┆ 73.199394 │
│ 2023-11-02 00:00:00 CST ┆ 59.865848 │
│ 2023-11-02 08:00:00 CST ┆ 15.601864 │
│ 2023-11-02 16:00:00 CST ┆ 15.599452 │
│ 2023-11-03 00:00:00 CST ┆ 5.808361 │
│ 2023-11-03 08:00:00 CST ┆ 86.617615 │
│ 2023-11-03 16:00:00 CST ┆ 60.111501 │
│ 2023-11-04 00:00:00 CST ┆ 70.807258 │
│ 2023-11-04 08:00:00 CST ┆ 2.058449 │
│ 2023-11-04 16:00:00 CST ┆ 96.990985 │
│ 2023-11-06 00:00:00 CST ┆ null │
│ 2023-11-06 08:00:00 CST ┆ null │
│ 2023-11-06 16:00:00 CST ┆ null │
│ 2023-11-07 00:00:00 CST ┆ 18.340451 │
│ 2023-11-07 08:00:00 CST ┆ 30.424224 │
│ 2023-11-07 16:00:00 CST ┆ 52.475643 │
└───────────────────────────┴───────────┘
# Same calendar as `df_pl`, built with pandas: an 8-hourly tz-aware range,
# Nov 5 removed, and Nov 6 forced to NaN instead of Polars' null.
df_pd = (
    pd.DataFrame(
        {
            "datetime": pd.date_range(
                "2023-11-01", "2023-11-08", freq="8h", inclusive="left", tz=_tzinfo
            )
        }
    )
    .loc[lambda df_: ~df_.datetime.dt.day.isin([5])]
    .assign(
        nrs=lambda df_: np.where(
            df_.datetime.dt.day.isin([6]), np.nan, get_nrs(df_.shape[0])
        )
    )
    .set_index("datetime")
)
print(df_pd.shape)
print(df_pd)
(18, 1)
nrs
datetime
2023-11-01 00:00:00+08:00 37.454012
2023-11-01 08:00:00+08:00 95.071431
2023-11-01 16:00:00+08:00 73.199394
2023-11-02 00:00:00+08:00 59.865848
2023-11-02 08:00:00+08:00 15.601864
2023-11-02 16:00:00+08:00 15.599452
2023-11-03 00:00:00+08:00 5.808361
2023-11-03 08:00:00+08:00 86.617615
2023-11-03 16:00:00+08:00 60.111501
2023-11-04 00:00:00+08:00 70.807258
2023-11-04 08:00:00+08:00 2.058449
2023-11-04 16:00:00+08:00 96.990985
2023-11-06 00:00:00+08:00 NaN
2023-11-06 08:00:00+08:00 NaN
2023-11-06 16:00:00+08:00 NaN
2023-11-07 00:00:00+08:00 18.340451
2023-11-07 08:00:00+08:00 30.424224
2023-11-07 16:00:00+08:00 52.475643
shape: (9, 3)
┌────────────┬───────────────────────────┬───────────┐
│ describe ┆ datetime ┆ nrs │
│ --- ┆ --- ┆ --- │
│ str ┆ str ┆ f64 │
╞════════════╪═══════════════════════════╪═══════════╡
│ count ┆ 18 ┆ 18.0 │
│ null_count ┆ 0 ┆ 3.0 │
│ mean ┆ null ┆ 48.028433 │
│ std ┆ null ┆ 32.579411 │
│ min ┆ 2023-11-01 00:00:00+08:00 ┆ 2.058449 │
│ 25% ┆ null ┆ 15.601864 │
│ 50% ┆ null ┆ 52.475643 │
│ 75% ┆ null ┆ 73.199394 │
│ max ┆ 2023-11-07 16:00:00+08:00 ┆ 96.990985 │
└────────────┴───────────────────────────┴───────────┘
Slice time series
Polars
relies on expressions
to slice time series data, while Pandas
can utilize the DatetimeIndex
.
# Polars has no DatetimeIndex: select the half-open window
# [2023-11-01, 2023-11-04) with an `is_between` expression instead.
t_start = datetime(2023, 11, 1, tzinfo=tzinfo)
t_end = datetime(2023, 11, 4, tzinfo=tzinfo)
window = pl.col("datetime").is_between(t_start, t_end, closed="left")
out_pl = df_pl.filter(window)
print(out_pl)
shape: (9, 2)
┌───────────────────────────┬───────────┐
│ datetime ┆ nrs │
│ --- ┆ --- │
│ datetime[ms, Asia/Taipei] ┆ f64 │
╞═══════════════════════════╪═══════════╡
│ 2023-11-01 00:00:00 CST ┆ 37.454012 │
│ 2023-11-01 08:00:00 CST ┆ 95.071431 │
│ 2023-11-01 16:00:00 CST ┆ 73.199394 │
│ 2023-11-02 00:00:00 CST ┆ 59.865848 │
│ 2023-11-02 08:00:00 CST ┆ 15.601864 │
│ 2023-11-02 16:00:00 CST ┆ 15.599452 │
│ 2023-11-03 00:00:00 CST ┆ 5.808361 │
│ 2023-11-03 08:00:00 CST ┆ 86.617615 │
│ 2023-11-03 16:00:00 CST ┆ 60.111501 │
└───────────────────────────┴───────────┘
(9, 1)
nrs
datetime
2023-11-01 00:00:00+08:00 37.454012
2023-11-01 08:00:00+08:00 95.071431
2023-11-01 16:00:00+08:00 73.199394
2023-11-02 00:00:00+08:00 59.865848
2023-11-02 08:00:00+08:00 15.601864
2023-11-02 16:00:00+08:00 15.599452
2023-11-03 00:00:00+08:00 5.808361
2023-11-03 08:00:00+08:00 86.617615
2023-11-03 16:00:00+08:00 60.111501
Resampling data by day
Polars
employs df.group_by_dynamic
for time series resampling, while Pandas
provides the df.resample
method. If you need continuous datetime datapoints, similar to Pandas
, you may find df.upsample
in Polars
to be useful.
shape: (6, 2)
┌───────────────────────────┬───────────┐
│ datetime ┆ nrs │
│ --- ┆ --- │
│ datetime[ms, Asia/Taipei] ┆ f64 │
╞═══════════════════════════╪═══════════╡
│ 2023-11-01 00:00:00 CST ┆ 68.574946 │
│ 2023-11-02 00:00:00 CST ┆ 30.355721 │
│ 2023-11-03 00:00:00 CST ┆ 50.845826 │
│ 2023-11-04 00:00:00 CST ┆ 56.618897 │
│ 2023-11-06 00:00:00 CST ┆ null │
│ 2023-11-07 00:00:00 CST ┆ 33.746773 │
└───────────────────────────┴───────────┘
# Daily mean, then upsample so the missing calendar day (Nov 5) reappears
# as a null row, mirroring what pandas' `resample` produces by default.
daily = df_pl.group_by_dynamic("datetime", every="1d").agg(pl.col("nrs").mean())
out_pl = daily.upsample("datetime", every="1d")
print(out_pl)
shape: (7, 2)
┌───────────────────────────┬───────────┐
│ datetime ┆ nrs │
│ --- ┆ --- │
│ datetime[ms, Asia/Taipei] ┆ f64 │
╞═══════════════════════════╪═══════════╡
│ 2023-11-01 00:00:00 CST ┆ 68.574946 │
│ 2023-11-02 00:00:00 CST ┆ 30.355721 │
│ 2023-11-03 00:00:00 CST ┆ 50.845826 │
│ 2023-11-04 00:00:00 CST ┆ 56.618897 │
│ 2023-11-05 00:00:00 CST ┆ null │
│ 2023-11-06 00:00:00 CST ┆ null │
│ 2023-11-07 00:00:00 CST ┆ 33.746773 │
└───────────────────────────┴───────────┘
Resampling data by custom time interval
At the time of writing, a complex duration string such as 3d12h4m25s will fail. In addition, to achieve behavior similar to Pandas, you might need to tweak the offset or start_by parameters of df.group_by_dynamic in Polars.
# A 16h30m grid shifted by 4h30m via `offset`; upsampling afterwards fills
# the empty buckets back in as null rows.
grouped = df_pl.group_by_dynamic("datetime", every="16h30m", offset="4h30m")
out_pl = grouped.agg(pl.col("nrs").mean()).upsample("datetime", every="16h30m")
print(out_pl)
shape: (10, 2)
┌───────────────────────────┬───────────┐
│ datetime ┆ nrs │
│ --- ┆ --- │
│ datetime[ms, Asia/Taipei] ┆ f64 │
╞═══════════════════════════╪═══════════╡
│ 2023-11-01 00:00:00 CST ┆ 68.574946 │
│ 2023-11-01 16:30:00 CST ┆ 37.733856 │
│ 2023-11-02 09:00:00 CST ┆ 10.703907 │
│ 2023-11-03 01:30:00 CST ┆ 73.364558 │
│ 2023-11-03 18:00:00 CST ┆ 36.432854 │
│ 2023-11-04 10:30:00 CST ┆ 96.990985 │
│ 2023-11-05 03:00:00 CST ┆ null │
│ 2023-11-05 19:30:00 CST ┆ null │
│ 2023-11-06 12:00:00 CST ┆ 18.340451 │
│ 2023-11-07 04:30:00 CST ┆ 41.449934 │
└───────────────────────────┴───────────┘
# Same 16h30m grid, but anchored to the first datapoint via `start_by`
# instead of an explicit `offset`.
grouped = df_pl.group_by_dynamic("datetime", every="16h30m", start_by="datapoint")
out_pl = grouped.agg(pl.col("nrs").mean()).upsample("datetime", every="16h30m")
print(out_pl)
shape: (10, 2)
┌───────────────────────────┬───────────┐
│ datetime ┆ nrs │
│ --- ┆ --- │
│ datetime[ms, Asia/Taipei] ┆ f64 │
╞═══════════════════════════╪═══════════╡
│ 2023-11-01 00:00:00 CST ┆ 68.574946 │
│ 2023-11-01 16:30:00 CST ┆ 37.733856 │
│ 2023-11-02 09:00:00 CST ┆ 10.703907 │
│ 2023-11-03 01:30:00 CST ┆ 73.364558 │
│ 2023-11-03 18:00:00 CST ┆ 36.432854 │
│ 2023-11-04 10:30:00 CST ┆ 96.990985 │
│ 2023-11-05 03:00:00 CST ┆ null │
│ 2023-11-05 19:30:00 CST ┆ null │
│ 2023-11-06 12:00:00 CST ┆ 18.340451 │
│ 2023-11-07 04:30:00 CST ┆ 41.449934 │
└───────────────────────────┴───────────┘
(10, 1)
nrs
datetime
2023-11-01 00:00:00+08:00 68.574946
2023-11-01 16:30:00+08:00 37.733856
2023-11-02 09:00:00+08:00 10.703907
2023-11-03 01:30:00+08:00 73.364558
2023-11-03 18:00:00+08:00 36.432854
2023-11-04 10:30:00+08:00 96.990985
2023-11-05 03:00:00+08:00 NaN
2023-11-05 19:30:00+08:00 NaN
2023-11-06 12:00:00+08:00 18.340451
2023-11-07 04:30:00+08:00 41.449934
Resampling data by custom ending
We created new dataframes with more data points for Polars
and Pandas
, respectively, to provide a clearer illustration.
# Two full years of 12-hourly data — no gaps and no nulls this time.
_grid = pl.datetime_range(
    date(2023, 1, 1),
    date(2025, 1, 1),
    timedelta(hours=12),
    closed="left",
    time_unit="ms",
    time_zone=_tzinfo,
    eager=True,
).to_frame()
df_pl2 = _grid.with_columns(nrs=pl.lit(get_nrs(_grid.height)))
with pl.Config(tbl_rows=10):
    print(df_pl2)
shape: (1_462, 2)
┌───────────────────────────┬───────────┐
│ datetime ┆ nrs │
│ --- ┆ --- │
│ datetime[ms, Asia/Taipei] ┆ f64 │
╞═══════════════════════════╪═══════════╡
│ 2023-01-01 00:00:00 CST ┆ 43.194502 │
│ 2023-01-01 12:00:00 CST ┆ 29.122914 │
│ 2023-01-02 00:00:00 CST ┆ 61.185289 │
│ 2023-01-02 12:00:00 CST ┆ 13.949386 │
│ 2023-01-03 00:00:00 CST ┆ 29.214465 │
│ … ┆ … │
│ 2024-12-29 12:00:00 CST ┆ 66.759339 │
│ 2024-12-30 00:00:00 CST ┆ 80.784594 │
│ 2024-12-30 12:00:00 CST ┆ 76.228513 │
│ 2024-12-31 00:00:00 CST ┆ 79.781365 │
│ 2024-12-31 12:00:00 CST ┆ 43.558331 │
└───────────────────────────┴───────────┘
# Pandas counterpart of `df_pl2`: the same two-year, 12-hourly calendar
# with the identical cached random data.
df_pd2 = (
    pd.DataFrame(
        {
            "datetime": pd.date_range(
                "2023-01-01", "2025-01-01", freq="12h", inclusive="left", tz=_tzinfo
            )
        }
    )
    .assign(nrs=lambda df_: get_nrs(len(df_)))
    .set_index("datetime")
)
print(df_pd2.shape)
print(df_pd2)
(1462, 1)
nrs
datetime
2023-01-01 00:00:00+08:00 43.194502
2023-01-01 12:00:00+08:00 29.122914
2023-01-02 00:00:00+08:00 61.185289
2023-01-02 12:00:00+08:00 13.949386
2023-01-03 00:00:00+08:00 29.214465
... ...
2024-12-29 12:00:00+08:00 66.759339
2024-12-30 00:00:00+08:00 80.784594
2024-12-30 12:00:00+08:00 76.228513
2024-12-31 00:00:00+08:00 79.781365
2024-12-31 12:00:00+08:00 43.558331
[1462 rows x 1 columns]
Achieving this can be done using anchored offset aliases
in Pandas
. To replicate this behavior in Polars
, you might need to make some adjustments.
# Emulate pandas-style anchored quarter ends: shift the quarterly grid back
# two months, label each window by its right edge, then step back one day so
# each label lands on the last day of the month.
quarterly = df_pl2.group_by_dynamic(
    "datetime", every="1q", offset="-2mo", label="right"
).agg(pl.col("nrs").mean())
out_pl = quarterly.with_columns(pl.col("datetime").dt.offset_by("-1d"))
print(out_pl)
shape: (9, 2)
┌───────────────────────────┬───────────┐
│ datetime ┆ nrs │
│ --- ┆ --- │
│ datetime[ms, Asia/Taipei] ┆ f64 │
╞═══════════════════════════╪═══════════╡
│ 2023-01-31 00:00:00 CST ┆ 46.516676 │
│ 2023-04-30 00:00:00 CST ┆ 50.302621 │
│ 2023-07-31 00:00:00 CST ┆ 49.712105 │
│ 2023-10-31 00:00:00 CST ┆ 51.120126 │
│ 2024-01-31 00:00:00 CST ┆ 49.375678 │
│ 2024-04-30 00:00:00 CST ┆ 45.447058 │
│ 2024-07-31 00:00:00 CST ┆ 54.128707 │
│ 2024-10-31 00:00:00 CST ┆ 49.911911 │
│ 2025-01-31 00:00:00 CST ┆ 52.447939 │
└───────────────────────────┴───────────┘
(9, 1)
nrs
datetime
2023-01-31 00:00:00+08:00 46.516676
2023-04-30 00:00:00+08:00 50.302621
2023-07-31 00:00:00+08:00 49.712105
2023-10-31 00:00:00+08:00 51.120126
2024-01-31 00:00:00+08:00 49.375678
2024-04-30 00:00:00+08:00 45.447058
2024-07-31 00:00:00+08:00 54.128707
2024-10-31 00:00:00+08:00 49.911911
2025-01-31 00:00:00+08:00 52.447939