List
Powerful List manipulation
Setup
import pandas as pd
import polars as pl
# Raw readings for five stations. Each station's temperatures arrive as one
# space-separated string in which tokens such as "E1" are not numeric.
# NOTE(review): later snippets use `df_pl` / `df_pd`, which are not built in
# the visible text - presumably constructed from `data` in a hidden cell.
data = dict(
    station=[f"Station {idx}" for idx in range(1, 6)],
    temperatures=[
        "20 5 5 E1 7 13 19 9 6 20",
        "18 8 16 11 23 E2 8 E2 E2 E2 90 70 40",
        "19 24 E9 16 6 12 10 22",
        "E2 E0 15 7 8 10 E1 24 17 13 6",
        "14 8 E0 16 22 24 E1",
    ],
)

# Show at most three items of a sequence when pandas prints list-valued cells.
pd.set_option("display.max_seq_items", 3)
shape: (5, 2)
┌───────────┬───────────────────────────────────┐
│ station ┆ temperatures │
│ --- ┆ --- │
│ str ┆ str │
╞═══════════╪═══════════════════════════════════╡
│ Station 1 ┆ 20 5 5 E1 7 13 19 9 6 20 │
│ Station 2 ┆ 18 8 16 11 23 E2 8 E2 E2 E2 90 7… │
│ Station 3 ┆ 19 24 E9 16 6 12 10 22 │
│ Station 4 ┆ E2 E0 15 7 8 10 E1 24 17 13 6 │
│ Station 5 ┆ 14 8 E0 16 22 24 E1 │
└───────────┴───────────────────────────────────┘
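Creating a List column
Splitting the space-separated `temperatures` string with `str.split(" ")` turns each row into a `list[str]` value: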
shape: (5, 2)
┌───────────┬──────────────────────┐
│ station ┆ temperatures │
│ --- ┆ --- │
│ str ┆ list[str] │
╞═══════════╪══════════════════════╡
│ Station 1 ┆ ["20", "5", … "20"] │
│ Station 2 ┆ ["18", "8", … "40"] │
│ Station 3 ┆ ["19", "24", … "22"] │
│ Station 4 ┆ ["E2", "E0", … "6"] │
│ Station 5 ┆ ["14", "8", … "E1"] │
└───────────┴──────────────────────┘
Operating on List columns
# First turn the space-separated string into a list column, then derive
# three new columns from that list: its first three items, its last three
# items (slice starting three from the end), and its length.
out_pl = df_pl.with_columns(
    pl.col("temperatures").str.split(" "),
).with_columns(
    top3=pl.col("temperatures").list.head(3),
    bottom_3=pl.col("temperatures").list.slice(-3, 3),
    obs=pl.col("temperatures").list.len(),
)
print(out_pl)
shape: (5, 5)
┌───────────┬──────────────────────┬────────────────────┬────────────────────┬─────┐
│ station ┆ temperatures ┆ top3 ┆ bottom_3 ┆ obs │
│ --- ┆ --- ┆ --- ┆ --- ┆ --- │
│ str ┆ list[str] ┆ list[str] ┆ list[str] ┆ u32 │
╞═══════════╪══════════════════════╪════════════════════╪════════════════════╪═════╡
│ Station 1 ┆ ["20", "5", … "20"] ┆ ["20", "5", "5"] ┆ ["9", "6", "20"] ┆ 10 │
│ Station 2 ┆ ["18", "8", … "40"] ┆ ["18", "8", "16"] ┆ ["90", "70", "40"] ┆ 13 │
│ Station 3 ┆ ["19", "24", … "22"] ┆ ["19", "24", "E9"] ┆ ["12", "10", "22"] ┆ 8 │
│ Station 4 ┆ ["E2", "E0", … "6"] ┆ ["E2", "E0", "15"] ┆ ["17", "13", "6"] ┆ 11 │
│ Station 5 ┆ ["14", "8", … "E1"] ┆ ["14", "8", "E0"] ┆ ["22", "24", "E1"] ┆ 7 │
└───────────┴──────────────────────┴────────────────────┴────────────────────┴─────┘
from operator import itemgetter  # noqa: E402

# pandas counterpart: split the string column on whitespace first, then use
# plain Python slicing / len on every list to build the derived columns.
out_pd = df_pd.assign(temperatures=df_pd["temperatures"].str.split()).assign(
    top3=lambda frame: frame["temperatures"].apply(itemgetter(slice(None, 3))),
    bottom_3=lambda frame: frame["temperatures"].apply(itemgetter(slice(-3, None))),
    obs=lambda frame: frame["temperatures"].apply(len),
)
print(out_pd)
station temperatures top3 bottom_3 obs
0 Station 1 [20, 5, 5, ...] [20, 5, 5] [9, 6, 20] 10
1 Station 2 [18, 8, 16, ...] [18, 8, 16] [90, 70, 40] 13
2 Station 3 [19, 24, E9, ...] [19, 24, E9] [12, 10, 22] 8
3 Station 4 [E2, E0, 15, ...] [E2, E0, 15] [17, 13, 6] 11
4 Station 5 [14, 8, E0, ...] [14, 8, E0] [22, 24, E1] 7
Element-wise computation within Lists
# A reading counts as an error when it cannot be parsed as an integer:
# the non-strict cast yields null for such tokens instead of raising.
is_error = pl.element().cast(pl.Int64, strict=False).is_null()

# Evaluate the check on every element of the split list and sum the
# resulting booleans to get the number of errors per station.
out_pl = df_pl.with_columns(
    errors=pl.col("temperatures").str.split(" ").list.eval(is_error).list.sum()
)
print(out_pl)
shape: (5, 3)
┌───────────┬───────────────────────────────────┬────────┐
│ station ┆ temperatures ┆ errors │
│ --- ┆ --- ┆ --- │
│ str ┆ str ┆ u32 │
╞═══════════╪═══════════════════════════════════╪════════╡
│ Station 1 ┆ 20 5 5 E1 7 13 19 9 6 20 ┆ 1 │
│ Station 2 ┆ 18 8 16 11 23 E2 8 E2 E2 E2 90 7… ┆ 4 │
│ Station 3 ┆ 19 24 E9 16 6 12 10 22 ┆ 1 │
│ Station 4 ┆ E2 E0 15 7 8 10 E1 24 17 13 6 ┆ 3 │
│ Station 5 ┆ 14 8 E0 16 22 24 E1 ┆ 2 │
└───────────┴───────────────────────────────────┴────────┘
Row-wise computations
# Second dataset: one row per station, one integer column per day.
# NOTE(review): the snippets that follow read these columns through
# `df_pl` / `df_pd` - presumably rebuilt from `data2` in a hidden cell.
data2 = dict(
    station=[f"Station {idx}" for idx in range(1, 11)],
    day_1=[17, 11, 8, 22, 9, 21, 20, 8, 8, 17],
    day_2=[15, 11, 10, 8, 7, 14, 18, 21, 15, 13],
    day_3=[16, 15, 24, 24, 8, 23, 19, 23, 16, 10],
)
shape: (10, 4)
┌────────────┬───────┬───────┬───────┐
│ station ┆ day_1 ┆ day_2 ┆ day_3 │
│ --- ┆ --- ┆ --- ┆ --- │
│ str ┆ i64 ┆ i64 ┆ i64 │
╞════════════╪═══════╪═══════╪═══════╡
│ Station 1 ┆ 17 ┆ 15 ┆ 16 │
│ Station 2 ┆ 11 ┆ 11 ┆ 15 │
│ Station 3 ┆ 8 ┆ 10 ┆ 24 │
│ Station 4 ┆ 22 ┆ 8 ┆ 24 │
│ … ┆ … ┆ … ┆ … │
│ Station 7 ┆ 20 ┆ 18 ┆ 19 │
│ Station 8 ┆ 8 ┆ 21 ┆ 23 │
│ Station 9 ┆ 8 ┆ 15 ┆ 16 │
│ Station 10 ┆ 17 ┆ 13 ┆ 10 │
└────────────┴───────┴───────┴───────┘
# Percentage rank of each value inside its own list: the descending rank
# (largest value gets rank 1) divided by the number of non-null elements,
# rounded to two decimals. Both terms use pl.element() so the expression
# refers only to the list element being evaluated, not to a wildcard
# column selection.
rank_pct = (pl.element().rank(descending=True) / pl.element().count()).round(2)

# Collect every non-station column into one list per row and rank within
# that list. Adding the result directly with with_columns avoids creating
# and then dropping a temporary "all_temps" column.
out_pl = df_pl.with_columns(
    pl.concat_list(pl.all().exclude("station"))
    .list.eval(rank_pct, parallel=True)
    .alias("temps_rank")
)
print(out_pl)
shape: (10, 5)
┌────────────┬───────┬───────┬───────┬────────────────────┐
│ station ┆ day_1 ┆ day_2 ┆ day_3 ┆ temps_rank │
│ --- ┆ --- ┆ --- ┆ --- ┆ --- │
│ str ┆ i64 ┆ i64 ┆ i64 ┆ list[f64] │
╞════════════╪═══════╪═══════╪═══════╪════════════════════╡
│ Station 1 ┆ 17 ┆ 15 ┆ 16 ┆ [0.33, 1.0, 0.67] │
│ Station 2 ┆ 11 ┆ 11 ┆ 15 ┆ [0.83, 0.83, 0.33] │
│ Station 3 ┆ 8 ┆ 10 ┆ 24 ┆ [1.0, 0.67, 0.33] │
│ Station 4 ┆ 22 ┆ 8 ┆ 24 ┆ [0.67, 1.0, 0.33] │
│ … ┆ … ┆ … ┆ … ┆ … │
│ Station 7 ┆ 20 ┆ 18 ┆ 19 ┆ [0.33, 1.0, 0.67] │
│ Station 8 ┆ 8 ┆ 21 ┆ 23 ┆ [1.0, 0.67, 0.33] │
│ Station 9 ┆ 8 ┆ 15 ┆ 16 ┆ [1.0, 0.67, 0.33] │
│ Station 10 ┆ 17 ┆ 13 ┆ 10 ┆ [0.33, 0.67, 1.0] │
└────────────┴───────┴───────┴───────┴────────────────────┘
def cal_rank_pct(df_):
    """Return, per row, the list of descending percentage ranks of its values.

    The "station" column is ignored. Every remaining column is ranked
    against the other columns of the same row (largest value gets rank 1),
    the rank is divided by the number of those columns and rounded to two
    decimals. The result is a Series holding one list per row.
    """
    temps = df_.drop(columns=["station"])
    n_days = temps.shape[1]
    row_ranks = temps.rank(axis="columns", ascending=False)
    pct = (row_ranks / n_days).round(2)
    return pct.agg(list, axis="columns")
# Compute the per-row rank lists from the frame and attach them as a new
# "temps_rank" column.
out_pd = df_pd.assign(temps_rank=cal_rank_pct(df_pd))
print(out_pd)
station day_1 day_2 day_3 temps_rank
0 Station 1 17 15 16 [0.33, 1.0, 0.67]
1 Station 2 11 11 15 [0.83, 0.83, 0.33]
2 Station 3 8 10 24 [1.0, 0.67, 0.33]
3 Station 4 22 8 24 [0.67, 1.0, 0.33]
4 Station 5 9 7 8 [0.33, 1.0, 0.67]
5 Station 6 21 14 23 [0.67, 1.0, 0.33]
6 Station 7 20 18 19 [0.33, 1.0, 0.67]
7 Station 8 8 21 23 [1.0, 0.67, 0.33]
8 Station 9 8 15 16 [1.0, 0.67, 0.33]
9 Station 10 17 13 10 [0.33, 0.67, 1.0]
Reference
The examples in this section have been adapted from the Polars
user guide.