Window functions
Setup
import pandas as pd
import polars as pl
url = "https://gist.githubusercontent.com/ritchie46/cac6b337ea52281aa23c049250a4ff03/raw/89a957ff3919d90e6ef2d34235e6bf22304f3366/pokemon.csv"
shape: (163, 13)
┌─────┬───────────────────────┬─────────┬────────┬───┬─────────┬───────┬────────────┬───────────┐
│ # ┆ Name ┆ Type 1 ┆ Type 2 ┆ … ┆ Sp. Def ┆ Speed ┆ Generation ┆ Legendary │
│ --- ┆ --- ┆ --- ┆ --- ┆ ┆ --- ┆ --- ┆ --- ┆ --- │
│ i64 ┆ str ┆ str ┆ str ┆ ┆ i64 ┆ i64 ┆ i64 ┆ bool │
╞═════╪═══════════════════════╪═════════╪════════╪═══╪═════════╪═══════╪════════════╪═══════════╡
│ 1 ┆ Bulbasaur ┆ Grass ┆ Poison ┆ … ┆ 65 ┆ 45 ┆ 1 ┆ false │
│ 2 ┆ Ivysaur ┆ Grass ┆ Poison ┆ … ┆ 80 ┆ 60 ┆ 1 ┆ false │
│ 3 ┆ Venusaur ┆ Grass ┆ Poison ┆ … ┆ 100 ┆ 80 ┆ 1 ┆ false │
│ 3 ┆ VenusaurMega Venusaur ┆ Grass ┆ Poison ┆ … ┆ 120 ┆ 80 ┆ 1 ┆ false │
│ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … │
│ 147 ┆ Dratini ┆ Dragon ┆ null ┆ … ┆ 50 ┆ 50 ┆ 1 ┆ false │
│ 148 ┆ Dragonair ┆ Dragon ┆ null ┆ … ┆ 70 ┆ 70 ┆ 1 ┆ false │
│ 149 ┆ Dragonite ┆ Dragon ┆ Flying ┆ … ┆ 100 ┆ 80 ┆ 1 ┆ false │
│ 150 ┆ Mewtwo ┆ Psychic ┆ null ┆ … ┆ 90 ┆ 130 ┆ 1 ┆ true │
└─────┴───────────────────────┴─────────┴────────┴───┴─────────┴───────┴────────────┴───────────┘
# Name Type 1 Type 2 Total HP ... Defense Sp. Atk Sp. Def Speed Generation Legendary
0 1 Bulbasaur Grass Poison 318 45 ... 49 65 65 45 1 False
1 2 Ivysaur Grass Poison 405 60 ... 63 80 80 60 1 False
2 3 Venusaur Grass Poison 525 80 ... 83 100 100 80 1 False
3 3 VenusaurMega Venusaur Grass Poison 625 80 ... 123 122 120 80 1 False
4 4 Charmander Fire NaN 309 39 ... 43 60 50 65 1 False
.. ... ... ... ... ... ... ... ... ... ... ... ... ...
158 146 Moltres Fire Flying 580 90 ... 90 125 85 90 1 True
159 147 Dratini Dragon NaN 300 41 ... 45 50 50 50 1 False
160 148 Dragonair Dragon NaN 420 61 ... 65 70 70 70 1 False
161 149 Dragonite Dragon Flying 600 91 ... 95 100 100 80 1 False
162 150 Mewtwo Psychic NaN 680 106 ... 90 154 90 130 1 True
[163 rows x 13 columns]
Group by aggregations in selection
To ensure comparable results between Polars
and Pandas
in Type 2
, it's necessary to handle NaN
values. We can achieve this by filling the NaN
values with a specified placeholder
in Pandas
.
out_pl = df_pl.select(
"Type 1",
"Type 2",
pl.col("Attack").mean().over("Type 1").alias("avg_attack_by_type"),
pl.col("Defense")
.mean()
.over(["Type 1", "Type 2"])
.alias("avg_defense_by_type_combination"),
pl.col("Attack").mean().alias("avg_attack"),
)
print(out_pl)
shape: (163, 5)
┌─────────┬────────┬────────────────────┬─────────────────────────────────┬────────────┐
│ Type 1 ┆ Type 2 ┆ avg_attack_by_type ┆ avg_defense_by_type_combination ┆ avg_attack │
│ --- ┆ --- ┆ --- ┆ --- ┆ --- │
│ str ┆ str ┆ f64 ┆ f64 ┆ f64 │
╞═════════╪════════╪════════════════════╪═════════════════════════════════╪════════════╡
│ Grass ┆ Poison ┆ 72.923077 ┆ 67.8 ┆ 75.349693 │
│ Grass ┆ Poison ┆ 72.923077 ┆ 67.8 ┆ 75.349693 │
│ Grass ┆ Poison ┆ 72.923077 ┆ 67.8 ┆ 75.349693 │
│ Grass ┆ Poison ┆ 72.923077 ┆ 67.8 ┆ 75.349693 │
│ … ┆ … ┆ … ┆ … ┆ … │
│ Dragon ┆ null ┆ 94.0 ┆ 55.0 ┆ 75.349693 │
│ Dragon ┆ null ┆ 94.0 ┆ 55.0 ┆ 75.349693 │
│ Dragon ┆ Flying ┆ 94.0 ┆ 95.0 ┆ 75.349693 │
│ Psychic ┆ null ┆ 53.875 ┆ 51.428571 ┆ 75.349693 │
└─────────┴────────┴────────────────────┴─────────────────────────────────┴────────────┘
out_pd = (
df_pd.loc[:, ["Type 1", "Type 2", "Attack", "Defense"]]
.assign(
avg_attack_by_type=lambda df_: df_[["Type 1", "Attack"]]
.groupby(["Type 1"])
.transform("mean"),
avg_defense_by_type_combination=lambda df_: (
df_[["Type 1", "Type 2", "Defense"]]
.fillna({"Type 2": "placeholder"})
.groupby(["Type 1", "Type 2"])
.transform("mean")
),
avg_attack=lambda df_: df_.Attack.mean(),
)
.drop(columns=["Attack", "Defense"])
)
print(out_pd)
Type 1 Type 2 avg_attack_by_type avg_defense_by_type_combination avg_attack
0 Grass Poison 72.923077 67.800000 75.349693
1 Grass Poison 72.923077 67.800000 75.349693
2 Grass Poison 72.923077 67.800000 75.349693
3 Grass Poison 72.923077 67.800000 75.349693
4 Fire NaN 88.642857 58.300000 75.349693
.. ... ... ... ... ...
158 Fire Flying 88.642857 82.000000 75.349693
159 Dragon NaN 94.000000 55.000000 75.349693
160 Dragon NaN 94.000000 55.000000 75.349693
161 Dragon Flying 94.000000 95.000000 75.349693
162 Psychic NaN 53.875000 51.428571 75.349693
[163 rows x 5 columns]
Operations per group
To achieve consistent results between Polars
and Pandas
, a trick is employed: duplicate a dummy column from Type 1
to use as the by
parameter in the groupby(by=..)
operation in Pandas
.
This version maintains clarity while shortening the sentence slightly.
out_pl = (
df_pl.filter(pl.col("Type 2") == "Psychic")
.select("Name", "Type 1", "Speed")
.with_columns(
pl.col(["Name", "Speed"]).sort_by("Speed", descending=True).over("Type 1")
)
)
print(out_pl)
shape: (7, 3)
┌─────────────────────┬────────┬───────┐
│ Name ┆ Type 1 ┆ Speed │
│ --- ┆ --- ┆ --- │
│ str ┆ str ┆ i64 │
╞═════════════════════╪════════╪═══════╡
│ Starmie ┆ Water ┆ 115 │
│ Slowbro ┆ Water ┆ 30 │
│ SlowbroMega Slowbro ┆ Water ┆ 30 │
│ Exeggutor ┆ Grass ┆ 55 │
│ Exeggcute ┆ Grass ┆ 40 │
│ Slowpoke ┆ Water ┆ 15 │
│ Jynx ┆ Ice ┆ 95 │
└─────────────────────┴────────┴───────┘
One more Example
out_pl = df_pl.sort("Type 1").select(
pl.col("Type 1").head(3).over("Type 1", mapping_strategy="explode"),
pl.col("Name")
.sort_by(pl.col("Speed"), descending=True)
.head(3)
.over("Type 1", mapping_strategy="explode")
.alias("fastest/group"),
pl.col("Name")
.sort_by(pl.col("Attack"), descending=True)
.head(3)
.over("Type 1", mapping_strategy="explode")
.alias("strongest/group"),
pl.col("Name")
.sort()
.head(3)
.over("Type 1", mapping_strategy="explode")
.alias("sorted_by_alphabet"),
)
print(out_pl)
shape: (43, 4)
┌────────┬───────────────────────┬───────────────────────┬─────────────────────────┐
│ Type 1 ┆ fastest/group ┆ strongest/group ┆ sorted_by_alphabet │
│ --- ┆ --- ┆ --- ┆ --- │
│ str ┆ str ┆ str ┆ str │
╞════════╪═══════════════════════╪═══════════════════════╪═════════════════════════╡
│ Bug ┆ BeedrillMega Beedrill ┆ PinsirMega Pinsir ┆ Beedrill │
│ Bug ┆ Scyther ┆ BeedrillMega Beedrill ┆ BeedrillMega Beedrill │
│ Bug ┆ PinsirMega Pinsir ┆ Pinsir ┆ Butterfree │
│ Dragon ┆ Dragonite ┆ Dragonite ┆ Dragonair │
│ … ┆ … ┆ … ┆ … │
│ Rock ┆ Kabutops ┆ Kabutops ┆ Geodude │
│ Water ┆ Starmie ┆ GyaradosMega Gyarados ┆ Blastoise │
│ Water ┆ Tentacruel ┆ Kingler ┆ BlastoiseMega Blastoise │
│ Water ┆ Poliwag ┆ Gyarados ┆ Cloyster │
└────────┴───────────────────────┴───────────────────────┴─────────────────────────┘
def _process_speed(df_):
return (
df_.loc[:, ["Type 1", "Speed", "Name"]]
.sort_values("Speed", ascending=False)
.groupby("Type 1")
.agg(list)
.assign(**{"fastest/group": lambda df_: df_["Name"].str.slice(0, 3)})
.explode("fastest/group")
.loc[:, "fastest/group"]
)
def _process_attack(df_):
return (
df_.loc[:, ["Type 1", "Attack", "Name"]]
.sort_values("Attack", ascending=False)
.groupby("Type 1")
.agg(list)
.assign(**{"strongest/group": lambda df_: df_["Name"].str.slice(0, 3)})
.explode("strongest/group")
.loc[:, "strongest/group"]
)
def _process_name(df_):
return (
df_.loc[:, ["Type 1", "Name"]]
.sort_values("Name")
.groupby("Type 1")
.agg(list)
.assign(**{"sorted_by_alphabet": lambda df_: df_["Name"].str.slice(0, 3)})
.explode("sorted_by_alphabet")
.loc[:, "sorted_by_alphabet"]
)
def window_pipeline(df_):
s_speed = _process_speed(df_)
s_attack = _process_attack(df_)
s_name = _process_name(df_)
return pd.concat([s_speed, s_attack, s_name], axis="columns")
out_pd = (
df_pd.loc[:, ["Type 1", "Speed", "Attack", "Name"]]
.pipe(window_pipeline)
.reset_index()
)
with pd.option_context("display.max_rows", 15):
print(out_pd)
Type 1 fastest/group strongest/group sorted_by_alphabet
0 Bug BeedrillMega Beedrill PinsirMega Pinsir Beedrill
1 Bug Scyther BeedrillMega Beedrill BeedrillMega Beedrill
2 Bug PinsirMega Pinsir Pinsir Butterfree
3 Dragon Dragonite Dragonite Dragonair
4 Dragon Dragonair Dragonair Dragonite
.. ... ... ... ...
38 Rock Aerodactyl Golem AerodactylMega Aerodactyl
39 Rock Kabutops Kabutops Geodude
40 Water Starmie GyaradosMega Gyarados Blastoise
41 Water Tentacruel Kingler BlastoiseMega Blastoise
42 Water Poliwag Gyarados Cloyster
[43 rows x 4 columns]
Reference
The examples in this section have been adapted from the Polars
user guide.