Skip to content

Window functions

Setup

import pandas as pd
import polars as pl

url = "https://gist.githubusercontent.com/ritchie46/cac6b337ea52281aa23c049250a4ff03/raw/89a957ff3919d90e6ef2d34235e6bf22304f3366/pokemon.csv"

df_pl = pl.read_csv(url)
print(df_pl)

shape: (163, 13)
┌─────┬───────────────────────┬─────────┬────────┬───┬─────────┬───────┬────────────┬───────────┐
│ #   ┆ Name                  ┆ Type 1  ┆ Type 2 ┆ … ┆ Sp. Def ┆ Speed ┆ Generation ┆ Legendary │
│ --- ┆ ---                   ┆ ---     ┆ ---    ┆   ┆ ---     ┆ ---   ┆ ---        ┆ ---       │
│ i64 ┆ str                   ┆ str     ┆ str    ┆   ┆ i64     ┆ i64   ┆ i64        ┆ bool      │
╞═════╪═══════════════════════╪═════════╪════════╪═══╪═════════╪═══════╪════════════╪═══════════╡
│ 1   ┆ Bulbasaur             ┆ Grass   ┆ Poison ┆ … ┆ 65      ┆ 45    ┆ 1          ┆ false     │
│ 2   ┆ Ivysaur               ┆ Grass   ┆ Poison ┆ … ┆ 80      ┆ 60    ┆ 1          ┆ false     │
│ 3   ┆ Venusaur              ┆ Grass   ┆ Poison ┆ … ┆ 100     ┆ 80    ┆ 1          ┆ false     │
│ 3   ┆ VenusaurMega Venusaur ┆ Grass   ┆ Poison ┆ … ┆ 120     ┆ 80    ┆ 1          ┆ false     │
│ …   ┆ …                     ┆ …       ┆ …      ┆ … ┆ …       ┆ …     ┆ …          ┆ …         │
│ 147 ┆ Dratini               ┆ Dragon  ┆ null   ┆ … ┆ 50      ┆ 50    ┆ 1          ┆ false     │
│ 148 ┆ Dragonair             ┆ Dragon  ┆ null   ┆ … ┆ 70      ┆ 70    ┆ 1          ┆ false     │
│ 149 ┆ Dragonite             ┆ Dragon  ┆ Flying ┆ … ┆ 100     ┆ 80    ┆ 1          ┆ false     │
│ 150 ┆ Mewtwo                ┆ Psychic ┆ null   ┆ … ┆ 90      ┆ 130   ┆ 1          ┆ true      │
└─────┴───────────────────────┴─────────┴────────┴───┴─────────┴───────┴────────────┴───────────┘

df_pd = pd.read_csv(url)
print(df_pd)

       #                   Name   Type 1  Type 2  Total   HP  ...  Defense  Sp. Atk  Sp. Def  Speed  Generation  Legendary
0      1              Bulbasaur    Grass  Poison    318   45  ...       49       65       65     45           1      False
1      2                Ivysaur    Grass  Poison    405   60  ...       63       80       80     60           1      False
2      3               Venusaur    Grass  Poison    525   80  ...       83      100      100     80           1      False
3      3  VenusaurMega Venusaur    Grass  Poison    625   80  ...      123      122      120     80           1      False
4      4             Charmander     Fire     NaN    309   39  ...       43       60       50     65           1      False
..   ...                    ...      ...     ...    ...  ...  ...      ...      ...      ...    ...         ...        ...
158  146                Moltres     Fire  Flying    580   90  ...       90      125       85     90           1       True
159  147                Dratini   Dragon     NaN    300   41  ...       45       50       50     50           1      False
160  148              Dragonair   Dragon     NaN    420   61  ...       65       70       70     70           1      False
161  149              Dragonite   Dragon  Flying    600   91  ...       95      100      100     80           1      False
162  150                 Mewtwo  Psychic     NaN    680  106  ...       90      154       90    130           1       True

[163 rows x 13 columns]

Group by aggregations in selection

To ensure comparable results between Polars and Pandas in Type 2, it's necessary to handle NaN values. We can achieve this by filling the NaN values with a specified placeholder in Pandas.

out_pl = df_pl.select(
    "Type 1",
    "Type 2",
    pl.col("Attack").mean().over("Type 1").alias("avg_attack_by_type"),
    pl.col("Defense")
    .mean()
    .over(["Type 1", "Type 2"])
    .alias("avg_defense_by_type_combination"),
    pl.col("Attack").mean().alias("avg_attack"),
)
print(out_pl)

shape: (163, 5)
┌─────────┬────────┬────────────────────┬─────────────────────────────────┬────────────┐
│ Type 1  ┆ Type 2 ┆ avg_attack_by_type ┆ avg_defense_by_type_combination ┆ avg_attack │
│ ---     ┆ ---    ┆ ---                ┆ ---                             ┆ ---        │
│ str     ┆ str    ┆ f64                ┆ f64                             ┆ f64        │
╞═════════╪════════╪════════════════════╪═════════════════════════════════╪════════════╡
│ Grass   ┆ Poison ┆ 72.923077          ┆ 67.8                            ┆ 75.349693  │
│ Grass   ┆ Poison ┆ 72.923077          ┆ 67.8                            ┆ 75.349693  │
│ Grass   ┆ Poison ┆ 72.923077          ┆ 67.8                            ┆ 75.349693  │
│ Grass   ┆ Poison ┆ 72.923077          ┆ 67.8                            ┆ 75.349693  │
│ …       ┆ …      ┆ …                  ┆ …                               ┆ …          │
│ Dragon  ┆ null   ┆ 94.0               ┆ 55.0                            ┆ 75.349693  │
│ Dragon  ┆ null   ┆ 94.0               ┆ 55.0                            ┆ 75.349693  │
│ Dragon  ┆ Flying ┆ 94.0               ┆ 95.0                            ┆ 75.349693  │
│ Psychic ┆ null   ┆ 53.875             ┆ 51.428571                       ┆ 75.349693  │
└─────────┴────────┴────────────────────┴─────────────────────────────────┴────────────┘

out_pd = (
    df_pd.loc[:, ["Type 1", "Type 2", "Attack", "Defense"]]
    .assign(
        avg_attack_by_type=lambda df_: df_[["Type 1", "Attack"]]
        .groupby(["Type 1"])
        .transform("mean"),
        avg_defense_by_type_combination=lambda df_: (
            df_[["Type 1", "Type 2", "Defense"]]
            .fillna({"Type 2": "placeholder"})
            .groupby(["Type 1", "Type 2"])
            .transform("mean")
        ),
        avg_attack=lambda df_: df_.Attack.mean(),
    )
    .drop(columns=["Attack", "Defense"])
)
print(out_pd)

      Type 1  Type 2  avg_attack_by_type  avg_defense_by_type_combination  avg_attack
0      Grass  Poison           72.923077                        67.800000   75.349693
1      Grass  Poison           72.923077                        67.800000   75.349693
2      Grass  Poison           72.923077                        67.800000   75.349693
3      Grass  Poison           72.923077                        67.800000   75.349693
4       Fire     NaN           88.642857                        58.300000   75.349693
..       ...     ...                 ...                              ...         ...
158     Fire  Flying           88.642857                        82.000000   75.349693
159   Dragon     NaN           94.000000                        55.000000   75.349693
160   Dragon     NaN           94.000000                        55.000000   75.349693
161   Dragon  Flying           94.000000                        95.000000   75.349693
162  Psychic     NaN           53.875000                        51.428571   75.349693

[163 rows x 5 columns]

Operations per group

To achieve consistent results between Polars and Pandas, a trick is employed: duplicate a dummy column from Type 1 to use as the by parameter in the groupby(by=..) operation in Pandas.

This version maintains clarity while shortening the sentence slightly.

out_pl = (
    df_pl.filter(pl.col("Type 2") == "Psychic")
    .select("Name", "Type 1", "Speed")
    .with_columns(
        pl.col(["Name", "Speed"]).sort_by("Speed", descending=True).over("Type 1")
    )
)
print(out_pl)

shape: (7, 3)
┌─────────────────────┬────────┬───────┐
│ Name                ┆ Type 1 ┆ Speed │
│ ---                 ┆ ---    ┆ ---   │
│ str                 ┆ str    ┆ i64   │
╞═════════════════════╪════════╪═══════╡
│ Starmie             ┆ Water  ┆ 115   │
│ Slowbro             ┆ Water  ┆ 30    │
│ SlowbroMega Slowbro ┆ Water  ┆ 30    │
│ Exeggutor           ┆ Grass  ┆ 55    │
│ Exeggcute           ┆ Grass  ┆ 40    │
│ Slowpoke            ┆ Water  ┆ 15    │
│ Jynx                ┆ Ice    ┆ 95    │
└─────────────────────┴────────┴───────┘

out_pd = (
    df_pd.query("`Type 2` == 'Psychic'")
    .loc[:, ["Name", "Type 1", "Speed"]]
    .assign(type1_tmp=lambda df_: df_["Type 1"])
    .groupby("type1_tmp")
    .transform(lambda g: sorted(g, reverse=True))
)
print(out_pd)

                    Name Type 1  Speed
85               Starmie  Water    115
86              Slowpoke  Water     30
87   SlowbroMega Slowbro  Water     30
110            Exeggutor  Grass     55
111            Exeggcute  Grass     40
130              Slowbro  Water     15
133                 Jynx    Ice     95

One more Example

out_pl = df_pl.sort("Type 1").select(
    pl.col("Type 1").head(3).over("Type 1", mapping_strategy="explode"),
    pl.col("Name")
    .sort_by(pl.col("Speed"), descending=True)
    .head(3)
    .over("Type 1", mapping_strategy="explode")
    .alias("fastest/group"),
    pl.col("Name")
    .sort_by(pl.col("Attack"), descending=True)
    .head(3)
    .over("Type 1", mapping_strategy="explode")
    .alias("strongest/group"),
    pl.col("Name")
    .sort()
    .head(3)
    .over("Type 1", mapping_strategy="explode")
    .alias("sorted_by_alphabet"),
)
print(out_pl)

shape: (43, 4)
┌────────┬───────────────────────┬───────────────────────┬─────────────────────────┐
│ Type 1 ┆ fastest/group         ┆ strongest/group       ┆ sorted_by_alphabet      │
│ ---    ┆ ---                   ┆ ---                   ┆ ---                     │
│ str    ┆ str                   ┆ str                   ┆ str                     │
╞════════╪═══════════════════════╪═══════════════════════╪═════════════════════════╡
│ Bug    ┆ BeedrillMega Beedrill ┆ PinsirMega Pinsir     ┆ Beedrill                │
│ Bug    ┆ Scyther               ┆ BeedrillMega Beedrill ┆ BeedrillMega Beedrill   │
│ Bug    ┆ PinsirMega Pinsir     ┆ Pinsir                ┆ Butterfree              │
│ Dragon ┆ Dragonite             ┆ Dragonite             ┆ Dragonair               │
│ …      ┆ …                     ┆ …                     ┆ …                       │
│ Rock   ┆ Kabutops              ┆ Kabutops              ┆ Geodude                 │
│ Water  ┆ Starmie               ┆ GyaradosMega Gyarados ┆ Blastoise               │
│ Water  ┆ Tentacruel            ┆ Kingler               ┆ BlastoiseMega Blastoise │
│ Water  ┆ Poliwag               ┆ Gyarados              ┆ Cloyster                │
└────────┴───────────────────────┴───────────────────────┴─────────────────────────┘

def _process_speed(df_):
    return (
        df_.loc[:, ["Type 1", "Speed", "Name"]]
        .sort_values("Speed", ascending=False)
        .groupby("Type 1")
        .agg(list)
        .assign(**{"fastest/group": lambda df_: df_["Name"].str.slice(0, 3)})
        .explode("fastest/group")
        .loc[:, "fastest/group"]
    )


def _process_attack(df_):
    return (
        df_.loc[:, ["Type 1", "Attack", "Name"]]
        .sort_values("Attack", ascending=False)
        .groupby("Type 1")
        .agg(list)
        .assign(**{"strongest/group": lambda df_: df_["Name"].str.slice(0, 3)})
        .explode("strongest/group")
        .loc[:, "strongest/group"]
    )


def _process_name(df_):
    return (
        df_.loc[:, ["Type 1", "Name"]]
        .sort_values("Name")
        .groupby("Type 1")
        .agg(list)
        .assign(**{"sorted_by_alphabet": lambda df_: df_["Name"].str.slice(0, 3)})
        .explode("sorted_by_alphabet")
        .loc[:, "sorted_by_alphabet"]
    )


def window_pipeline(df_):
    s_speed = _process_speed(df_)
    s_attack = _process_attack(df_)
    s_name = _process_name(df_)
    return pd.concat([s_speed, s_attack, s_name], axis="columns")


out_pd = (
    df_pd.loc[:, ["Type 1", "Speed", "Attack", "Name"]]
    .pipe(window_pipeline)
    .reset_index()
)
with pd.option_context("display.max_rows", 15):
    print(out_pd)

    Type 1          fastest/group        strongest/group         sorted_by_alphabet
0      Bug  BeedrillMega Beedrill      PinsirMega Pinsir                   Beedrill
1      Bug                Scyther  BeedrillMega Beedrill      BeedrillMega Beedrill
2      Bug      PinsirMega Pinsir                 Pinsir                 Butterfree
3   Dragon              Dragonite              Dragonite                  Dragonair
4   Dragon              Dragonair              Dragonair                  Dragonite
..     ...                    ...                    ...                        ...
38    Rock             Aerodactyl                  Golem  AerodactylMega Aerodactyl
39    Rock               Kabutops               Kabutops                    Geodude
40   Water                Starmie  GyaradosMega Gyarados                  Blastoise
41   Water             Tentacruel                Kingler    BlastoiseMega Blastoise
42   Water                Poliwag               Gyarados                   Cloyster

[43 rows x 4 columns]

Reference

The examples in this section have been adapted from the Polars user guide.