Aggregation

Setup

from datetime import date

import pandas as pd
import polars as pl

pl.enable_string_cache()
pd.set_option("display.max_seq_items", 3)
url = "https://theunitedstates.io/congress-legislators/legislators-historical.csv"

Polars_dfPandas_df

dtypes = {
    "first_name": pl.Categorical,
    "gender": pl.Categorical,
    "type": pl.Categorical,
    "state": pl.Categorical,
    "party": pl.Categorical,
}


df_pl = pl.read_csv(url, dtypes=dtypes).with_columns(
    pl.col("birthday").str.to_date(strict=False)
)
print(df_pl)

┌─── │ │ --- │ str │ ╞═══ │ Bassett │ │ │ Bland │ │ │ Burke │ Carroll │ … │ Sasse │ │ │ │ │ Stewart │ └──

href="#__codelineno-0-1">shape: (12_139, 36) ────────┬────────────┬─────────────┬────────┬───┬──────────────────┬──────────────────┬──────────┬─────────────────┐ last_name ┆ first_name ┆ middle_name ┆ suffix ┆ … ┆ ballotpedia_id ┆ washington_post_ ┆ icpsr_id ┆ wikipedia_id │ ┆ --- ┆ --- ┆ --- ┆ ┆ --- ┆ id ┆ --- ┆ --- │ ┆ cat ┆ str ┆ str ┆ ┆ str ┆ --- ┆ i64 ┆ str │ ┆ ┆ ┆ ┆ ┆ ┆ str ┆ ┆ │ ════════╪════════════╪═════════════╪════════╪═══╪══════════════════╪══════════════════╪══════════╪═════════════════╡ ┆ Richard ┆ null ┆ null ┆ … ┆ null ┆ null ┆ 507 ┆ Richard Bassett │ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ (Delaware │ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ politi… │ ┆ Theodorick ┆ null ┆ null ┆ … ┆ null ┆ null ┆ 786 ┆ Theodorick │ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ Bland │ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ (congressman) │ ┆ Aedanus ┆ null ┆ null ┆ … ┆ null ┆ null ┆ 1260 ┆ Aedanus Burke │ ┆ Daniel ┆ null ┆ null ┆ … ┆ null ┆ null ┆ 1538 ┆ Daniel Carroll │ ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … │ ┆ Benjamin ┆ Eric ┆ null ┆ … ┆ Ben Sasse ┆ null ┆ 41503 ┆ Ben Sasse │ Feinstein ┆ Dianne ┆ null ┆ null ┆ … ┆ Dianne Feinstein ┆ null ┆ 49300 ┆ Dianne │ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ Feinstein │ Cicilline ┆ David ┆ N. ┆ null ┆ … ┆ David N. ┆ null ┆ 21172 ┆ David Cicilline │ ┆ ┆ ┆ ┆ ┆ Cicilline ┆ ┆ ┆ │ ┆ Chris ┆ null ┆ null ┆ … ┆ Chris Stewart ┆ null ┆ 21367 ┆ Chris Stewart │ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ (politician) │ ─────────┴────────────┴─────────────┴────────┴───┴──────────────────┴──────────────────┴──────────┴─────────────────┘

dtype = {
    "first_name": "category",
    "gender": "category",
    "type": "category",
    "state": "category",
    "party": "category",
}
df_pd = pd.read_csv(url, dtype=dtype).assign(
    birthday=lambda df_: pd.to_datetime(df_.birthday, errors="coerce")
)
print(df_pd)

       last_name  first_name middle_name  ... washington_post_id icpsr_id                           wikipedia_id
0        Bassett     Richard         NaN  ...                NaN    507.0  Richard Bassett (Delaware politician)
1          Bland  Theodorick         NaN  ...                NaN    786.0         Theodorick Bland (congressman)
2          Burke     Aedanus         NaN  ...                NaN   1260.0                          Aedanus Burke
3        Carroll      Daniel         NaN  ...                NaN   1538.0                         Daniel Carroll
4         Clymer      George         NaN  ...                NaN   1859.0                          George Clymer
...          ...         ...         ...  ...                ...      ...                                    ...
12134     Inhofe       James          M.  ...                NaN  15424.0                             Jim Inhofe
12135      Sasse    Benjamin        Eric  ...                NaN  41503.0                              Ben Sasse
12136  Feinstein      Dianne         NaN  ...                NaN  49300.0                       Dianne Feinstein
12137  Cicilline       David          N.  ...                NaN  21172.0                        David Cicilline
12138    Stewart       Chris         NaN  ...                NaN  21367.0             Chris Stewart (politician)

[12139 rows x 36 columns]

Basic aggregations

PolarsPandas

out_pl = (
    df_pl.lazy()
    .group_by("first_name")
    .agg(pl.count(), pl.col("gender"), pl.first("last_name"))
    .sort("count", descending=True)
    .limit(5)
    .collect()
)
print(out_pl)

shape: (5, 4)
┌────────────┬───────┬───────────────────┬───────────┐
│ first_name ┆ count ┆ gender            ┆ last_name │
│ ---        ┆ ---   ┆ ---               ┆ ---       │
│ cat        ┆ u32   ┆ list[cat]         ┆ str       │
╞════════════╪═══════╪═══════════════════╪═══════════╡
│ John       ┆ 1256  ┆ ["M", "M", … "M"] ┆ Walker    │
│ William    ┆ 1022  ┆ ["M", "M", … "M"] ┆ Few       │
│ James      ┆ 714   ┆ ["M", "M", … "M"] ┆ Armstrong │
│ Thomas     ┆ 454   ┆ ["M", "M", … "M"] ┆ Tucker    │
│ Charles    ┆ 439   ┆ ["M", "M", … "M"] ┆ Carroll   │
└────────────┴───────┴───────────────────┴───────────┘

out_pd = (
    df_pd.assign(count=0)
    .groupby("first_name", observed=True)
    .agg(
        count=("first_name", "size"),
        gender=("gender", list),
        last_name=("last_name", "first"),
    )
    .sort_values("count", ascending=False)
    .reset_index()
    .head(5)
)
print(out_pd)

  first_name  count          gender  last_name
0       John   1256  [M, M, M, ...]     Walker
1    William   1022  [M, M, M, ...]        Few
2      James    714  [M, M, M, ...]  Armstrong
3     Thomas    454  [M, M, M, ...]     Tucker
4    Charles    439  [M, M, M, ...]    Carroll

Conditionals

Type 1

While the pro column is correctly sorted in Pandas, the order of the state and anti columns does not entirely match the result in Polars.

PolarsPandas

out_pl = (
    df_pl.lazy()
    .group_by("state")
    .agg(
        (pl.col("party") == "Anti-Administration").sum().alias("anti"),
        (pl.col("party") == "Pro-Administration").sum().alias("pro"),
    )
    .sort("pro", descending=True)
    .limit(5)
    .collect()
)
print(out_pl)

shape: (5, 3)
┌───────┬──────┬─────┐
│ state ┆ anti ┆ pro │
│ ---   ┆ ---  ┆ --- │
│ cat   ┆ u32  ┆ u32 │
╞═══════╪══════╪═════╡
│ CT    ┆ 0    ┆ 3   │
│ NJ    ┆ 0    ┆ 3   │
│ NC    ┆ 1    ┆ 2   │
│ SC    ┆ 0    ┆ 1   │
│ PA    ┆ 1    ┆ 1   │
└───────┴──────┴─────┘

out_pd = (
    df_pd.groupby("state", observed=True)
    .agg(
        anti=("party", lambda s_: (s_ == "Anti-Administration").sum()),
        pro=("party", lambda s_: (s_ == "Pro-Administration").sum()),
    )
    .sort_values("pro", ascending=False)
    .reset_index()
    .head(5)
)
print(out_pd)

  state  anti  pro
0    CT     0    3
1    NJ     0    3
2    NC     1    2
3    SC     0    1
4    PA     1    1

Type 2

While the count column is correctly sorted in Pandas, the order of the state and party columns does not entirely match the result in Polars.

PolarsPandas

out_pl = (
    df_pl.lazy()
    .group_by("state", "party")
    .agg(pl.count("party").alias("count"))
    .filter(
        (pl.col("party") == "Anti-Administration")
        | (pl.col("party") == "Pro-Administration")
    )
    .sort("count", descending=True)
    .limit(5)
    .collect()
)
print(out_pl)

shape: (5, 3)
┌───────┬─────────────────────┬───────┐
│ state ┆ party               ┆ count │
│ ---   ┆ ---                 ┆ ---   │
│ cat   ┆ cat                 ┆ u32   │
╞═══════╪═════════════════════╪═══════╡
│ CT    ┆ Pro-Administration  ┆ 3     │
│ VA    ┆ Anti-Administration ┆ 3     │
│ NJ    ┆ Pro-Administration  ┆ 3     │
│ NC    ┆ Pro-Administration  ┆ 2     │
│ SC    ┆ Pro-Administration  ┆ 1     │
└───────┴─────────────────────┴───────┘

out_pd = (
    df_pd.groupby(["state", "party"], observed=True)
    .agg(count=("party", "size"))
    .query("party == 'Anti-Administration' | party == 'Pro-Administration' ")
    .reset_index()
    .sort_values("count", ascending=False)
    .reset_index(drop=True)
    .head(5)
)
print(out_pd)

  state                party  count
0    CT   Pro-Administration      3
1    NJ   Pro-Administration      3
2    VA  Anti-Administration      3
3    NC   Pro-Administration      2
4    GA  Anti-Administration      1

Filtering

PolarsPandas

def compute_age() -> pl.Expr:
    return date(2021, 1, 1).year - pl.col("birthday").dt.year()


def avg_birthday(gender: str) -> pl.Expr:
    return (
        compute_age()
        .filter(pl.col("gender") == gender)
        .mean()
        .alias(f"avg {gender} birthday")
    )


out_pl = (
    df_pl.lazy()
    .group_by("state")
    .agg(
        avg_birthday("M"),
        avg_birthday("F"),
        (pl.col("gender") == "M").sum().alias("# male"),
        (pl.col("gender") == "F").sum().alias("# female"),
    )
    .sort(pl.col("state").cat.set_ordering("lexical"))
    .head(5)
    .collect()
)
print(out_pl)

shape: (5, 5)
┌───────┬────────────────┬────────────────┬────────┬──────────┐
│ state ┆ avg M birthday ┆ avg F birthday ┆ # male ┆ # female │
│ ---   ┆ ---            ┆ ---            ┆ ---    ┆ ---      │
│ cat   ┆ f64            ┆ f64            ┆ u32    ┆ u32      │
╞═══════╪════════════════╪════════════════╪════════╪══════════╡
│ AK    ┆ 120.411765     ┆ null           ┆ 17     ┆ 0        │
│ AL    ┆ 163.772727     ┆ 97.5           ┆ 207    ┆ 4        │
│ AR    ┆ 154.073394     ┆ 121.4          ┆ 112    ┆ 5        │
│ AS    ┆ 81.0           ┆ null           ┆ 2      ┆ 0        │
│ AZ    ┆ 114.586957     ┆ 76.8           ┆ 46     ┆ 5        │
└───────┴────────────────┴────────────────┴────────┴──────────┘

def _rename_and_reorder_cols(df_):
    df_.columns = ["avg F birthday", "avg M birthday", "# female", "# male"]
    return df_.loc[:, ["avg M birthday", "avg F birthday", "# male", "# female"]]


out_pd = (
    df_pd.groupby(["state", "gender"], observed=True)
    .agg(
        birthday=("birthday", lambda s_: (date(2021, 1, 1).year - s_.dt.year).mean()),
        count=("gender", "size"),
    )
    .unstack(level="gender")
    .pipe(_rename_and_reorder_cols)
    .fillna({"# female": 0, "# male": 0})
    .sort_index()
    .reset_index()
    .head(5)
)
print(out_pd)

  state  avg M birthday  avg F birthday  # male  # female
0    AK      120.411765             NaN    17.0       0.0
1    AL      163.772727            97.5   207.0       4.0
2    AR      154.073394           121.4   112.0       5.0
3    AS       81.000000             NaN     2.0       0.0
4    AZ      114.586957            76.8    46.0       5.0

Sorting

PolarsPandas

def get_person() -> pl.Expr:
    return pl.col("first_name") + pl.lit(" ") + pl.col("last_name")


out_pl = (
    df_pl.lazy()
    .with_columns(pl.col("first_name").cat.set_ordering("lexical"))
    .sort("birthday", descending=True)
    .group_by("state")
    .agg(
        get_person().first().alias("youngest"),
        get_person().last().alias("oldest"),
        get_person().sort().first().alias("alphabetical_first"),
        pl.col("gender").sort_by("first_name").first().alias("gender"),
    )
    .sort(pl.col("state").cat.set_ordering("lexical"))
    .limit(5)
    .collect()
)
print(out_pl)

shape: (5, 5)
┌───────┬──────────────────┬────────────────┬────────────────────┬────────┐
│ state ┆ youngest         ┆ oldest         ┆ alphabetical_first ┆ gender │
│ ---   ┆ ---              ┆ ---            ┆ ---                ┆ ---    │
│ cat   ┆ str              ┆ str            ┆ str                ┆ cat    │
╞═══════╪══════════════════╪════════════════╪════════════════════╪════════╡
│ AK    ┆ Mark Begich      ┆ Thomas Cale    ┆ Anthony Dimond     ┆ M      │
│ AL    ┆ Martha Roby      ┆ John McKee     ┆ Albert Goodwyn     ┆ M      │
│ AR    ┆ Tim Griffin      ┆ Archibald Yell ┆ Albert Rust        ┆ M      │
│ AS    ┆ Eni Faleomavaega ┆ Fofó Sunia     ┆ Eni Faleomavaega   ┆ M      │
│ AZ    ┆ Ben Quayle       ┆ Coles Bashford ┆ Ann Kirkpatrick    ┆ F      │
└───────┴──────────────────┴────────────────┴────────────────────┴────────┘

def get_alpha_info(s_):
    sorted_s = s_.sort_values()
    return "___".join([sorted_s.iloc[0], str(sorted_s.index[0])])


def _create_alpha_cols(df_):
    return (
        df_.alpha_info.str.split("___", expand=True)
        .rename(columns={0: "alphabetical_first", 1: "alpha_index"})
        .assign(alpha_index=lambda df2_: df2_.alpha_index.astype("int64"))
    )


def process_alpha_gender(df_):
    return (
        pd.concat([df_, _create_alpha_cols(df_)], axis="columns")
        .assign(
            gender=lambda df_: df_pd.loc[df_.alpha_index.tolist(), ["gender", "state"]]
            .set_index("state")
            .gender
        )
        .drop(columns=["alpha_info", "alpha_index"])
    )


out_pd = (
    df_pd.assign(
        shown_name=lambda df_: df_.first_name.astype(str)
        + " "
        + df_.last_name.astype(str)
    )
    .sort_values("birthday", ascending=False)
    .groupby("state", observed=True)
    .agg(
        youngest=("shown_name", "first"),
        oldest=("shown_name", "last"),
        alpha_info=("shown_name", get_alpha_info),
    )
    .pipe(process_alpha_gender)
    .sort_index()
    .reset_index()
    .head()
)
print(out_pd)

  state          youngest             oldest alphabetical_first gender
0    AK       Mark Begich        Thomas Cale     Anthony Dimond      M
1    AL       Martha Roby     Thomas Haughey     Albert Goodwyn      M
2    AR       Tim Griffin  William Sebastian        Albert Rust      M
3    AS  Eni Faleomavaega         Fofó Sunia   Eni Faleomavaega      M
4    AZ        Ben Quayle     Coles Bashford    Ann Kirkpatrick      F

Reference

The examples in this section have been adapted from the Polars user guide.