Aggregation
Setup
from datetime import date
import pandas as pd
import polars as pl
pl.enable_string_cache()
pd.set_option("display.max_seq_items", 3)
url = "https://theunitedstates.io/congress-legislators/legislators-historical.csv"
dtypes = {
"first_name": pl.Categorical,
"gender": pl.Categorical,
"type": pl.Categorical,
"state": pl.Categorical,
"party": pl.Categorical,
}
df_pl = pl.read_csv(url, dtypes=dtypes).with_columns(
pl.col("birthday").str.to_date(strict=False)
)
print(df_pl)
shape: (12_139, 36)
┌───────────┬────────────┬─────────────┬────────┬───┬──────────────────┬──────────────────┬──────────┬─────────────────┐
│ last_name ┆ first_name ┆ middle_name ┆ suffix ┆ … ┆ ballotpedia_id ┆ washington_post_ ┆ icpsr_id ┆ wikipedia_id │
│ --- ┆ --- ┆ --- ┆ --- ┆ ┆ --- ┆ id ┆ --- ┆ --- │
│ str ┆ cat ┆ str ┆ str ┆ ┆ str ┆ --- ┆ i64 ┆ str │
│ ┆ ┆ ┆ ┆ ┆ ┆ str ┆ ┆ │
╞═══════════╪════════════╪═════════════╪════════╪═══╪══════════════════╪══════════════════╪══════════╪═════════════════╡
│ Bassett ┆ Richard ┆ null ┆ null ┆ … ┆ null ┆ null ┆ 507 ┆ Richard Bassett │
│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ (Delaware │
│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ politi… │
│ Bland ┆ Theodorick ┆ null ┆ null ┆ … ┆ null ┆ null ┆ 786 ┆ Theodorick │
│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ Bland │
│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ (congressman) │
│ Burke ┆ Aedanus ┆ null ┆ null ┆ … ┆ null ┆ null ┆ 1260 ┆ Aedanus Burke │
│ Carroll ┆ Daniel ┆ null ┆ null ┆ … ┆ null ┆ null ┆ 1538 ┆ Daniel Carroll │
│ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … │
│ Sasse ┆ Benjamin ┆ Eric ┆ null ┆ … ┆ Ben Sasse ┆ null ┆ 41503 ┆ Ben Sasse │
│ Feinstein ┆ Dianne ┆ null ┆ null ┆ … ┆ Dianne Feinstein ┆ null ┆ 49300 ┆ Dianne │
│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ Feinstein │
│ Cicilline ┆ David ┆ N. ┆ null ┆ … ┆ David N. ┆ null ┆ 21172 ┆ David Cicilline │
│ ┆ ┆ ┆ ┆ ┆ Cicilline ┆ ┆ ┆ │
│ Stewart ┆ Chris ┆ null ┆ null ┆ … ┆ Chris Stewart ┆ null ┆ 21367 ┆ Chris Stewart │
│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ (politician) │
└───────────┴────────────┴─────────────┴────────┴───┴──────────────────┴──────────────────┴──────────┴─────────────────┘
dtype = {
"first_name": "category",
"gender": "category",
"type": "category",
"state": "category",
"party": "category",
}
df_pd = pd.read_csv(url, dtype=dtype).assign(
birthday=lambda df_: pd.to_datetime(df_.birthday, errors="coerce")
)
print(df_pd)
last_name first_name middle_name ... washington_post_id icpsr_id wikipedia_id
0 Bassett Richard NaN ... NaN 507.0 Richard Bassett (Delaware politician)
1 Bland Theodorick NaN ... NaN 786.0 Theodorick Bland (congressman)
2 Burke Aedanus NaN ... NaN 1260.0 Aedanus Burke
3 Carroll Daniel NaN ... NaN 1538.0 Daniel Carroll
4 Clymer George NaN ... NaN 1859.0 George Clymer
... ... ... ... ... ... ... ...
12134 Inhofe James M. ... NaN 15424.0 Jim Inhofe
12135 Sasse Benjamin Eric ... NaN 41503.0 Ben Sasse
12136 Feinstein Dianne NaN ... NaN 49300.0 Dianne Feinstein
12137 Cicilline David N. ... NaN 21172.0 David Cicilline
12138 Stewart Chris NaN ... NaN 21367.0 Chris Stewart (politician)
[12139 rows x 36 columns]
Basic aggregations
out_pl = (
df_pl.lazy()
.group_by("first_name")
.agg(pl.count(), pl.col("gender"), pl.first("last_name"))
.sort("count", descending=True)
.limit(5)
.collect()
)
print(out_pl)
shape: (5, 4)
┌────────────┬───────┬───────────────────┬───────────┐
│ first_name ┆ count ┆ gender ┆ last_name │
│ --- ┆ --- ┆ --- ┆ --- │
│ cat ┆ u32 ┆ list[cat] ┆ str │
╞════════════╪═══════╪═══════════════════╪═══════════╡
│ John ┆ 1256 ┆ ["M", "M", … "M"] ┆ Walker │
│ William ┆ 1022 ┆ ["M", "M", … "M"] ┆ Few │
│ James ┆ 714 ┆ ["M", "M", … "M"] ┆ Armstrong │
│ Thomas ┆ 454 ┆ ["M", "M", … "M"] ┆ Tucker │
│ Charles ┆ 439 ┆ ["M", "M", … "M"] ┆ Carroll │
└────────────┴───────┴───────────────────┴───────────┘
Conditionals
Type 1
While the pro
column is correctly sorted in Pandas
, the order of the state
and anti
columns does not entirely match the result in Polars
.
Type 2
While the count
column is correctly sorted in Pandas
, the order of the state
and party
columns does not entirely match the result in Polars
.
out_pl = (
df_pl.lazy()
.group_by("state", "party")
.agg(pl.count("party").alias("count"))
.filter(
(pl.col("party") == "Anti-Administration")
| (pl.col("party") == "Pro-Administration")
)
.sort("count", descending=True)
.limit(5)
.collect()
)
print(out_pl)
shape: (5, 3)
┌───────┬─────────────────────┬───────┐
│ state ┆ party ┆ count │
│ --- ┆ --- ┆ --- │
│ cat ┆ cat ┆ u32 │
╞═══════╪═════════════════════╪═══════╡
│ CT ┆ Pro-Administration ┆ 3 │
│ VA ┆ Anti-Administration ┆ 3 │
│ NJ ┆ Pro-Administration ┆ 3 │
│ NC ┆ Pro-Administration ┆ 2 │
│ SC ┆ Pro-Administration ┆ 1 │
└───────┴─────────────────────┴───────┘
Filtering
def compute_age() -> pl.Expr:
return date(2021, 1, 1).year - pl.col("birthday").dt.year()
def avg_birthday(gender: str) -> pl.Expr:
return (
compute_age()
.filter(pl.col("gender") == gender)
.mean()
.alias(f"avg {gender} birthday")
)
out_pl = (
df_pl.lazy()
.group_by("state")
.agg(
avg_birthday("M"),
avg_birthday("F"),
(pl.col("gender") == "M").sum().alias("# male"),
(pl.col("gender") == "F").sum().alias("# female"),
)
.sort(pl.col("state").cat.set_ordering("lexical"))
.head(5)
.collect()
)
print(out_pl)
shape: (5, 5)
┌───────┬────────────────┬────────────────┬────────┬──────────┐
│ state ┆ avg M birthday ┆ avg F birthday ┆ # male ┆ # female │
│ --- ┆ --- ┆ --- ┆ --- ┆ --- │
│ cat ┆ f64 ┆ f64 ┆ u32 ┆ u32 │
╞═══════╪════════════════╪════════════════╪════════╪══════════╡
│ AK ┆ 120.411765 ┆ null ┆ 17 ┆ 0 │
│ AL ┆ 163.772727 ┆ 97.5 ┆ 207 ┆ 4 │
│ AR ┆ 154.073394 ┆ 121.4 ┆ 112 ┆ 5 │
│ AS ┆ 81.0 ┆ null ┆ 2 ┆ 0 │
│ AZ ┆ 114.586957 ┆ 76.8 ┆ 46 ┆ 5 │
└───────┴────────────────┴────────────────┴────────┴──────────┘
def _rename_and_reorder_cols(df_):
df_.columns = ["avg F birthday", "avg M birthday", "# female", "# male"]
return df_.loc[:, ["avg M birthday", "avg F birthday", "# male", "# female"]]
out_pd = (
df_pd.groupby(["state", "gender"], observed=True)
.agg(
birthday=("birthday", lambda s_: (date(2021, 1, 1).year - s_.dt.year).mean()),
count=("gender", "size"),
)
.unstack(level="gender")
.pipe(_rename_and_reorder_cols)
.fillna({"# female": 0, "# male": 0})
.sort_index()
.reset_index()
.head(5)
)
print(out_pd)
Sorting
def get_person() -> pl.Expr:
return pl.col("first_name") + pl.lit(" ") + pl.col("last_name")
out_pl = (
df_pl.lazy()
.with_columns(pl.col("first_name").cat.set_ordering("lexical"))
.sort("birthday", descending=True)
.group_by("state")
.agg(
get_person().first().alias("youngest"),
get_person().last().alias("oldest"),
get_person().sort().first().alias("alphabetical_first"),
pl.col("gender").sort_by("first_name").first().alias("gender"),
)
.sort(pl.col("state").cat.set_ordering("lexical"))
.limit(5)
.collect()
)
print(out_pl)
shape: (5, 5)
┌───────┬──────────────────┬────────────────┬────────────────────┬────────┐
│ state ┆ youngest ┆ oldest ┆ alphabetical_first ┆ gender │
│ --- ┆ --- ┆ --- ┆ --- ┆ --- │
│ cat ┆ str ┆ str ┆ str ┆ cat │
╞═══════╪══════════════════╪════════════════╪════════════════════╪════════╡
│ AK ┆ Mark Begich ┆ Thomas Cale ┆ Anthony Dimond ┆ M │
│ AL ┆ Martha Roby ┆ John McKee ┆ Albert Goodwyn ┆ M │
│ AR ┆ Tim Griffin ┆ Archibald Yell ┆ Albert Rust ┆ M │
│ AS ┆ Eni Faleomavaega ┆ Fofó Sunia ┆ Eni Faleomavaega ┆ M │
│ AZ ┆ Ben Quayle ┆ Coles Bashford ┆ Ann Kirkpatrick ┆ F │
└───────┴──────────────────┴────────────────┴────────────────────┴────────┘
def get_alpha_info(s_):
sorted_s = s_.sort_values()
return "___".join([sorted_s.iloc[0], str(sorted_s.index[0])])
def _create_alpha_cols(df_):
return (
df_.alpha_info.str.split("___", expand=True)
.rename(columns={0: "alphabetical_first", 1: "alpha_index"})
.assign(alpha_index=lambda df2_: df2_.alpha_index.astype("int64"))
)
def process_alpha_gender(df_):
return (
pd.concat([df_, _create_alpha_cols(df_)], axis="columns")
.assign(
gender=lambda df_: df_pd.loc[df_.alpha_index.tolist(), ["gender", "state"]]
.set_index("state")
.gender
)
.drop(columns=["alpha_info", "alpha_index"])
)
out_pd = (
df_pd.assign(
shown_name=lambda df_: df_.first_name.astype(str)
+ " "
+ df_.last_name.astype(str)
)
.sort_values("birthday", ascending=False)
.groupby("state", observed=True)
.agg(
youngest=("shown_name", "first"),
oldest=("shown_name", "last"),
alpha_info=("shown_name", get_alpha_info),
)
.pipe(process_alpha_gender)
.sort_index()
.reset_index()
.head()
)
print(out_pd)
Reference
The examples in this section have been adapted from the Polars
user guide.