Skip to content

Structs

While Struct is a unique feature in Polars, emulating its behavior in Pandas might initially feel unfamiliar. Nonetheless, I made an effort to replicate this functionality in Pandas to appreciate its elegance.

Setup

import pandas as pd
import polars as pl

data = {
    "Movie": ["Cars", "IT", "ET", "Cars", "Up", "IT", "Cars", "ET", "Up", "ET"],
    "Theatre": ["NE", "ME", "IL", "ND", "NE", "SD", "NE", "IL", "IL", "SD"],
    "Avg_Rating": [4.5, 4.4, 4.6, 4.3, 4.8, 4.7, 4.7, 4.9, 4.7, 4.6],
    "Count": [30, 27, 26, 29, 31, 28, 28, 26, 33, 26],
}

df_pl = pl.DataFrame(data)
print(df_pl)

shape: (10, 4)
┌───────┬─────────┬────────────┬───────┐
│ Movie ┆ Theatre ┆ Avg_Rating ┆ Count │
│ ---   ┆ ---     ┆ ---        ┆ ---   │
│ str   ┆ str     ┆ f64        ┆ i64   │
╞═══════╪═════════╪════════════╪═══════╡
│ Cars  ┆ NE      ┆ 4.5        ┆ 30    │
│ IT    ┆ ME      ┆ 4.4        ┆ 27    │
│ ET    ┆ IL      ┆ 4.6        ┆ 26    │
│ Cars  ┆ ND      ┆ 4.3        ┆ 29    │
│ …     ┆ …       ┆ …          ┆ …     │
│ Cars  ┆ NE      ┆ 4.7        ┆ 28    │
│ ET    ┆ IL      ┆ 4.9        ┆ 26    │
│ Up    ┆ IL      ┆ 4.7        ┆ 33    │
│ ET    ┆ SD      ┆ 4.6        ┆ 26    │
└───────┴─────────┴────────────┴───────┘

df_pd = pd.DataFrame(data)
print(df_pd)

  Movie Theatre  Avg_Rating  Count
0  Cars      NE         4.5     30
1    IT      ME         4.4     27
2    ET      IL         4.6     26
3  Cars      ND         4.3     29
4    Up      NE         4.8     31
5    IT      SD         4.7     28
6  Cars      NE         4.7     28
7    ET      IL         4.9     26
8    Up      IL         4.7     33
9    ET      SD         4.6     26

Encountering the Struct type

Nested Struct

out_pl = df_pl.select(pl.col("Theatre").value_counts(sort=True))
print(out_pl)

shape: (5, 1)
┌───────────┐
│ Theatre   │
│ ---       │
│ struct[2] │
╞═══════════╡
│ {"NE",3}  │
│ {"IL",3}  │
│ {"SD",2}  │
│ {"ME",1}  │
│ {"ND",1}  │
└───────────┘

out_pd = (
    df_pd.loc[:, ["Theatre"]]
    .value_counts()
    .reset_index()
    .assign(Theatre=lambda df_: df_.values.tolist())
    .drop(columns=["count"])
)
print(out_pd)

   Theatre
0  [IL, 3]
1  [NE, 3]
2  [SD, 2]
3  [ND, 1]
4  [ME, 1]

Unnested Struct

out_pl = df_pl.select(pl.col("Theatre").value_counts(sort=True)).unnest("Theatre")
print(out_pl)

shape: (5, 2)
┌─────────┬────────┐
│ Theatre ┆ counts │
│ ---     ┆ ---    │
│ str     ┆ u32    │
╞═════════╪════════╡
│ NE      ┆ 3      │
│ IL      ┆ 3      │
│ SD      ┆ 2      │
│ ME      ┆ 1      │
│ ND      ┆ 1      │
└─────────┴────────┘

out_pd = df_pd.loc[:, ["Theatre"]].value_counts()
print(out_pd)

Theatre
IL         3
NE         3
SD         2
ND         1
ME         1
Name: count, dtype: int64

Identifying duplicate rows

out_pl = df_pl.filter(pl.struct("Movie", "Theatre").is_duplicated())
print(out_pl)

shape: (4, 4)
┌───────┬─────────┬────────────┬───────┐
│ Movie ┆ Theatre ┆ Avg_Rating ┆ Count │
│ ---   ┆ ---     ┆ ---        ┆ ---   │
│ str   ┆ str     ┆ f64        ┆ i64   │
╞═══════╪═════════╪════════════╪═══════╡
│ Cars  ┆ NE      ┆ 4.5        ┆ 30    │
│ ET    ┆ IL      ┆ 4.6        ┆ 26    │
│ Cars  ┆ NE      ┆ 4.7        ┆ 28    │
│ ET    ┆ IL      ┆ 4.9        ┆ 26    │
└───────┴─────────┴────────────┴───────┘

out_pd = df_pd[df_pd.duplicated(["Movie", "Theatre"], keep=False)]
print(out_pd)

  Movie Theatre  Avg_Rating  Count
0  Cars      NE         4.5     30
2    ET      IL         4.6     26
6  Cars      NE         4.7     28
7    ET      IL         4.9     26

Multi-column ranking

out_pl = df_pl.with_columns(
    pl.struct("Count", "Avg_Rating")
    .rank("dense", descending=True)
    .over("Movie", "Theatre")
    .alias("Rank")
).filter(pl.struct("Movie", "Theatre").is_duplicated())
print(out_pl)

shape: (4, 5)
┌───────┬─────────┬────────────┬───────┬──────┐
│ Movie ┆ Theatre ┆ Avg_Rating ┆ Count ┆ Rank │
│ ---   ┆ ---     ┆ ---        ┆ ---   ┆ ---  │
│ str   ┆ str     ┆ f64        ┆ i64   ┆ u32  │
╞═══════╪═════════╪════════════╪═══════╪══════╡
│ Cars  ┆ NE      ┆ 4.5        ┆ 30    ┆ 1    │
│ ET    ┆ IL      ┆ 4.6        ┆ 26    ┆ 2    │
│ Cars  ┆ NE      ┆ 4.7        ┆ 28    ┆ 2    │
│ ET    ┆ IL      ┆ 4.9        ┆ 26    ┆ 1    │
└───────┴─────────┴────────────┴───────┴──────┘

def _create_rank_col(df_):
    to_be_ranked_cols = ["Count", "Avg_Rating"]
    return (
        df_.assign(rank=lambda df_: df_[to_be_ranked_cols].values.tolist())
        .groupby(["Movie", "Theatre"])
        .rank(ascending=False, method="dense")
        .drop(columns=to_be_ranked_cols)
    )


out_pd = df_pd.assign(rank=_create_rank_col)[
    df_pd.duplicated(["Movie", "Theatre"], keep=False)
]
print(out_pd)

  Movie Theatre  Avg_Rating  Count  rank
0  Cars      NE         4.5     30   1.0
2    ET      IL         4.6     26   2.0
6  Cars      NE         4.7     28   2.0
7    ET      IL         4.9     26   1.0

Reference

The examples in this section have been adapted from the Polars user guide.