Skip to content

List

Powerful List manipulation

Setup

import pandas as pd
import polars as pl

# Raw readings: one space-separated string per station. Tokens such as "E1"
# are non-numeric entries mixed in with the integer readings.
data = {
    "station": [f"Station {i}" for i in range(1, 6)],
    "temperatures": [
        "20 5 5 E1 7 13 19 9 6 20",
        "18 8 16 11 23 E2 8 E2 E2 E2 90 70 40",
        "19 24 E9 16 6 12 10 22",
        "E2 E0 15 7 8 10 E1 24 17 13 6",
        "14 8 E0 16 22 24 E1",
    ],
}

# Limit pandas to printing at most 3 items of any sequence; longer lists are
# elided with "..." in the pandas outputs on this page.
pd.set_option("display.max_seq_items", 3)

# Polars frame built from the raw dict; both columns are strings at this point.
df_pl = pl.DataFrame(data)
print(df_pl)

shape: (5, 2)
┌───────────┬───────────────────────────────────┐
│ station   ┆ temperatures                      │
│ ---       ┆ ---                               │
│ str       ┆ str                               │
╞═══════════╪═══════════════════════════════════╡
│ Station 1 ┆ 20 5 5 E1 7 13 19 9 6 20          │
│ Station 2 ┆ 18 8 16 11 23 E2 8 E2 E2 E2 90 7… │
│ Station 3 ┆ 19 24 E9 16 6 12 10 22            │
│ Station 4 ┆ E2 E0 15 7 8 10 E1 24 17 13 6     │
│ Station 5 ┆ 14 8 E0 16 22 24 E1               │
└───────────┴───────────────────────────────────┘

# pandas frame built from the same raw dict, for side-by-side comparison.
df_pd = pd.DataFrame(data)
print(df_pd)

     station                          temperatures
0  Station 1              20 5 5 E1 7 13 19 9 6 20
1  Station 2  18 8 16 11 23 E2 8 E2 E2 E2 90 70 40
2  Station 3                19 24 E9 16 6 12 10 22
3  Station 4         E2 E0 15 7 8 10 E1 24 17 13 6
4  Station 5                   14 8 E0 16 22 24 E1

Creating a List column

# Replace the raw string column with a list[str] column by splitting on
# single spaces; the keyword name keeps the column called "temperatures".
out_pl = df_pl.with_columns(
    temperatures=pl.col("temperatures").str.split(" "),
)
print(out_pl)

shape: (5, 2)
┌───────────┬──────────────────────┐
│ station   ┆ temperatures         │
│ ---       ┆ ---                  │
│ str       ┆ list[str]            │
╞═══════════╪══════════════════════╡
│ Station 1 ┆ ["20", "5", … "20"]  │
│ Station 2 ┆ ["18", "8", … "40"]  │
│ Station 3 ┆ ["19", "24", … "22"] │
│ Station 4 ┆ ["E2", "E0", … "6"]  │
│ Station 5 ┆ ["14", "8", … "E1"]  │
└───────────┴──────────────────────┘

# Replace the raw string column with Python lists by splitting on whitespace.
out_pd = df_pd.assign(
    temperatures=df_pd["temperatures"].str.split(),
)
print(out_pd)

     station       temperatures
0  Station 1    [20, 5, 5, ...]
1  Station 2   [18, 8, 16, ...]
2  Station 3  [19, 24, E9, ...]
3  Station 4  [E2, E0, 15, ...]
4  Station 5   [14, 8, E0, ...]

Operating on List columns

# Step 1: split the raw strings into list[str].
# Step 2: derive per-row list summaries from the split column:
#   top3     - first three entries
#   bottom_3 - last three entries
#   obs      - number of entries in the list
out_pl = df_pl.with_columns(
    temperatures=pl.col("temperatures").str.split(" "),
).with_columns(
    top3=pl.col("temperatures").list.head(3),
    bottom_3=pl.col("temperatures").list.tail(3),
    obs=pl.col("temperatures").list.len(),
)
print(out_pl)

shape: (5, 5)
┌───────────┬──────────────────────┬────────────────────┬────────────────────┬─────┐
│ station   ┆ temperatures         ┆ top3               ┆ bottom_3           ┆ obs │
│ ---       ┆ ---                  ┆ ---                ┆ ---                ┆ --- │
│ str       ┆ list[str]            ┆ list[str]          ┆ list[str]          ┆ u32 │
╞═══════════╪══════════════════════╪════════════════════╪════════════════════╪═════╡
│ Station 1 ┆ ["20", "5", … "20"]  ┆ ["20", "5", "5"]   ┆ ["9", "6", "20"]   ┆ 10  │
│ Station 2 ┆ ["18", "8", … "40"]  ┆ ["18", "8", "16"]  ┆ ["90", "70", "40"] ┆ 13  │
│ Station 3 ┆ ["19", "24", … "22"] ┆ ["19", "24", "E9"] ┆ ["12", "10", "22"] ┆ 8   │
│ Station 4 ┆ ["E2", "E0", … "6"]  ┆ ["E2", "E0", "15"] ┆ ["17", "13", "6"]  ┆ 11  │
│ Station 5 ┆ ["14", "8", … "E1"]  ┆ ["14", "8", "E0"]  ┆ ["22", "24", "E1"] ┆ 7   │
└───────────┴──────────────────────┴────────────────────┴────────────────────┴─────┘

from operator import itemgetter  # noqa: E402

# First assign: split the raw strings into Python lists.
# Second assign: the lambdas receive the frame that already holds the split
# lists, so each one slices or measures a list rather than a string.
out_pd = df_pd.assign(temperatures=df_pd["temperatures"].str.split()).assign(
    top3=lambda df_: df_["temperatures"].apply(itemgetter(slice(None, 3))),
    bottom_3=lambda df_: df_["temperatures"].apply(itemgetter(slice(-3, None))),
    obs=lambda df_: df_["temperatures"].apply(len),
)
print(out_pd)

     station       temperatures          top3      bottom_3  obs
0  Station 1    [20, 5, 5, ...]    [20, 5, 5]    [9, 6, 20]   10
1  Station 2   [18, 8, 16, ...]   [18, 8, 16]  [90, 70, 40]   13
2  Station 3  [19, 24, E9, ...]  [19, 24, E9]  [12, 10, 22]    8
3  Station 4  [E2, E0, 15, ...]  [E2, E0, 15]   [17, 13, 6]   11
4  Station 5   [14, 8, E0, ...]   [14, 8, E0]  [22, 24, E1]    7

Element-wise computation within Lists

# Per-element test run inside each list: a non-strict cast to Int64 yields
# null for tokens that are not integers, so is_null() flags the bad entries.
not_an_int = pl.element().cast(pl.Int64, strict=False).is_null()

# Summing the boolean flags per list gives the number of bad entries per row.
out_pl = df_pl.with_columns(
    errors=pl.col("temperatures").str.split(" ").list.eval(not_an_int).list.sum(),
)
print(out_pl)

shape: (5, 3)
┌───────────┬───────────────────────────────────┬────────┐
│ station   ┆ temperatures                      ┆ errors │
│ ---       ┆ ---                               ┆ ---    │
│ str       ┆ str                               ┆ u32    │
╞═══════════╪═══════════════════════════════════╪════════╡
│ Station 1 ┆ 20 5 5 E1 7 13 19 9 6 20          ┆ 1      │
│ Station 2 ┆ 18 8 16 11 23 E2 8 E2 E2 E2 90 7… ┆ 4      │
│ Station 3 ┆ 19 24 E9 16 6 12 10 22            ┆ 1      │
│ Station 4 ┆ E2 E0 15 7 8 10 E1 24 17 13 6     ┆ 3      │
│ Station 5 ┆ 14 8 E0 16 22 24 E1               ┆ 2      │
└───────────┴───────────────────────────────────┴────────┘

def eval_sum(v):
    """Count the entries of ``v`` that cannot be parsed as numbers.

    ``v`` is wrapped in a Series and converted with ``pd.to_numeric``;
    ``errors="coerce"`` turns unparseable entries into NaN, and the number
    of NaN values is returned.
    """
    as_numbers = pd.to_numeric(pd.Series(v), errors="coerce")
    missing = as_numbers.isna()
    return missing.sum()


# Split each raw string into tokens, then count the non-numeric tokens per row.
out_pd = df_pd.assign(
    errors=df_pd["temperatures"].str.split().apply(eval_sum),
)
print(out_pd)

     station                          temperatures  errors
0  Station 1              20 5 5 E1 7 13 19 9 6 20       1
1  Station 2  18 8 16 11 23 E2 8 E2 E2 E2 90 70 40       4
2  Station 3                19 24 E9 16 6 12 10 22       1
3  Station 4         E2 E0 15 7 8 10 E1 24 17 13 6       3
4  Station 5                   14 8 E0 16 22 24 E1       2

Row-wise computations

# Wide layout: one row per station, one integer column per day.
data2 = {
    "station": [f"Station {i}" for i in range(1, 11)],
    "day_1": [17, 11, 8, 22, 9, 21, 20, 8, 8, 17],
    "day_2": [15, 11, 10, 8, 7, 14, 18, 21, 15, 13],
    "day_3": [16, 15, 24, 24, 8, 23, 19, 23, 16, 10],
}

# Rebind df_pl to the wide table of daily readings (one integer column per day).
df_pl = pl.DataFrame(data2)
print(df_pl)

shape: (10, 4)
┌────────────┬───────┬───────┬───────┐
│ station    ┆ day_1 ┆ day_2 ┆ day_3 │
│ ---        ┆ ---   ┆ ---   ┆ ---   │
│ str        ┆ i64   ┆ i64   ┆ i64   │
╞════════════╪═══════╪═══════╪═══════╡
│ Station 1  ┆ 17    ┆ 15    ┆ 16    │
│ Station 2  ┆ 11    ┆ 11    ┆ 15    │
│ Station 3  ┆ 8     ┆ 10    ┆ 24    │
│ Station 4  ┆ 22    ┆ 8     ┆ 24    │
│ …          ┆ …     ┆ …     ┆ …     │
│ Station 7  ┆ 20    ┆ 18    ┆ 19    │
│ Station 8  ┆ 8     ┆ 21    ┆ 23    │
│ Station 9  ┆ 8     ┆ 15    ┆ 16    │
│ Station 10 ┆ 17    ┆ 13    ┆ 10    │
└────────────┴───────┴───────┴───────┘

# Rebind df_pd to the same wide table of daily readings, as a pandas frame.
df_pd = pd.DataFrame(data2)
print(df_pd)

      station  day_1  day_2  day_3
0   Station 1     17     15     16
1   Station 2     11     11     15
2   Station 3      8     10     24
3   Station 4     22      8     24
4   Station 5      9      7      8
5   Station 6     21     14     23
6   Station 7     20     18     19
7   Station 8      8     21     23
8   Station 9      8     15     16
9  Station 10     17     13     10

# Expression evaluated inside each row's list: the descending rank of every
# element divided by the number of elements in that list, rounded to 2 places.
# Both operands use pl.element() so the expression refers only to the list
# being evaluated, never to a column of the outer frame.
rank_pct = (pl.element().rank(descending=True) / pl.element().count()).round(2)


out_pl = df_pl.with_columns(
    # Gather every non-station column into one list per row.
    pl.concat_list(pl.all().exclude("station")).alias("all_temps")
).select(
    # Keep the original columns and drop the intermediate list.
    pl.all().exclude("all_temps"),
    # Apply the per-list rank expression to each row's list.
    pl.col("all_temps").list.eval(rank_pct, parallel=True).alias("temps_rank"),
)
print(out_pl)

shape: (10, 5)
┌────────────┬───────┬───────┬───────┬────────────────────┐
│ station    ┆ day_1 ┆ day_2 ┆ day_3 ┆ temps_rank         │
│ ---        ┆ ---   ┆ ---   ┆ ---   ┆ ---                │
│ str        ┆ i64   ┆ i64   ┆ i64   ┆ list[f64]          │
╞════════════╪═══════╪═══════╪═══════╪════════════════════╡
│ Station 1  ┆ 17    ┆ 15    ┆ 16    ┆ [0.33, 1.0, 0.67]  │
│ Station 2  ┆ 11    ┆ 11    ┆ 15    ┆ [0.83, 0.83, 0.33] │
│ Station 3  ┆ 8     ┆ 10    ┆ 24    ┆ [1.0, 0.67, 0.33]  │
│ Station 4  ┆ 22    ┆ 8     ┆ 24    ┆ [0.67, 1.0, 0.33]  │
│ …          ┆ …     ┆ …     ┆ …     ┆ …                  │
│ Station 7  ┆ 20    ┆ 18    ┆ 19    ┆ [0.33, 1.0, 0.67]  │
│ Station 8  ┆ 8     ┆ 21    ┆ 23    ┆ [1.0, 0.67, 0.33]  │
│ Station 9  ┆ 8     ┆ 15    ┆ 16    ┆ [1.0, 0.67, 0.33]  │
│ Station 10 ┆ 17    ┆ 13    ┆ 10    ┆ [0.33, 0.67, 1.0]  │
└────────────┴───────┴───────┴───────┴────────────────────┘

def cal_rank_pct(df_):
    """Return, per row, the list of descending percent ranks of the readings.

    Every column except ``station`` is ranked across the row (largest value
    gets rank 1), divided by the number of ranked columns, and rounded to two
    decimals. The result is a Series holding one list per row.
    """
    temps = df_.drop(columns=["station"])
    n_days = temps.columns.size
    ranks = temps.rank(axis="columns", ascending=False)
    pct = (ranks / n_days).round(2)
    return pct.agg(list, axis="columns")


# Attach the per-row percent ranks as a new list-valued column.
out_pd = df_pd.assign(temps_rank=cal_rank_pct(df_pd))
print(out_pd)

      station  day_1  day_2  day_3          temps_rank
0   Station 1     17     15     16   [0.33, 1.0, 0.67]
1   Station 2     11     11     15  [0.83, 0.83, 0.33]
2   Station 3      8     10     24   [1.0, 0.67, 0.33]
3   Station 4     22      8     24   [0.67, 1.0, 0.33]
4   Station 5      9      7      8   [0.33, 1.0, 0.67]
5   Station 6     21     14     23   [0.67, 1.0, 0.33]
6   Station 7     20     18     19   [0.33, 1.0, 0.67]
7   Station 8      8     21     23   [1.0, 0.67, 0.33]
8   Station 9      8     15     16   [1.0, 0.67, 0.33]
9  Station 10     17     13     10   [0.33, 0.67, 1.0]

Reference

The examples in this section have been adapted from the Polars user guide.