Skip to content

Functions

Setup

import numpy as np
import pandas as pd
import polars as pl

# Seed NumPy's global RNG first so the "random" column is reproducible.
np.random.seed(42)

# Column-oriented sample data shared by the Polars and pandas examples.
data = dict(
    nrs=[1, 2, 3, 4, 5],
    names=["foo", "ham", "spam", "egg", "baz"],
    random=np.random.rand(5),
    groups=["A", "A", "B", "C", "B"],
)

# Build the Polars frame from the column-oriented dict and display it.
df_pl = pl.DataFrame(data=data)
print(df_pl)

shape: (5, 4)
┌─────┬───────┬──────────┬────────┐
│ nrs ┆ names ┆ random   ┆ groups │
│ --- ┆ ---   ┆ ---      ┆ ---    │
│ i64 ┆ str   ┆ f64      ┆ str    │
╞═════╪═══════╪══════════╪════════╡
│ 1   ┆ foo   ┆ 0.37454  ┆ A      │
│ 2   ┆ ham   ┆ 0.950714 ┆ A      │
│ 3   ┆ spam  ┆ 0.731994 ┆ B      │
│ 4   ┆ egg   ┆ 0.598658 ┆ C      │
│ 5   ┆ baz   ┆ 0.156019 ┆ B      │
└─────┴───────┴──────────┴────────┘

# Build the pandas frame from the same dict so both libraries start from identical data.
df_pd = pd.DataFrame(data=data)
print(df_pd)

   nrs names    random groups
0    1   foo  0.374540      A
1    2   ham  0.950714      A
2    3  spam  0.731994      B
3    4   egg  0.598658      C
4    5   baz  0.156019      B

Column naming

# `alias` gives each computed expression its own output column name;
# without it both expressions would keep the name "nrs".
out_pl = df_pl.select(
    [
        (pl.col("nrs") + 5).alias("nrs + 5"),
        (pl.col("nrs") - 5).alias("nrs - 5"),
    ]
)
print(out_pl)

shape: (5, 2)
┌─────────┬─────────┐
│ nrs + 5 ┆ nrs - 5 │
│ ---     ┆ ---     │
│ i64     ┆ i64     │
╞═════════╪═════════╡
│ 6       ┆ -4      │
│ 7       ┆ -3      │
│ 8       ┆ -2      │
│ 9       ┆ -1      │
│ 10      ┆ 0       │
└─────────┴─────────┘

# Map each new column name to a callable that derives it from the frame.
derived = {
    "nrs + 5": lambda frame: frame["nrs"] + 5,
    "nrs - 5": lambda frame: frame["nrs"] - 5,
}
# Add the derived columns, then keep only those (the original columns are left out).
out_pd = df_pd.assign(**derived)[list(derived)]
print(out_pd)

   nrs + 5  nrs - 5
0        6       -4
1        7       -3
2        8       -2
3        9       -1
4       10        0

Count unique values

pandas has no built-in method for approximating the number of unique values, so the pandas example below computes only the exact count.

# Exact distinct count of the "names" column.
exact_count = pl.col("names").n_unique().alias("unique")
# Approximate distinct count of the same column.
approx_count = pl.approx_n_unique("names").alias("unique_approx")

out_pl = df_pl.select(exact_count, approx_count)
print(out_pl)

shape: (1, 2)
┌────────┬───────────────┐
│ unique ┆ unique_approx │
│ ---    ┆ ---           │
│ u32    ┆ u32           │
╞════════╪═══════════════╡
│ 5      ┆ 5             │
└────────┴───────────────┘

# Named aggregation: count the distinct values of "names", then transpose so
# the column name becomes the row label and "unique" the column header.
out_pd = (
    df_pd[["names"]]
    .agg(unique=("names", lambda col: len(col.unique())))
    .T
)
print(out_pd)

       unique
names       5

Conditionals

# when/then/otherwise: True where "nrs" exceeds 2, False everywhere else.
conditional = (
    pl.when(pl.col("nrs") > 2)
    .then(pl.lit(True))
    .otherwise(pl.lit(False))
    .alias("conditional")
)
out_pl = df_pl.select(pl.col("nrs"), conditional)
print(out_pl)

shape: (5, 2)
┌─────┬─────────────┐
│ nrs ┆ conditional │
│ --- ┆ ---         │
│ i64 ┆ bool        │
╞═════╪═════════════╡
│ 1   ┆ false       │
│ 2   ┆ false       │
│ 3   ┆ true        │
│ 4   ┆ true        │
│ 5   ┆ true        │
└─────┴─────────────┘

# Keep only "nrs", then add the flag column; `np.where` plays the role of
# Polars' when/then/otherwise.
out_pd = df_pd[["nrs"]].assign(
    conditional=lambda frame: np.where(frame["nrs"] > 2, True, False)
)
print(out_pd)

   nrs  conditional
0    1        False
1    2        False
2    3         True
3    4         True
4    5         True

Reference

The examples in this section have been adapted from the Polars user guide.