Skip to content

Column selections

Setup

from datetime import date, datetime

import numpy as np
import pandas as pd
import polars as pl
import polars.selectors as cs

df_pl = pl.DataFrame(
    {
        "id": [9, 4, 2],
        "place": ["Mars", "Earth", "Saturn"],
        "date": pl.date_range(date(2022, 1, 1), date(2022, 1, 3), "1d", eager=True),
        "sales": [33.4, 2142134.1, 44.7],
        "has_people": [False, True, False],
        "logged_at": pl.datetime_range(
            datetime(2022, 12, 1), datetime(2022, 12, 1, 0, 0, 2), "1s", eager=True
        ),
    }
).with_row_count("rn")
print(df_pl)

shape: (3, 7)
┌─────┬─────┬────────┬────────────┬───────────┬────────────┬─────────────────────┐
│ rn  ┆ id  ┆ place  ┆ date       ┆ sales     ┆ has_people ┆ logged_at           │
│ --- ┆ --- ┆ ---    ┆ ---        ┆ ---       ┆ ---        ┆ ---                 │
│ u32 ┆ i64 ┆ str    ┆ date       ┆ f64       ┆ bool       ┆ datetime[μs]        │
╞═════╪═════╪════════╪════════════╪═══════════╪════════════╪═════════════════════╡
│ 0   ┆ 9   ┆ Mars   ┆ 2022-01-01 ┆ 33.4      ┆ false      ┆ 2022-12-01 00:00:00 │
│ 1   ┆ 4   ┆ Earth  ┆ 2022-01-02 ┆ 2142134.1 ┆ true       ┆ 2022-12-01 00:00:01 │
│ 2   ┆ 2   ┆ Saturn ┆ 2022-01-03 ┆ 44.7      ┆ false      ┆ 2022-12-01 00:00:02 │
└─────┴─────┴────────┴────────────┴───────────┴────────────┴─────────────────────┘

df_pd = (
    pd.DataFrame(
        {
            "id": [9, 4, 2],
            "place": ["Mars", "Earth", "Saturn"],
            "date": pd.date_range("2022-01-01", "2022-01-03"),
            "sales": [33.4, 2142134.1, 44.7],
            "has_people": [False, True, False],
            "logged_at": pd.date_range("2022-12-01", "2022-12-01 00:00:02", freq="S"),
        }
    )
    .rename_axis("rn")
    .reset_index()
)
print(df_pd)

   rn  id   place       date      sales  has_people           logged_at
0   0   9    Mars 2022-01-01       33.4       False 2022-12-01 00:00:00
1   1   4   Earth 2022-01-02  2142134.1        True 2022-12-01 00:00:01
2   2   2  Saturn 2022-01-03       44.7       False 2022-12-01 00:00:02

Expression expansion

Select all

out_pl = df_pl.select(pl.all())  # or df_pl.select(pl.col("*"))
print(out_pl)

shape: (3, 7)
┌─────┬─────┬────────┬────────────┬───────────┬────────────┬─────────────────────┐
│ rn  ┆ id  ┆ place  ┆ date       ┆ sales     ┆ has_people ┆ logged_at           │
│ --- ┆ --- ┆ ---    ┆ ---        ┆ ---       ┆ ---        ┆ ---                 │
│ u32 ┆ i64 ┆ str    ┆ date       ┆ f64       ┆ bool       ┆ datetime[μs]        │
╞═════╪═════╪════════╪════════════╪═══════════╪════════════╪═════════════════════╡
│ 0   ┆ 9   ┆ Mars   ┆ 2022-01-01 ┆ 33.4      ┆ false      ┆ 2022-12-01 00:00:00 │
│ 1   ┆ 4   ┆ Earth  ┆ 2022-01-02 ┆ 2142134.1 ┆ true       ┆ 2022-12-01 00:00:01 │
│ 2   ┆ 2   ┆ Saturn ┆ 2022-01-03 ┆ 44.7      ┆ false      ┆ 2022-12-01 00:00:02 │
└─────┴─────┴────────┴────────────┴───────────┴────────────┴─────────────────────┘

out_pd = df_pd.loc[:, :]  # or df_pd.iloc[:, :]
print(out_pd)

   rn  id   place       date      sales  has_people           logged_at
0   0   9    Mars 2022-01-01       33.4       False 2022-12-01 00:00:00
1   1   4   Earth 2022-01-02  2142134.1        True 2022-12-01 00:00:01
2   2   2  Saturn 2022-01-03       44.7       False 2022-12-01 00:00:02

Exclude

out_pl = df_pl.select(pl.all().exclude("logged_at", "rn"))
print(out_pl)

shape: (3, 5)
┌─────┬────────┬────────────┬───────────┬────────────┐
│ id  ┆ place  ┆ date       ┆ sales     ┆ has_people │
│ --- ┆ ---    ┆ ---        ┆ ---       ┆ ---        │
│ i64 ┆ str    ┆ date       ┆ f64       ┆ bool       │
╞═════╪════════╪════════════╪═══════════╪════════════╡
│ 9   ┆ Mars   ┆ 2022-01-01 ┆ 33.4      ┆ false      │
│ 4   ┆ Earth  ┆ 2022-01-02 ┆ 2142134.1 ┆ true       │
│ 2   ┆ Saturn ┆ 2022-01-03 ┆ 44.7      ┆ false      │
└─────┴────────┴────────────┴───────────┴────────────┘

out_pd = df_pd.drop(["logged_at", "rn"], axis="columns")
print(out_pd)

   id   place       date      sales  has_people
0   9    Mars 2022-01-01       33.4       False
1   4   Earth 2022-01-02  2142134.1        True
2   2  Saturn 2022-01-03       44.7       False

By multiple strings

out_pl = df_pl.select(pl.col("date", "logged_at").dt.to_string("%Y-%h-%d"))
print(out_pl)

shape: (3, 2)
┌─────────────┬─────────────┐
│ date        ┆ logged_at   │
│ ---         ┆ ---         │
│ str         ┆ str         │
╞═════════════╪═════════════╡
│ 2022-Jan-01 ┆ 2022-Dec-01 │
│ 2022-Jan-02 ┆ 2022-Dec-01 │
│ 2022-Jan-03 ┆ 2022-Dec-01 │
└─────────────┴─────────────┘

out_pd = df_pd.loc[:, ["date", "logged_at"]].assign(
    date=lambda df_: df_.date.dt.strftime("%Y-%h-%d"),
    logged_at=lambda df_: df_.logged_at.dt.strftime("%Y-%h-%d"),
)
print(out_pd)

          date    logged_at
0  2022-Jan-01  2022-Dec-01
1  2022-Jan-02  2022-Dec-01
2  2022-Jan-03  2022-Dec-01

If there are dozens of columns that need manipulation, I will use the following approach instead.

columns = df_pd.select_dtypes("datetime").columns
out_pd = df_pd.loc[:, columns].assign(
    **{col: lambda df_: df_[col].dt.strftime("%Y-%h-%d") for col in columns}
)
print(out_pd)

          date    logged_at
0  2022-Dec-01  2022-Dec-01
1  2022-Dec-01  2022-Dec-01
2  2022-Dec-01  2022-Dec-01

By regular expressions

out_pl = df_pl.select(pl.col("^.*(as|sa).*$"))
print(out_pl)

shape: (3, 2)
┌───────────┬────────────┐
│ sales     ┆ has_people │
│ ---       ┆ ---        │
│ f64       ┆ bool       │
╞═══════════╪════════════╡
│ 33.4      ┆ false      │
│ 2142134.1 ┆ true       │
│ 44.7      ┆ false      │
└───────────┴────────────┘

out_pd = df_pd.filter(regex="^.*(as|sa).*$")
print(out_pd)

       sales  has_people
0       33.4       False
1  2142134.1        True
2       44.7       False

By data type

out_pl = df_pl.select(pl.col(pl.Int64, pl.UInt32, pl.Boolean).n_unique())
print(out_pl)

shape: (1, 3)
┌─────┬─────┬────────────┐
│ rn  ┆ id  ┆ has_people │
│ --- ┆ --- ┆ ---        │
│ u32 ┆ u32 ┆ u32        │
╞═════╪═════╪════════════╡
│ 3   ┆ 3   ┆ 2          │
└─────┴─────┴────────────┘

out_pd = (
    df_pd.select_dtypes(["int64", "bool"]).agg(lambda s_: s_.unique().size).to_frame().T
)
print(out_pd.dtypes, end="\n" * 2)
print(out_pd)

rn            int64
id            int64
has_people    int64
dtype: object

   rn  id  has_people
0   3   3           2

Using selectors

selectors is a unique feature of Polars. It behaves similarly to a combination of df.select_dtypes() and df.filter() in Pandas.

By dtype

out_pl = df_pl.select(cs.integer(), cs.string())
print(out_pl)

shape: (3, 3)
┌─────┬─────┬────────┐
│ rn  ┆ id  ┆ place  │
│ --- ┆ --- ┆ ---    │
│ u32 ┆ i64 ┆ str    │
╞═════╪═════╪════════╡
│ 0   ┆ 9   ┆ Mars   │
│ 1   ┆ 4   ┆ Earth  │
│ 2   ┆ 2   ┆ Saturn │
└─────┴─────┴────────┘

out_pd = df_pd.select_dtypes(["int64", "object"])
print(out_pd.dtypes, end="\n" * 2)
print(out_pd)

rn        int64
id        int64
place    object
dtype: object

   rn  id   place
0   0   9    Mars
1   1   4   Earth
2   2   2  Saturn

Applying set operations

out_pl = df_pl.select(cs.numeric() - cs.first())
print(out_pl)

shape: (3, 2)
┌─────┬───────────┐
│ id  ┆ sales     │
│ --- ┆ ---       │
│ i64 ┆ f64       │
╞═════╪═══════════╡
│ 9   ┆ 33.4      │
│ 4   ┆ 2142134.1 │
│ 2   ┆ 44.7      │
└─────┴───────────┘

out_pd = df_pd.select_dtypes(np.number).iloc[:, 1:]
print(out_pd.dtypes, end="\n" * 2)
print(out_pd)

id         int64
sales    float64
dtype: object

   id      sales
0   9       33.4
1   4  2142134.1
2   2       44.7

By patterns and substrings

out_pl = df_pl.select(cs.contains("rn"), cs.matches(".*_.*"))
print(out_pl)

shape: (3, 3)
┌─────┬────────────┬─────────────────────┐
│ rn  ┆ has_people ┆ logged_at           │
│ --- ┆ ---        ┆ ---                 │
│ u32 ┆ bool       ┆ datetime[μs]        │
╞═════╪════════════╪═════════════════════╡
│ 0   ┆ false      ┆ 2022-12-01 00:00:00 │
│ 1   ┆ true       ┆ 2022-12-01 00:00:01 │
│ 2   ┆ false      ┆ 2022-12-01 00:00:02 │
└─────┴────────────┴─────────────────────┘

out_pd = df_pd[
    df_pd.filter(like="rn").columns.append(df_pd.filter(regex=".*_.*").columns)
]
print(out_pd)

   rn  has_people           logged_at
0   0       False 2022-12-01 00:00:00
1   1        True 2022-12-01 00:00:01
2   2       False 2022-12-01 00:00:02

Reference

The examples in this section have been adapted from the Polars user guide.