Describe method for expressions
Polars does not currently provide a built-in expression that generates a `describe`-style summary. You can, however, obtain a similar result by combining ordinary expressions yourself, as the example below shows.
from datetime import date
import polars as pl
def to_describe(col: str, prefix: str = "", *, percentiles=(0.25, 0.5, 0.75)) -> list:
    """Build ``describe``-style summary expressions for one column.

    Args:
        col: Name of the column to summarise.
        prefix: Text prepended to every output column name. An empty
            (falsy) value falls back to ``f"{col}_"``.
        percentiles: Quantile levels (fractions between 0 and 1) to report
            between the minimum and the maximum. Defaults to the quartiles,
            which yields the ``25%``, ``50%`` and ``75%`` columns.

    Returns:
        A list of Polars expressions in the order count, null_count, mean,
        std, min, one entry per percentile, max. Each is aliased to
        ``f"{prefix}<statistic>"``.
    """
    prefix = prefix or f"{col}_"
    column = pl.col(col)

    # ``:g`` drops the trailing ".0" so 0.25 is labelled "25%", not "25.0%".
    quantile_exprs = [
        column.quantile(q).alias(f"{prefix}{q * 100:g}%") for q in percentiles
    ]

    return [
        # NOTE(review): whether ``count`` includes nulls appears to differ
        # between Polars releases -- confirm against the version in use.
        column.count().alias(f"{prefix}count"),
        column.is_null().sum().alias(f"{prefix}null_count"),
        column.mean().alias(f"{prefix}mean"),
        column.std().alias(f"{prefix}std"),
        column.min().alias(f"{prefix}min"),
        *quantile_exprs,
        column.max().alias(f"{prefix}max"),
    ]
# Sample data: two observations (the 5th and the 25th) in each of
# September, October and November 2023, with one missing value per
# numeric column.
df = pl.DataFrame(
    {
        "date": [
            date(2023, month, day)
            for month in (9, 10, 11)
            for day in (5, 25)
        ],
        "a": [1, 3, 2, 15, 10, None],
        "b": [None, 11, 13, 12, 115, 110],
    }
)
# Summarise columns "a" and "b" per calendar month, ordered by month.
out = (
    df.group_by(pl.col("date").dt.month().alias("month"))
    .agg([stat for name in ("a", "b") for stat in to_describe(name)])
    .sort("month")
)
print(out)
shape: (3, 19)
┌───────┬─────────┬──────────────┬────────┬───┬───────┬───────┬───────┬───────┐
│ month ┆ a_count ┆ a_null_count ┆ a_mean ┆ … ┆ b_25% ┆ b_50% ┆ b_75% ┆ b_max │
│ ---   ┆ ---     ┆ ---          ┆ ---    ┆   ┆ ---   ┆ ---   ┆ ---   ┆ ---   │
│ u32   ┆ u32     ┆ u32          ┆ f64    ┆   ┆ f64   ┆ f64   ┆ f64   ┆ i64   │
╞═══════╪═════════╪══════════════╪════════╪═══╪═══════╪═══════╪═══════╪═══════╡
│ 9     ┆ 2       ┆ 0            ┆ 2.0    ┆ … ┆ 11.0  ┆ 11.0  ┆ 11.0  ┆ 11    │
│ 10    ┆ 2       ┆ 0            ┆ 8.5    ┆ … ┆ 12.0  ┆ 13.0  ┆ 13.0  ┆ 13    │
│ 11    ┆ 2       ┆ 1            ┆ 10.0   ┆ … ┆ 110.0 ┆ 115.0 ┆ 115.0 ┆ 115   │
└───────┴─────────┴──────────────┴────────┴───┴───────┴───────┴───────┴───────┘