Skip to content

Strings

Setup

import pandas as pd
import polars as pl

Check for existence of a pattern

# Sample strings: mixed case, a literal "$" inside a word, and a missing value.
data1 = {"animal": ["Crab", "cat and dog", "rab$bit", None]}

# Polars version of the sample data.
df_pl = pl.DataFrame(data1)
print(df_pl)

shape: (4, 1)
┌─────────────┐
│ animal      │
│ ---         │
│ str         │
╞═════════════╡
│ Crab        │
│ cat and dog │
│ rab$bit     │
│ null        │
└─────────────┘

# pandas version of the same sample data.
df_pd = pd.DataFrame(data1)
print(df_pd)

        animal
0         Crab
1  cat and dog
2      rab$bit
3         None

# One boolean column per kind of check. Expressions passed as keyword
# arguments are named after the keyword, so no explicit .alias() is needed.
out_pl = df_pl.select(
    "animal",
    regex=pl.col("animal").str.contains("cat|bit"),
    # literal=True treats "$" as an ordinary character, not an end-of-string anchor.
    literal=pl.col("animal").str.contains("rab$", literal=True),
    starts_with=pl.col("animal").str.starts_with("rab"),
    ends_with=pl.col("animal").str.ends_with("dog"),
)
print(out_pl)

shape: (4, 5)
┌─────────────┬───────┬─────────┬─────────────┬───────────┐
│ animal      ┆ regex ┆ literal ┆ starts_with ┆ ends_with │
│ ---         ┆ ---   ┆ ---     ┆ ---         ┆ ---       │
│ str         ┆ bool  ┆ bool    ┆ bool        ┆ bool      │
╞═════════════╪═══════╪═════════╪═════════════╪═══════════╡
│ Crab        ┆ false ┆ false   ┆ false       ┆ false     │
│ cat and dog ┆ true  ┆ false   ┆ false       ┆ true      │
│ rab$bit     ┆ true  ┆ true    ┆ true        ┆ false     │
│ null        ┆ null  ┆ null    ┆ null        ┆ null      │
└─────────────┴───────┴─────────┴─────────────┴───────────┘

# pandas counterpart: every column is derived from the "animal" column.
out_pd = df_pd.assign(
    animal=lambda frame: frame["animal"],
    regex=lambda frame: frame["animal"].str.contains("cat|bit", regex=True),
    # regex=False treats "$" as an ordinary character.
    literal=lambda frame: frame["animal"].str.contains("rab$", regex=False),
    starts_with=lambda frame: frame["animal"].str.startswith("rab"),
    ends_with=lambda frame: frame["animal"].str.endswith("dog"),
)
print(out_pd)

        animal  regex literal starts_with ends_with
0         Crab  False   False       False     False
1  cat and dog   True   False       False      True
2      rab$bit   True    True        True     False
3         None   None    None        None      None

There's a slight difference in syntax between Polars and Pandas when it comes to the methods for checking whether each string element starts or ends with a given substring.

  • In Polars, you use pl.col(..).str.starts_with(..) and pl.col(..).str.ends_with(..).
  • In Pandas, the equivalent methods are pd.Series.str.startswith(..) and pd.Series.str.endswith(..).

Extract a pattern

# Sample URLs carrying a query string. Note that the second URL spells the
# key "candidat", so a pattern looking for "candidate=" finds nothing in it.
data2 = {
    "a": [
        "http://vote.com/ballon_dor?candidate=messi&ref=polars",
        "http://vote.com/ballon_dor?candidat=jorginho&ref=polars",
        "http://vote.com/ballon_dor?candidate=ronaldo&ref=polars",
    ]
}

# Polars version of the sample data.
df_pl = pl.DataFrame(data2)
print(df_pl)

shape: (3, 1)
┌───────────────────────────────────┐
│ a                                 │
│ ---                               │
│ str                               │
╞═══════════════════════════════════╡
│ http://vote.com/ballon_dor?candi… │
│ http://vote.com/ballon_dor?candi… │
│ http://vote.com/ballon_dor?candi… │
└───────────────────────────────────┘

# pandas version of the same sample data.
df_pd = pd.DataFrame(data2)
print(df_pd)

                                                   a
0  http://vote.com/ballon_dor?candidate=messi&ref...
1  http://vote.com/ballon_dor?candidat=jorginho&r...
2  http://vote.com/ballon_dor?candidate=ronaldo&r...

# Keep only capture group 1 (the text after "candidate=").
out_pl = df_pl.select(
    pl.col("a").str.extract(pattern=r"candidate=(\w+)", group_index=1),
)
print(out_pl)

shape: (3, 1)
┌─────────┐
│ a       │
│ ---     │
│ str     │
╞═════════╡
│ messi   │
│ null    │
│ ronaldo │
└─────────┘

# expand=False makes str.extract return a Series for the single capture
# group. The default (expand=True) returns a one-column DataFrame, and
# assigning that to a column only works through implicit coercion.
out_pd = df_pd.assign(
    a=lambda df_: df_.a.str.extract(r"candidate=(\w+)", expand=False)
)
print(out_pd)

         a
0    messi
1      NaN
2  ronaldo

Extract all occurrences of a pattern

# Sample strings that mix runs of digits with letters.
data3 = {"foo": ["123 bla 45 asd", "xyz 678 910t"]}

# Polars version of the sample data.
df_pl = pl.DataFrame(data3)
print(df_pl)

shape: (2, 1)
┌────────────────┐
│ foo            │
│ ---            │
│ str            │
╞════════════════╡
│ 123 bla 45 asd │
│ xyz 678 910t   │
└────────────────┘

# pandas version of the same sample data.
df_pd = pd.DataFrame(data3)
print(df_pd)

              foo
0  123 bla 45 asd
1    xyz 678 910t

# Collect every run of digits in each string into a list column.
out_pl = df_pl.select(
    extracted_nrs=pl.col("foo").str.extract_all(r"(\d+)"),
)
print(out_pl)

shape: (2, 1)
┌────────────────┐
│ extracted_nrs  │
│ ---            │
│ list[str]      │
╞════════════════╡
│ ["123", "45"]  │
│ ["678", "910"] │
└────────────────┘

out_pd = (
    df_pd["foo"]
    # One row per match, indexed by (original row label, "match" number).
    .str.extractall(r"(\d+)")
    # Discard the match counter so only the original row label remains.
    .reset_index(level="match", drop=True)
    # Turn the row label into a regular column named "index" ...
    .reset_index()
    # ... and gather all matches that belong to the same original row.
    .groupby("index")
    .agg(list)
    .rename(columns={0: "extracted_nrs"})
)
print(out_pd)

      extracted_nrs
index              
0         [123, 45]
1        [678, 910]

Replace a pattern

# Sample data: an integer id plus a text column mixing digits and letters.
data4 = {"id": [1, 2], "text": ["123abc", "abc456"]}

# Polars version of the sample data.
df_pl = pl.DataFrame(data4)
print(df_pl)

shape: (2, 2)
┌─────┬────────┐
│ id  ┆ text   │
│ --- ┆ ---    │
│ i64 ┆ str    │
╞═════╪════════╡
│ 1   ┆ 123abc │
│ 2   ┆ abc456 │
└─────┴────────┘

# pandas version of the same sample data.
df_pd = pd.DataFrame(data4)
print(df_pd)

   id    text
0   1  123abc
1   2  abc456

out_pl = df_pl.with_columns(
    # Regex replacement of the first match; overwrites the "text" column.
    pl.col("text").str.replace(r"abc\b", "ABC"),
    # Literal replacement of every "a", stored under the keyword's name.
    text_replace_all=pl.col("text").str.replace_all("a", "-", literal=True),
)
print(out_pl)

shape: (2, 3)
┌─────┬────────┬──────────────────┐
│ id  ┆ text   ┆ text_replace_all │
│ --- ┆ ---    ┆ ---              │
│ i64 ┆ str    ┆ str              │
╞═════╪════════╪══════════════════╡
│ 1   ┆ 123ABC ┆ 123-bc           │
│ 2   ┆ abc456 ┆ -bc456           │
└─────┴────────┴──────────────────┘

# assign() evaluates its keyword arguments in order, and each callable sees
# the columns produced by the previous ones. "text_replace_all" is therefore
# computed first, while "text" still holds the original strings. That lets
# both lambdas use the frame they are given instead of reaching out to the
# global df_pd. Column order is unaffected: "text" is replaced in place and
# "text_replace_all" is appended.
out_pd = df_pd.assign(
    # regex=False mirrors literal=True on the Polars side and does not
    # depend on the pandas version's default for `regex`.
    text_replace_all=lambda df_: df_.text.str.replace("a", "-", regex=False),
    # n=1 limits the regex replacement to the first match in each string.
    text=lambda df_: df_.text.str.replace(r"abc\b", "ABC", n=1, regex=True),
)
print(out_pd)

   id    text text_replace_all
0   1  123ABC           123-bc
1   2  abc456           -bc456

Reference

The examples in this section have been adapted from the Polars user guide.