Skip to content

Instantly share code, notes, and snippets.

@junghoon-son
Created May 16, 2023 22:28
import polars as pl
@pl.api.register_dataframe_namespace("all_columns_search")
class AllColumnsSearch:
    """DataFrame namespace that searches every column for a regex.

    Registered under the name ``all_columns_search``, so it is reached as
    ``df.all_columns_search.contains(...)`` on a polars DataFrame.
    """

    def __init__(self, df: pl.DataFrame):
        # The DataFrame this namespace instance is attached to.
        self._df = df

    def contains(self, regex: str, matches_only: bool = False) -> pl.DataFrame:
        """Report, per column, the values that match ``regex``.

        Every column is cast to ``Utf8`` before matching, so non-string
        columns are searched via their string representation.

        Args:
            regex: Pattern handed to ``Expr.str.contains``.
            matches_only: When ``True``, columns without any match are left
                out of the result. When ``False`` (default), such columns
                get a row with an empty ``matches`` list and ``n == 0``.

        Returns:
            A DataFrame with one row per reported column and the columns:

            * ``column_name`` (Utf8): name of the searched column.
            * ``matches`` (List[Utf8]): the matching values.
            * ``n`` (UInt32): number of matching values.
            * ``percent`` (Float64): ``n`` divided by the frame's row
              count. This is a fraction; it is not multiplied by 100.

            If the frame has no columns, an empty DataFrame with this
            schema is returned.
        """
        row_count = self._df.shape[0]
        per_column_frames = []

        # The per-column loop is deliberate: the original author found it
        # faster than avoiding the loop for this kind of search.
        for col in self._df.columns:
            filtered = self._df.select(pl.col(col).cast(pl.Utf8())).filter(
                pl.col(col).str.contains(regex)
            )
            # `groupby` was renamed to `group_by` in later polars releases;
            # pick whichever one this installation provides.
            group_by = (
                filtered.group_by
                if hasattr(filtered, "group_by")
                else filtered.groupby
            )
            # Grouping on a literal collapses all matching values into a
            # single row keyed by the column's name.
            row_df = group_by(pl.lit(col).alias("column_name")).agg(
                pl.col(col).alias("matches"),
                pl.col(col).len().alias("n"),
            )

            # No match yields a zero-row frame; substitute a placeholder
            # row unless the caller asked for matching columns only.
            if row_df.height == 0 and not matches_only:
                row_df = pl.DataFrame(
                    {
                        "column_name": col,
                        "matches": pl.Series("empty lists", [[]], dtype=pl.List),
                        "n": 0,
                    }
                )

            # Normalise dtypes so every per-column frame can be stacked.
            per_column_frames.append(
                row_df.select(
                    pl.col("column_name").cast(pl.Utf8()),
                    pl.col("matches").cast(pl.List(pl.Utf8())),
                    pl.col("n").cast(pl.UInt32()),
                    (pl.col("n") / pl.lit(row_count))
                    .cast(pl.Float64)
                    .alias("percent"),
                )
            )

        # A frame without columns produces nothing to concatenate, and
        # `pl.concat` rejects an empty list; return a typed empty result.
        if not per_column_frames:
            return pl.DataFrame(
                schema={
                    "column_name": pl.Utf8,
                    "matches": pl.List(pl.Utf8),
                    "n": pl.UInt32,
                    "percent": pl.Float64,
                }
            )

        return pl.concat(per_column_frames, how="vertical")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment