# junghoon-son/all_columns_search.py

## all_columns_search.py
import polars as pl


@pl.api.register_dataframe_namespace("all_columns_search")
class AllColumnsSearch:
    """DataFrame namespace that searches every column for a regex.

    Registered under the name ``all_columns_search``, so it is reached as
    ``df.all_columns_search`` on a polars DataFrame.
    """

    def __init__(self, df: pl.DataFrame) -> None:
        """Keep a reference to the DataFrame the namespace is attached to."""
        self._df = df

    def contains(self, regex: str, matches_only: bool = False) -> pl.DataFrame:
        """Report, per column, the values that match *regex*.

        Every column is cast to ``Utf8`` before matching, so the pattern is
        tested against the string form of each value.

        Args:
            regex: Pattern handed to ``Expr.str.contains``.
            matches_only: When true, columns without any match contribute
                no row. When false (the default), such columns get a row
                with an empty ``matches`` list and ``n == 0``.

        Returns:
            A DataFrame with one row per reported column and the columns
            ``column_name`` (Utf8), ``matches`` (List(Utf8)), ``n`` (UInt32)
            and ``percent`` (Float64). ``percent`` is ``n`` divided by the
            row count of the searched frame, i.e. a fraction that is not
            multiplied by 100. For a frame with zero rows that division is
            0 / 0, so the value is not meaningful. A frame with no columns
            yields an empty result with the same four columns.
        """
        row_count = self._df.shape[0]
        dfs = []
        # As much as loops might need to be avoided, in cases like this, loops are actually faster
        for col in self._df.columns:
            matched = self._df.select(pl.col(col).cast(pl.Utf8())).filter(
                pl.col(col).str.contains(regex)
            )
            # ``groupby`` was renamed to ``group_by`` in polars 0.19 and the
            # old name was later removed; use whichever this version offers.
            group_by = getattr(matched, "group_by", None) or matched.groupby
            # Grouping on a literal collapses all matches into a single row
            # (or into zero rows when nothing matched).
            row_df = group_by(pl.lit(col).alias("column_name")).agg(
                pl.col(col).alias("matches"),
                pl.col(col).len().alias("n"),
            )

            # Create an empty row, if there are no matches.
            if row_df.height == 0 and not matches_only:
                row_df = pl.DataFrame(
                    {
                        "column_name": col,
                        "matches": pl.Series("empty lists", [[]], dtype=pl.List),
                        "n": 0,
                    }
                )

            # Normalise dtypes so every per-column frame can be stacked.
            dfs.append(
                row_df.select(
                    pl.col("column_name").cast(pl.Utf8()),
                    pl.col("matches").cast(pl.List(pl.Utf8())),
                    pl.col("n").cast(pl.UInt32()),
                    (pl.col("n") / pl.lit(row_count))
                    .cast(pl.Float64)
                    .alias("percent"),
                )
            )

        if not dfs:
            # A frame without columns leaves nothing to concatenate, and
            # pl.concat rejects an empty list; return a typed empty result.
            return pl.DataFrame(
                [
                    pl.Series("column_name", [], dtype=pl.Utf8),
                    pl.Series("matches", [], dtype=pl.List(pl.Utf8)),
                    pl.Series("n", [], dtype=pl.UInt32),
                    pl.Series("percent", [], dtype=pl.Float64),
                ]
            )

        return pl.concat(dfs, how="vertical")
	import polars as pl


	# NOTE(review): this looks like a whitespace-mangled duplicate of the
	# AllColumnsSearch class defined earlier in this file; it registers the
	# same "all_columns_search" namespace again. Confirm whether it should
	# be removed.
	@pl.api.register_dataframe_namespace("all_columns_search")
	class AllColumnsSearch:
		"""DataFrame namespace that searches every column for a regex.

		Registered under the name ``all_columns_search``, so it is reached
		as ``df.all_columns_search`` on a polars DataFrame.
		"""

		def __init__(self, df: pl.DataFrame) -> None:
			"""Keep a reference to the DataFrame the namespace is attached to."""
			self._df = df

		def contains(self, regex: str, matches_only: bool = False) -> pl.DataFrame:
			"""Report, per column, the values that match *regex*.

			Every column is cast to ``Utf8`` before matching, so the pattern
			is tested against the string form of each value.

			Args:
				regex: Pattern handed to ``Expr.str.contains``.
				matches_only: When true, columns without any match
					contribute no row. When false (the default), such
					columns get a row with an empty ``matches`` list and
					``n == 0``.

			Returns:
				A DataFrame with one row per reported column and the
				columns ``column_name`` (Utf8), ``matches`` (List(Utf8)),
				``n`` (UInt32) and ``percent`` (Float64). ``percent`` is
				``n`` divided by the row count of the searched frame, i.e.
				a fraction that is not multiplied by 100. For a frame with
				zero rows that division is 0 / 0, so the value is not
				meaningful. A frame with no columns yields an empty result
				with the same four columns.
			"""
			row_count = self._df.shape[0]
			dfs = []
			# As much as loops might need to be avoided, in cases like this, loops are actually faster
			for col in self._df.columns:
				matched = self._df.select(pl.col(col).cast(pl.Utf8())).filter(
					pl.col(col).str.contains(regex)
				)
				# ``groupby`` was renamed to ``group_by`` in polars 0.19 and
				# the old name was later removed; use whichever exists.
				group_by = getattr(matched, "group_by", None) or matched.groupby
				# Grouping on a literal collapses all matches into a single
				# row (or into zero rows when nothing matched).
				row_df = group_by(pl.lit(col).alias("column_name")).agg(
					pl.col(col).alias("matches"),
					pl.col(col).len().alias("n"),
				)

				# Create an empty row, if there are no matches.
				if row_df.height == 0 and not matches_only:
					row_df = pl.DataFrame(
						{
							"column_name": col,
							"matches": pl.Series("empty lists", [[]], dtype=pl.List),
							"n": 0,
						}
					)

				# Normalise dtypes so every per-column frame can be stacked.
				dfs.append(
					row_df.select(
						pl.col("column_name").cast(pl.Utf8()),
						pl.col("matches").cast(pl.List(pl.Utf8())),
						pl.col("n").cast(pl.UInt32()),
						(pl.col("n") / pl.lit(row_count))
						.cast(pl.Float64)
						.alias("percent"),
					)
				)

			if not dfs:
				# A frame without columns leaves nothing to concatenate, and
				# pl.concat rejects an empty list; return a typed empty result.
				return pl.DataFrame(
					[
						pl.Series("column_name", [], dtype=pl.Utf8),
						pl.Series("matches", [], dtype=pl.List(pl.Utf8)),
						pl.Series("n", [], dtype=pl.UInt32),
						pl.Series("percent", [], dtype=pl.Float64),
					]
				)

			return pl.concat(dfs, how="vertical")