Skip to content

Instantly share code, notes, and snippets.

@orlp

orlp/bench.py Secret

Created March 26, 2024 13:11
Show Gist options
  • Save orlp/053617afe9f01b2c0de7e1b4cc57bdd9 to your computer and use it in GitHub Desktop.
Save orlp/053617afe9f01b2c0de7e1b4cc57bdd9 to your computer and use it in GitHub Desktop.
import polars as pl
import numpy as np
from timeit import timeit
# Fixed seed so every run benchmarks exactly the same synthetic data.
rng = np.random.default_rng(42)
n = 10_000_000

# Ages are uniform over 0..100 inclusive (endpoint=True), so the predicate
# `age < cutoff` keeps roughly `cutoff` percent of the rows.
ages = rng.integers(low=0, high=100, size=n, dtype=np.uint8, endpoint=True)
# Heights are float32 values drawn uniformly from [1, 2).
heights = rng.uniform(low=1, high=2, size=n).astype(np.float32)
# Boolean column that is True for about half of the rows.
rode_rollercoaster = rng.uniform(low=0, high=1, size=n) < 0.5

# Polars view of the same three columns, used by the Polars benchmarks.
df = pl.DataFrame(
    {
        "age": ages,
        "height": heights,
        "rode_rollercoaster": rode_rollercoaster,
    }
)
def bench(name, func):
    """Time ``func`` at every age cutoff and save the timings as CSV.

    ``func`` is called with a single integer ``age_cutoff`` for each value
    from 0 through 100. Each recorded time is the total number of seconds
    ``timeit`` measured for 10 consecutive calls (not a per-call average).

    The result is written to ``<name>.csv`` in the current working
    directory with two columns: ``age_cutoff`` and ``time``.
    """
    print(f"benchmarking {name}")

    cutoffs = range(0, 101)
    times = []
    for age_cutoff in cutoffs:
        # The lambda runs right away inside timeit, so it always sees the
        # cutoff of the current iteration.
        elapsed = timeit(lambda: func(age_cutoff), number=10)
        times.append(elapsed)

    out = pl.DataFrame({"age_cutoff": cutoffs, "time": times})
    out.write_csv(name + ".csv")
# Polars result files are tagged with the installed Polars version so that
# runs from different releases can be compared afterwards.
_polars_tag = "-polars-" + pl.__version__

# (output name, workload) pairs, executed in the order listed. Every workload
# takes the age cutoff as its only argument.
_benchmarks = [
    # Filter: keep only the rows whose age is below the cutoff.
    (
        "float-filter" + _polars_tag,
        lambda age_cutoff: df.select(pl.col.height.filter(pl.col.age < age_cutoff).mean()),
    ),
    (
        "bool-filter" + _polars_tag,
        lambda age_cutoff: df.select(pl.col.rode_rollercoaster.filter(pl.col.age < age_cutoff).mean()),
    ),
    (
        "float-filter-numpy",
        lambda age_cutoff: heights[ages < age_cutoff],
    ),
    (
        "bool-filter-numpy",
        lambda age_cutoff: rode_rollercoaster[ages < age_cutoff],
    ),
    # When/then: replace values with a constant where age is below the cutoff.
    (
        "float-whenthen" + _polars_tag,
        lambda age_cutoff: df.select(pl.when(pl.col.age < age_cutoff).then(1.0).otherwise(pl.col.height).mean()),
    ),
    (
        "bool-whenthen" + _polars_tag,
        lambda age_cutoff: df.select(pl.when(pl.col.age < age_cutoff).then(False).otherwise(pl.col.rode_rollercoaster).mean()),
    ),
    (
        "float-whenthen-numpy",
        lambda age_cutoff: np.where(ages < age_cutoff, 1.0, heights),
    ),
    (
        "bool-whenthen-numpy",
        lambda age_cutoff: np.where(ages < age_cutoff, False, rode_rollercoaster),
    ),
]

for _label, _workload in _benchmarks:
    bench(_label, _workload)
import polars as pl
import hvplot
from pathlib import Path
# Polars releases being compared. join_data() uses these both to locate the
# "<prefix>-polars-<version>.csv" timing files and to name the plotted series.
old_ver = "0.20.10"
new_ver = "0.20.16"
def join_data(prefix, on):
    """Combine the NumPy and both Polars timing CSVs for one benchmark.

    Reads ``<prefix>-polars-<new_ver>.csv``, ``<prefix>-polars-<old_ver>.csv``
    and ``<prefix>-numpy.csv`` from the current directory and joins them on
    the column ``on``.

    The returned frame keeps the joined columns (``time`` from NumPy, plus
    ``time_old`` and ``time_new`` produced by the join suffixes) and adds
    three copies under plot-friendly names: ``numpy``, ``polars-<old_ver>``
    and ``polars-<new_ver>``.
    """
    polars_new = pl.read_csv(f"{prefix}-polars-{new_ver}.csv")
    polars_old = pl.read_csv(f"{prefix}-polars-{old_ver}.csv")
    numpy_times = pl.read_csv(f"{prefix}-numpy.csv")

    # NumPy is the left-most frame, so its `time` column keeps its name while
    # the clashing Polars columns pick up the `_old` / `_new` suffixes.
    joined = numpy_times.join(polars_old, on=on, suffix="_old")
    joined = joined.join(polars_new, on=on, suffix="_new")

    return joined.with_columns(
        pl.col("time").alias("numpy"),
        pl.col("time_old").alias(f"polars-{old_ver}"),
        pl.col("time_new").alias(f"polars-{new_ver}"),
    )
# All charts are written as standalone HTML files under ./out.
Path("out").mkdir(parents=True, exist_ok=True)

# (benchmark prefix, x-axis label) for each chart, rendered in this order.
_charts = [
    ("float-filter", "age cutoff (selectivity %)"),
    ("bool-filter", "age cutoff (selectivity %)"),
    ("float-whenthen", "age cutoff (true %)"),
    ("bool-whenthen", "age cutoff (true %)"),
]

for _prefix, _xlabel in _charts:
    data = join_data(_prefix, "age_cutoff")
    plt = data.plot.line(
        x="age_cutoff",
        # One line per implementation: NumPy, old Polars, new Polars.
        y=["numpy", f"polars-{old_ver}", f"polars-{new_ver}"],
        xlabel=_xlabel,
        ylabel="runtime (s)",
        # Anchor the y-axis at zero; the upper bound is left automatic.
        ylim=(0, None),
    )
    hvplot.save(plt, f"out/{_prefix}-plot.html")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment