Skip to content

Instantly share code, notes, and snippets.

@r-brink
Last active January 4, 2024 14:36
Show Gist options
  • Save r-brink/18760e68f3253ce8efdbe7a606931b9e to your computer and use it in GitHub Desktop.
Save r-brink/18760e68f3253ce8efdbe7a606931b9e to your computer and use it in GitHub Desktop.
import time
from typing import Callable

import numpy as np
import pandas as pd
import polars as pl
# Benchmark configuration: number of rows in the synthetic data set and how
# many times each operation is timed.
SIZE = 100_000_000
N_RUNS = 10

# Seed NumPy's global RNG so the generated columns are reproducible.
np.random.seed(42)

# Column "a" is drawn before column "b"; with the seeded global RNG the draw
# order determines the values, so keep it as is.
df = pl.DataFrame(
    {
        "a": np.random.normal(size=SIZE),
        "b": np.random.random(size=SIZE),
    }
)

# The same data converted for the Pandas and NumPy benchmarks.
pandas_df = df.to_pandas()
numpy_array = df.to_numpy()
def format_timing(seconds: float) -> str:
    """Render a duration given in seconds as a short human-readable string.

    The unit is picked by magnitude: seconds for durations of at least 1 s,
    milliseconds for durations of at least 1 ms, and microseconds for
    everything smaller. The value is always printed with three decimals.

    Args:
        seconds: Duration in seconds.

    Returns:
        The formatted value followed by its unit, e.g. ``"1.500 s"``,
        ``"12.000 ms"`` or ``"3.000 μs"``.
    """
    if seconds >= 1:
        return f"{seconds:.3f} s"
    # Inclusive lower bound: exactly 1 ms must print as "1.000 ms" instead of
    # falling through to the microsecond branch as "1000.000 μs".
    if seconds >= 0.001:
        return f"{seconds * 1000:.3f} ms"
    return f"{seconds * 1_000_000:.3f} μs"
def benchmark(data, func: Callable[..., object], n_runs: int, name: str) -> float:
    """Time ``func(data)`` over several runs and print a summary report.

    Args:
        data: Input handed to ``func``. Must be a Polars DataFrame, a Pandas
            DataFrame or a NumPy array; the type selects the library label
            and version shown in the report.
        func: Callable invoked as ``func(data)`` once per run. Its return
            value is discarded.
        n_runs: Number of timed runs; must be at least 1.
        name: Label printed as the operation name.

    Returns:
        The mean duration of a single run, in seconds.

    Raises:
        TypeError: If ``data`` is not one of the supported container types.
        ValueError: If ``n_runs`` is smaller than 1.
    """
    # Work out which library is being benchmarked from the input type.
    if isinstance(data, pl.DataFrame):
        data_type = "Polars DataFrame"
        version = pl.__version__
    elif isinstance(data, pd.DataFrame):
        data_type = "Pandas DataFrame"
        version = pd.__version__
    elif isinstance(data, np.ndarray):
        data_type = "NumPy Array"
        version = np.__version__
    else:
        raise TypeError(
            "The input data must be either a Polars DataFrame, a Pandas DataFrame, or a NumPy Array"
        )

    # Without at least one run the statistics below would all be NaN.
    if n_runs < 1:
        raise ValueError(f"n_runs must be at least 1, got {n_runs}")

    timings = []
    for _ in range(n_runs):
        # perf_counter is monotonic and high-resolution; time.time() can jump
        # when the system clock is adjusted and is coarse on some platforms.
        start = time.perf_counter()
        func(data)
        timings.append(time.perf_counter() - start)

    mean_time = np.mean(timings)
    print(
        f"\n{data_type} version: {version}\n"
        f"Operation: {name}\n"
        "----------------\n"
        f"{n_runs} runs took\n"
        f"avg per run: {format_timing(mean_time)},\n"
        f"stdev per run: {format_timing(np.std(timings))},\n"
        f"total time: {format_timing(np.sum(timings))}\n"
    )
    return mean_time
def cov(df: pl.DataFrame):
    """Select the covariance of columns "a" and "b" from a Polars DataFrame.

    Returns the result of ``df.select`` applied to ``pl.cov("a", "b")``.
    """
    covariance_expr = pl.cov("a", "b")
    return df.select(covariance_expr)
def corr(df: pl.DataFrame):
    """Select the correlation of columns "a" and "b" from a Polars DataFrame.

    Returns the result of ``df.select`` applied to ``pl.corr("a", "b")``.
    """
    correlation_expr = pl.corr("a", "b")
    return df.select(correlation_expr)
def pandas_corr(pandas_df: pd.DataFrame):
    """Return the result of ``DataFrame.corr()`` for the given Pandas frame."""
    correlation_matrix = pandas_df.corr()
    return correlation_matrix
def pandas_cov(pandas_df: pd.DataFrame):
    """Return the result of ``DataFrame.cov()`` for the given Pandas frame."""
    covariance_matrix = pandas_df.cov()
    return covariance_matrix
def numpy_corr(array: np.ndarray):
    """Return ``np.corrcoef`` computed on column 0 and column 1 of ``array``."""
    first_column = array[:, 0]
    second_column = array[:, 1]
    return np.corrcoef(first_column, second_column)
def numpy_cov(array: np.ndarray):
    """Return ``np.cov`` computed on column 0 and column 1 of ``array``."""
    first_column = array[:, 0]
    second_column = array[:, 1]
    return np.cov(first_column, second_column)
# One row per benchmark: (label, input data, function under test).
# The label doubles as the result key and the printed operation name, so the
# two can no longer drift apart. Functions are passed directly rather than
# through pass-through lambdas, which only added an extra Python call frame
# to every timed run.
_BENCHMARK_CASES = [
    ("Corr with Polars", df, corr),
    ("Cov with Polars", df, cov),
    ("Corr with Pandas", pandas_df, pandas_corr),
    ("Cov with Pandas", pandas_df, pandas_cov),
    ("Corr with NumPy", numpy_array, numpy_corr),
    ("Cov with NumPy", numpy_array, numpy_cov),
]

# Maps each label to the mean seconds per run returned by ``benchmark``.
# Cases run in the order listed above.
benchmark_results = {
    label: benchmark(data=case_data, func=case_func, n_runs=N_RUNS, name=label)
    for label, case_data, case_func in _BENCHMARK_CASES
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment