Last active
January 4, 2024 14:36
-
-
Save r-brink/18760e68f3253ce8efdbe7a606931b9e to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import time | |
import numpy as np | |
import pandas as pd | |
import polars as pl | |
# Benchmark configuration: rows per column and timed repetitions per operation.
SIZE = 100_000_000
N_RUNS = 10

# Seed NumPy's global generator so every run benchmarks the same values.
np.random.seed(42)

# Column "a" is drawn from a normal distribution and column "b" from
# np.random.random; "a" is generated first, which fixes the random stream.
df = pl.DataFrame(
    dict(
        a=np.random.normal(size=SIZE),
        b=np.random.random(size=SIZE),
    )
)

# The same data converted for the other two libraries under test.
pandas_df = df.to_pandas()
numpy_array = df.to_numpy()
def format_timing(seconds: float) -> str:
    """Render a duration given in seconds with a human-friendly unit.

    Args:
        seconds: Duration in seconds.

    Returns:
        The duration with three decimals, expressed in seconds when it is at
        least 1 s, in milliseconds when it is at least 1 ms, and in
        microseconds otherwise.
    """
    if seconds >= 1:
        return f"{seconds:.3f} s"
    # Inclusive lower bound: exactly 1 ms must read "1.000 ms", not "1000.000 μs".
    if seconds >= 0.001:
        return f"{seconds*1000:.3f} ms"
    return f"{seconds*1_000_000:.3f} μs"
def benchmark(data, func: callable, n_runs: int, name: str):
    """Time ``func(data)`` over ``n_runs`` calls, print a report, return the mean.

    Args:
        data: A Polars DataFrame, a Pandas DataFrame, or a NumPy array. The
            type only selects the library label and version shown in the
            report.
        func: Callable invoked as ``func(data)``; its result is discarded.
        n_runs: Number of timed calls.
        name: Label printed on the "Operation" line of the report.

    Returns:
        The mean duration of a single call, in seconds.

    Raises:
        TypeError: If ``data`` is none of the three supported types.
    """
    # Check the type of input data
    if isinstance(data, pl.DataFrame):
        data_type = "Polars DataFrame"
        version = pl.__version__
    elif isinstance(data, pd.DataFrame):
        data_type = "Pandas DataFrame"
        version = pd.__version__
    elif isinstance(data, np.ndarray):
        data_type = "NumPy Array"
        version = np.__version__
    else:
        raise TypeError(
            "The input data must be either a Polars DataFrame, a Pandas DataFrame, or a NumPy Array"
        )

    timings = []
    for _ in range(n_runs):
        # perf_counter is the clock meant for measuring intervals: unlike
        # time.time() it cannot jump when the system clock is adjusted.
        started = time.perf_counter()
        func(data)
        timings.append(time.perf_counter() - started)

    # Computed once and reused for both the report and the return value.
    mean_time = np.mean(timings)

    print(
        f"""
{data_type} version: {version}
Operation: {name}
----------------
{n_runs} runs took
avg per run: {format_timing(mean_time)},
stdev per run: {format_timing(np.std(timings))},
total time: {format_timing(np.sum(timings))}
"""
    )
    return mean_time
def cov(df: pl.DataFrame):
    """Select the covariance between columns "a" and "b" of a Polars frame."""
    covariance_expr = pl.cov("a", "b")
    return df.select(covariance_expr)
def corr(df: pl.DataFrame):
    """Select the correlation between columns "a" and "b" of a Polars frame."""
    correlation_expr = pl.corr("a", "b")
    return df.select(correlation_expr)
def pandas_corr(pandas_df: pd.DataFrame):
    """Return the pairwise column correlation matrix of a Pandas frame."""
    correlation_matrix = pandas_df.corr()
    return correlation_matrix
def pandas_cov(pandas_df: pd.DataFrame):
    """Return the pairwise column covariance matrix of a Pandas frame."""
    covariance_matrix = pandas_df.cov()
    return covariance_matrix
def numpy_corr(array: np.ndarray):
    """Return the correlation coefficient matrix of the first two columns."""
    first_column, second_column = array[:, 0], array[:, 1]
    return np.corrcoef(first_column, second_column)
def numpy_cov(array: np.ndarray):
    """Return the covariance matrix of the first two columns."""
    first_column, second_column = array[:, 0], array[:, 1]
    return np.cov(first_column, second_column)
# Run every benchmark in order and record its mean time per run (seconds),
# keyed by label. Functions are passed directly rather than through lambda
# wrappers so no extra Python call frame is included in each timed run, and
# each label is written once so the dict key and the printed name cannot drift.
benchmark_results = {
    label: benchmark(data=dataset, func=operation, n_runs=N_RUNS, name=label)
    for label, dataset, operation in (
        ("Corr with Polars", df, corr),
        ("Cov with Polars", df, cov),
        ("Corr with Pandas", pandas_df, pandas_corr),
        ("Cov with Pandas", pandas_df, pandas_cov),
        ("Corr with NumPy", numpy_array, numpy_corr),
        ("Cov with NumPy", numpy_array, numpy_cov),
    )
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment