Skip to content

Instantly share code, notes, and snippets.

@r-brink
Last active February 5, 2024 08:16
Show Gist options
  • Save r-brink/af7e217bd7c9f5d9a99a854cfad24120 to your computer and use it in GitHub Desktop.
Save r-brink/af7e217bd7c9f5d9a99a854cfad24120 to your computer and use it in GitHub Desktop.
import json
import string
from timeit import timeit
import os
import numpy as np
import polars as pl
# Benchmark results are written as CSV files under ./output; create it up front.
os.makedirs("output", exist_ok=True)
# Version string of the installed polars; recorded in every result row and
# embedded in the output file names.
polars_version = pl.__version__
# Number of rows in every generated mask and DataFrame.
n = 1_000_000
def get_mask(n: int, perc: float):
    """Return a randomly shuffled boolean mask of length ``n`` as a polars Series.

    Exactly ``int(perc * n)`` entries are ``True``; their positions are
    randomised with ``np.random.shuffle`` (global NumPy random state).

    Args:
        n: Length of the mask; must be non-negative.
        perc: Fraction of entries that should be ``True``, in ``[0, 1]``.

    Returns:
        A ``pl.Series`` built from the shuffled boolean array.

    Raises:
        ValueError: If ``n`` is negative or ``perc`` lies outside ``[0, 1]``
            (either would otherwise yield a mask whose length is not ``n``).
    """
    if n < 0:
        raise ValueError(f"n must be non-negative, got {n}")
    if not 0.0 <= perc <= 1.0:
        raise ValueError(f"perc must be within [0, 1], got {perc}")

    num_true = int(perc * n)
    # Allocate the boolean array directly instead of materialising an
    # n-element Python list first; this also keeps the dtype boolean when
    # n == 0 (``np.array([])`` would default to float64).
    mask = np.zeros(n, dtype=bool)
    mask[:num_true] = True
    np.random.shuffle(mask)
    return pl.Series(mask)
def _random_alnum_strings(n, min_len, max_len):
    """Return ``n`` random ASCII alphanumeric strings, each ``min_len``..``max_len`` chars long."""
    # Build the alphabet once: it is loop-invariant, and handing NumPy a
    # ready-made array avoids re-converting a 62-element list on every draw.
    alphabet = np.array(list(string.ascii_letters + string.digits))
    # ``randint``'s upper bound is exclusive, hence ``max_len + 1``.
    lengths = np.random.randint(min_len, max_len + 1, size=n)
    return ["".join(np.random.choice(alphabet, size=length)) for length in lengths]


def generate_small_values(n):
    """Generate ``n`` random alphanumeric strings of 1 to 11 characters.

    All characters are ASCII, so every value is shorter than 12 bytes.

    Args:
        n: Number of strings to generate.

    Returns:
        A list of ``n`` ``str`` values.
    """
    return _random_alnum_strings(n, 1, 11)


def generate_medium_values(n):
    """Generate ``n`` random alphanumeric strings of 1 to 200 characters.

    All characters are ASCII, so every value is between 1 and 200 bytes.

    Args:
        n: Number of strings to generate.

    Returns:
        A list of ``n`` ``str`` values.
    """
    return _random_alnum_strings(n, 1, 200)
def generate_large_values(n, target_length=500):
    """Generate ``n`` JSON strings of exactly ``target_length`` characters.

    Each value is ``json.dumps`` of an object with the keys ``name``
    (5-9 random chars), ``age`` (18-100), ``email`` (10-19 random chars plus
    ``"@example.com"``) and ``message`` (random filler sized so that the
    serialised object is ``target_length`` characters long). Every character
    is ASCII and needs no JSON escaping, so characters equal bytes.

    If ``target_length`` is too small to hold the base object, the message is
    empty and the serialised text is cut to ``target_length`` characters (the
    result is then not valid JSON).

    Args:
        n: Number of strings to generate.
        target_length: Desired length of every string (default 500).

    Returns:
        A list of ``n`` ``str`` values.
    """
    if n == 0:
        # ``np.max`` below would raise on empty arrays.
        return []

    all_chars = np.array(list(string.ascii_letters + string.digits + " "))
    name_lengths = np.random.randint(5, 10, n)
    email_lengths = np.random.randint(10, 20, n)
    age_values = np.random.randint(18, 101, n)

    # Draw all random characters in one call; each row consumes at most
    # ``max_length`` of them. NOTE: the pool holds n * max_length characters
    # (up to 528 per row with the defaults), so memory grows quickly with n.
    max_length = np.max(name_lengths) + np.max(email_lengths) + target_length
    char_pool = np.random.choice(all_chars, size=n * max_length)

    # Characters json.dumps adds for an empty message field: , "message": ""
    message_overhead = len(', "message": ""')

    json_objects = []
    char_index = 0
    rows = zip(name_lengths.tolist(), email_lengths.tolist(), age_values.tolist())
    for name_len, email_len, age in rows:
        name = "".join(char_pool[char_index : char_index + name_len])
        char_index += name_len
        email = (
            "".join(char_pool[char_index : char_index + email_len])
            + "@example.com"
        )
        char_index += email_len

        base_obj = {"name": name, "age": age, "email": email}
        base_len = len(json.dumps(base_obj))
        # Clamp at zero so the pool cursor never moves backwards when the
        # base object alone already exceeds ``target_length``.
        message_length = max(target_length - base_len - message_overhead, 0)
        message = "".join(char_pool[char_index : char_index + message_length])
        char_index += message_length

        base_obj["message"] = message
        # The slice only truncates when ``target_length`` is too small.
        json_objects.append(json.dumps(base_obj)[:target_length])
    return json_objects
# Boolean row masks that keep 5%, 50% and 95% of the n rows respectively.
print("generating masks")
mask05 = get_mask(n, 0.05)
mask50 = get_mask(n, 0.5)
mask95 = get_mask(n, 0.95)

# For each string size class, build a single-column frame plus a wide frame
# that adds 16 aliased copies (s1..s16) of that column.
# NOTE(review): the wide frames therefore have 17 columns (the original plus
# 16 copies) — confirm whether exactly 16 was intended.
print("generating small data")
df_small = pl.DataFrame({"small": generate_small_values(n)})
df_small16 = df_small.with_columns(
    [pl.col("small").alias(f"s{idx}") for idx in range(1, 17)]
)

print("generating medium data")
df_medium = pl.DataFrame({"medium": generate_medium_values(n)})
df_medium16 = df_medium.with_columns(
    [pl.col("medium").alias(f"s{idx}") for idx in range(1, 17)]
)

print("generating large data")
df_large = pl.DataFrame({"large": generate_large_values(n)})
df_large16 = df_large.with_columns(
    [pl.col("large").alias(f"s{idx}") for idx in range(1, 17)]
)
def test_filter_performance(df, csv_filename, polars_version=polars_version):
    """Time ``df.filter`` against the three module-level masks and save a CSV.

    Each mask (``mask05``, ``mask50``, ``mask95``) is timed with ``timeit``
    using ``number=10``, so ``Time`` is the total for 10 filter calls.

    Args:
        df: DataFrame to filter; its height must match the masks.
        csv_filename: Stem of the output file name.
        polars_version: Version label stored in every row and appended to
            the file name; defaults to the installed polars version.

    Returns:
        Whatever ``DataFrame.write_csv`` returns for the path
        ``output/{csv_filename}_{polars_version}.csv``.
    """
    labelled_masks = (
        ("0.05", mask05),
        ("0.50", mask50),
        ("0.95", mask95),
    )

    rows = []
    for label, mask in labelled_masks:
        # Bind the current mask as a default so the timed callable is self-contained.
        elapsed = timeit(lambda mask=mask: df.filter(mask), number=10)
        rows.append(
            {
                "Polars_Version": polars_version,
                "Mask": label,
                "Time": elapsed,
            }
        )

    # Write results to CSV
    out_path = f"output/{csv_filename}_{polars_version}.csv"
    return pl.DataFrame(rows).write_csv(out_path)
print("run benchmarks")
# Benchmark the wide frames (source column plus 16 aliased copies) for every
# size class, so the data matches the "...16" output names. Previously the
# small and medium runs used the single-column frames while the large run
# used the wide one.
test_filter_performance(df_small16, "filter_small16")
test_filter_performance(df_medium16, "filter_medium16")
test_filter_performance(df_large16, "filter_large16")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment