-
-
Save r-brink/af7e217bd7c9f5d9a99a854cfad24120 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import json | |
import string | |
from timeit import timeit | |
import os | |
import numpy as np | |
import polars as pl | |
# Row count used for every generated mask and data frame.
n = 10**6

# Stored in each result row and embedded in the result CSV file names.
polars_version = pl.__version__

# Benchmark results are written below ./output; create it if it is missing.
os.makedirs("output", exist_ok=True)
def get_mask(n: int, perc: float):
    """Build a randomly ordered boolean mask of length ``n`` as a Polars Series.

    Args:
        n: Number of entries in the mask.
        perc: Fraction of entries set to ``True``; ``int(perc * n)`` entries
            are ``True`` and the rest are ``False``.

    Returns:
        A ``pl.Series`` wrapping the shuffled boolean array.

    Raises:
        ValueError: If ``perc`` is not within ``[0, 1]``. Such a value would
            otherwise produce a mask whose length differs from ``n``.
    """
    if not 0 <= perc <= 1:
        raise ValueError(f"perc must be between 0 and 1, got {perc!r}")
    num_true = int(perc * n)
    # Allocate the boolean array directly instead of going through a Python
    # list; this also keeps the dtype boolean when n == 0.
    mask = np.zeros(n, dtype=bool)
    mask[:num_true] = True
    # Shuffles in place using NumPy's global random state.
    np.random.shuffle(mask)
    return pl.Series(mask)
def generate_small_values(n):
    """Generate ``n`` random alphanumeric strings shorter than 12 characters.

    Each length is drawn with ``np.random.randint(1, 12)`` (so 1 to 11
    characters) and the characters are sampled from ASCII letters and digits
    using NumPy's global random state.

    Args:
        n: Number of strings to generate.

    Returns:
        A list of ``n`` strings.
    """
    # Build the alphabet once: it is loop-invariant, and passing an array
    # spares np.random.choice a list-to-array conversion on every call.
    alphabet = np.array(list(string.ascii_letters + string.digits))
    lengths = np.random.randint(1, 12, size=n)
    return ["".join(np.random.choice(alphabet, size=length)) for length in lengths]
def generate_medium_values(n):
    """Generate ``n`` random alphanumeric strings of 1 to 200 characters.

    Each length is drawn with ``np.random.randint(1, 201)`` and the characters
    are sampled from ASCII letters and digits using NumPy's global random
    state.

    Args:
        n: Number of strings to generate.

    Returns:
        A list of ``n`` strings.
    """
    # Build the alphabet once: it is loop-invariant, and passing an array
    # spares np.random.choice a list-to-array conversion on every call.
    alphabet = np.array(list(string.ascii_letters + string.digits))
    lengths = np.random.randint(1, 201, size=n)
    return ["".join(np.random.choice(alphabet, size=length)) for length in lengths]
def generate_large_values(n, target_length=500):
    """Generate ``n`` JSON-like strings of ``target_length`` characters each.

    Every value starts as a dict with a random ``name``, ``age`` and ``email``
    (the email always ends in ``@example.com``). A random ``message`` field is
    added to pad the serialised text, and the serialised result is then cut to
    ``target_length`` characters. The cut removes the closing characters, so
    the values are JSON-like text rather than parseable JSON.

    Args:
        n: Number of strings to generate.
        target_length: Length of every returned string (default 500).

    Returns:
        A list of ``n`` strings.
    """
    if n == 0:
        # np.max below cannot reduce an empty array, so handle this up front.
        return []

    all_chars = list(string.ascii_letters + string.digits + " ")
    name_lengths = np.random.randint(5, 10, n)
    email_lengths = np.random.randint(10, 20, n)
    age_values = np.random.randint(18, 101, n)

    # Upper bound on the characters one value can take from the pool, so a
    # single bulk draw is enough for all n values.
    max_length = np.max(name_lengths) + np.max(email_lengths) + target_length
    char_pool = np.random.choice(all_chars, size=n * max_length)

    json_objects = []
    char_index = 0
    for name_length, email_length, age in zip(name_lengths, email_lengths, age_values):
        name = "".join(char_pool[char_index : char_index + name_length])
        char_index += name_length
        email = (
            "".join(char_pool[char_index : char_index + email_length])
            + "@example.com"
        )
        char_index += email_length

        base_obj = {"name": name, "age": int(age), "email": email}
        base_json = json.dumps(base_obj)
        # Clamp once and use the same value for the slice and the cursor; an
        # unclamped negative length would move the pool cursor backwards.
        message_length = max(target_length - len(base_json) - 10, 0)
        base_obj["message"] = "".join(
            char_pool[char_index : char_index + message_length]
        )
        char_index += message_length

        # The slice is what pins every value to target_length characters.
        json_objects.append(json.dumps(base_obj)[:target_length])
    return json_objects
def _with_16_copies(df, column):
    """Return ``df`` plus columns ``s1`` .. ``s16``, each an alias of ``column``."""
    return df.with_columns([pl.col(column).alias(f"s{i}") for i in range(1, 17)])


# Masks selecting 5%, 50% and 95% of the rows.
print("generating masks")
mask05, mask50, mask95 = (get_mask(n, perc) for perc in (0.05, 0.5, 0.95))

# For each string size: a single-column frame, then a variant of it with
# sixteen extra copies of that column.
print("generating small data")
df_small = pl.DataFrame({"small": generate_small_values(n)})
df_small16 = _with_16_copies(df_small, "small")

print("generating medium data")
df_medium = pl.DataFrame({"medium": generate_medium_values(n)})
df_medium16 = _with_16_copies(df_medium, "medium")

print("generating large data")
df_large = pl.DataFrame({"large": generate_large_values(n)})
df_large16 = _with_16_copies(df_large, "large")
def test_filter_performance(df, csv_filename, polars_version=polars_version):
    """Time ``df.filter`` with each module-level mask and write the timings to CSV.

    For each of ``mask05``, ``mask50`` and ``mask95`` the filter is run 10
    times under ``timeit`` and the total time is recorded in one result row.

    Args:
        df: Frame to filter.
        csv_filename: Stem of the result file; the file is written to
            ``output/{csv_filename}_{polars_version}.csv``.
        polars_version: Version label stored in each row and in the file name.

    Returns:
        Whatever ``DataFrame.write_csv`` returns for the written file.
    """
    labelled_masks = (("0.05", mask05), ("0.50", mask50), ("0.95", mask95))

    rows = []
    for label, mask in labelled_masks:
        # The lambda is called right away by timeit, within this iteration.
        elapsed = timeit(lambda: df.filter(mask), number=10)
        rows.append({"Polars_Version": polars_version, "Mask": label, "Time": elapsed})

    # Write results to CSV
    destination = f"output/{csv_filename}_{polars_version}.csv"
    return pl.DataFrame(rows).write_csv(destination)
print("run benchmarks")
# Benchmark the 16-copy frames in all three cases so the measured data matches
# the "*16" result file names. The small and medium runs previously used the
# single-column frames, leaving df_small16 and df_medium16 unused.
test_filter_performance(df_small16, "filter_small16")
test_filter_performance(df_medium16, "filter_medium16")
test_filter_performance(df_large16, "filter_large16")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment