-
-
Save alexander-beedie/a732e4213709696c8d603294d2e1c25c to your computer and use it in GitHub Desktop.
Benchmark polars literal string replace
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from polars.testing.parametric import column, dataframes | |
from hypothesis.strategies import sampled_from | |
from random import randint, shuffle | |
from codetiming import Timer | |
import polars as pl | |
def create_test_data(strlen: int, replace_pct: int, replace_position: str) -> list:
    """Build one column of benchmark strings.

    Every row starts out as ``'a' * strlen``. Roughly ``replace_pct`` percent
    of the rows then have the 4-character marker ``"xxxy"`` written over part
    of the string, so every row keeps exactly ``strlen`` characters.

    Args:
        strlen: Length of every generated string. Must be at least 4 (the
            marker length) whenever any row is to receive the marker.
        replace_pct: Approximate percentage of rows that receive the marker.
            Values <= 0 leave every row untouched; values >= 100 mark every
            row; anything in between is decided per row with ``randint``.
        replace_position: Where the marker is written: ``'front'``,
            ``'back'`` or ``'random'`` (a uniformly random offset).

    Returns:
        A list of 100_000 strings: 10_000 generated rows repeated 10 times.
        The row count is the same for every ``replace_pct`` so that timings
        taken over the data are comparable.

    Raises:
        ValueError: If ``replace_position`` is not one of the supported
            values, or if ``strlen`` is too short to hold the marker while
            ``replace_pct`` is positive.
    """
    marker = "xxxy"
    if replace_position not in ("front", "back", "random"):
        raise ValueError(
            "replace_position must be 'front', 'back' or 'random', "
            f"got {replace_position!r}"
        )

    rows = ["a" * strlen] * 10_000

    if replace_pct > 0:
        if strlen < len(marker):
            raise ValueError(
                f"strlen must be at least {len(marker)} to hold the marker, "
                f"got {strlen}"
            )
        # Largest offset at which the marker still fits inside the string.
        last_start = strlen - len(marker)
        for idx, row in enumerate(rows):
            if replace_pct >= 100 or randint(1, 100) <= replace_pct:
                if replace_position == "front":
                    pos = 0
                elif replace_position == "back":
                    pos = last_start
                else:
                    pos = randint(0, last_start)
                # Overwrite (not insert) so the string length stays strlen.
                rows[idx] = row[:pos] + marker + row[pos + len(marker):]

    # Always scale up to the same row count, including the 0% case.
    return rows * 10
def create_test_dataframe(ncols: int, strlen: int, replace_pct: int, replace_position: str):
    """Assemble a polars DataFrame of generated benchmark strings.

    The frame has ``ncols`` columns named ``c0``, ``c1``, ... Each column is
    produced by its own call to ``create_test_data`` with the same
    ``strlen``, ``replace_pct`` and ``replace_position`` arguments.

    Args:
        ncols: Number of columns to generate.
        strlen: Forwarded to ``create_test_data``.
        replace_pct: Forwarded to ``create_test_data``.
        replace_position: Forwarded to ``create_test_data``.

    Returns:
        A ``pl.DataFrame`` built from the generated columns.
    """
    columns = {}
    for col_idx in range(ncols):
        # One independent data-generation call per column.
        columns[f"c{col_idx}"] = create_test_data(strlen, replace_pct, replace_position)
    return pl.DataFrame(data=columns)
def benchmark(
    strlens: list[int] = None,
    replace_all: bool = False,
    replace_position: str = 'random',
    n_repeat: int = 10,
):
    """Time polars literal string replacement over a grid of inputs.

    For every replacement percentage in 0, 5, ..., 100 and every string
    length, a 5-column test frame is generated and the replace expression is
    applied to all columns ``n_repeat`` times inside a single timer. The
    mean time per repetition is recorded. Progress (the percentage just
    finished) and the final sum of recorded means are printed.

    Args:
        strlens: String lengths to benchmark. When ``None`` or empty, a
            default set ranging from 8 to 4096 is used.
        replace_all: If true, benchmark ``str.replace_all("x", "???")``;
            otherwise benchmark ``str.replace("xxxy", "???")``. Both are
            run with ``literal=True``.
        replace_position: Forwarded to ``create_test_dataframe``.
        n_repeat: Number of repetitions per measurement; must be >= 1.

    Returns:
        A ``pl.DataFrame`` with columns ``strlen``, ``replace_pct`` and
        ``time_taken`` (mean seconds per repetition), sorted by ``strlen``
        and then ``replace_pct``.

    Raises:
        ValueError: If ``n_repeat`` is less than 1.
    """
    if n_repeat < 1:
        raise ValueError(f"n_repeat must be at least 1, got {n_repeat}")

    # The expression does not depend on the data, so build it once and keep
    # its construction out of the timed region.
    if replace_all:
        replace_expr = pl.all().str.replace_all("x", "???", literal=True)
    else:
        replace_expr = pl.all().str.replace("xxxy", "???", literal=True)

    tm_total = 0.0
    bench_results = []
    for replace_pct in range(0, 101, 5):
        for strlen in strlens or (8, 16, 32, 33, 64, 128, 256, 512, 1024, 2048, 4096):
            df = create_test_dataframe(5, strlen, replace_pct, replace_position)
            # logger=None: the timer is only used for its measured value.
            with Timer(text=f"{{:0.3f}},{strlen},{replace_pct}%", logger=None) as tm:
                for _ in range(n_repeat):
                    df.select(replace_expr)
            tm_taken = tm.last / n_repeat
            tm_total += tm_taken
            bench_results.append([strlen, replace_pct, tm_taken])
        print(f"{replace_pct}%")

    df_bench = pl.DataFrame(
        data=bench_results,
        schema={"strlen": int, "replace_pct": int, "time_taken": float},
        # Each inner list is one result row; say so explicitly instead of
        # relying on polars inferring the orientation from the data.
        orient="row",
    )
    print(f"-------\nTotal: {tm_total:.3} secs")
    return df_bench.sort(by=["strlen", "replace_pct"])
# Run the benchmark with its default settings and show the sorted results.
df = benchmark()
print(df)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment