Skip to content

Instantly share code, notes, and snippets.

@alexander-beedie
Last active February 10, 2023 12:52
Show Gist options
  • Save alexander-beedie/a732e4213709696c8d603294d2e1c25c to your computer and use it in GitHub Desktop.
Save alexander-beedie/a732e4213709696c8d603294d2e1c25c to your computer and use it in GitHub Desktop.
Benchmark of polars literal string replacement (`str.replace` / `str.replace_all` with `literal=True`)
from polars.testing.parametric import column, dataframes
from hypothesis.strategies import sampled_from
from random import randint, shuffle
from codetiming import Timer
import polars as pl
def create_test_data( strlen: int, replace_pct: int, replace_position: str ) -> list:
    """
    Build one column of synthetic strings for the replace benchmark.

    Every row starts as ``'a' * strlen``. Each of 10,000 base rows is then,
    with probability ``replace_pct`` percent, overwritten so that it contains
    exactly one ``"xxxy"`` token while keeping its length at ``strlen``. The
    base rows are finally repeated ten times, so the result always holds
    100,000 strings.

    Parameters
    ----------
    strlen
        Length of every generated string.
    replace_pct
        Percentage (0-100) of rows that should contain the token. Values
        <= 0 produce no tokens; 100 puts a token in every row without
        consuming random numbers for the row selection.
    replace_position
        Where the token is written: ``'front'``, ``'back'`` or ``'random'``.

    Returns
    -------
    list
        100,000 strings, each of length ``strlen``.

    Raises
    ------
    ValueError
        If ``replace_position`` is not one of the supported values, or if a
        token has to be inserted into strings shorter than the token.
    """
    token = "xxxy"
    if replace_position not in ( "front", "back", "random" ):
        raise ValueError(
            f"replace_position must be 'front', 'back' or 'random', got {replace_position!r}"
        )

    base = "a" * strlen
    rows = [base] * 10_000

    if replace_pct > 0:
        if strlen < len( token ):
            raise ValueError(
                f"strlen must be >= {len( token )} to hold the token, got {strlen}"
            )
        for idx in range( len( rows ) ):
            # short-circuit keeps the 100% case free of random draws
            if replace_pct == 100 or randint( 1, 100 ) <= replace_pct:
                if replace_position == "front":
                    rows[idx] = token + base[len( token ):]
                elif replace_position == "back":
                    # drop exactly len(token) chars so the row length is unchanged
                    rows[idx] = base[:-len( token )] + token
                else:
                    pos = randint( 0, strlen - len( token ) )
                    rows[idx] = base[:pos] + token + base[pos + len( token ):]

    # every path (including 0%) returns the same number of rows, so timings
    # for different replace percentages are measured on equally sized data
    return rows * 10
def create_test_dataframe( ncols: int, strlen: int, replace_pct: int, replace_position: str ):
    """
    Assemble a polars DataFrame of ``ncols`` independently generated string columns.

    Columns are named ``c0``, ``c1``, ... and each one is filled by its own
    call to ``create_test_data`` with the given ``strlen``, ``replace_pct``
    and ``replace_position``.
    """
    columns = {}
    for col_no in range( ncols ):
        columns[f"c{col_no}"] = create_test_data( strlen, replace_pct, replace_position )
    return pl.DataFrame( data=columns )
def benchmark(
    strlens: list[int] = None,
    replace_all: bool = False,
    replace_position: str = 'random',
    n_repeat: int = 10,
):
    """
    Time literal string replacement over a grid of string lengths and hit rates.

    For every replace percentage in 0, 5, ..., 100 and every string length, a
    fresh 5-column test frame is built and the replace expression is run
    ``n_repeat`` times inside a single timer; the recorded value is the mean
    time per run.

    Parameters
    ----------
    strlens
        String lengths to benchmark. ``None`` (or an empty sequence) selects
        the built-in set ranging from 8 to 4096.
    replace_all
        If True, benchmark ``str.replace_all( "x", "???" )``; otherwise
        benchmark ``str.replace( "xxxy", "???" )``. Both use ``literal=True``.
    replace_position
        Forwarded to ``create_test_dataframe`` to control where the token is
        placed in the test strings.
    n_repeat
        Number of timed runs per (strlen, replace_pct) combination.

    Returns
    -------
    A polars DataFrame with columns ``strlen``, ``replace_pct`` and
    ``time_taken`` (mean per run), sorted by ``strlen`` then ``replace_pct``.

    Raises
    ------
    ValueError
        If ``n_repeat`` is less than 1 (the mean would be undefined).
    """
    if n_repeat < 1:
        raise ValueError( f"n_repeat must be >= 1, got {n_repeat}" )

    lengths = strlens or (8, 16, 32, 33, 64, 128, 256, 512, 1024, 2048, 4096)

    # the expression depends only on `replace_all`, so build it once rather
    # than inside the timed loop
    replace_expr = (
        pl.all().str.replace_all( "x", "???", literal=True )
        if replace_all else
        pl.all().str.replace( "xxxy", "???", literal=True )
    )

    tm_total = 0.0
    bench_results = []
    for replace_pct in range( 0, 101, 5 ):
        for strlen in lengths:
            df = create_test_dataframe( 5, strlen, replace_pct, replace_position )
            # logger=None: the timer is only read back via `tm.last` below
            with Timer( text=f"{{:0.3f}},{strlen},{replace_pct}%", logger=None ) as tm:
                for _ in range( n_repeat ):
                    df.select( replace_expr )
            # NOTE(review): `tm.last` is taken to be the elapsed time of the
            # block above, in seconds - confirm against the codetiming docs
            tm_taken = tm.last / n_repeat
            tm_total += tm_taken
            bench_results.append( [strlen, replace_pct, tm_taken] )
        # progress marker, one line per completed replace percentage
        print( f"{replace_pct}%" )

    df_bench = pl.DataFrame(
        data=bench_results,
        schema={ "strlen":int, "replace_pct":int, "time_taken":float },
        # each entry of bench_results is one row; say so explicitly instead
        # of relying on orientation inference
        orient="row",
    )
    # total is the sum of the per-run means, not the overall wall-clock time
    print( f"-------\nTotal: {tm_total:.3f} secs" )
    return df_bench.sort( by=["strlen", "replace_pct"] )
if __name__ == "__main__":
    # Run the full benchmark grid only when executed as a script, so that
    # importing this module does not trigger the long-running benchmark.
    df = benchmark()
    print( df )
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment