alexander-beedie/polars_str_bench.py Secret

## polars_str_bench.py
from polars.testing.parametric import column, dataframes
from hypothesis.strategies import sampled_from
from random import randint, shuffle
from codetiming import Timer
import polars as pl


def create_test_data( strlen: int, replace_pct: int, replace_position: str ) -> list:
    """
    Build one column of benchmark strings, some of which carry an "xxxy" marker.

    Every value starts as ``'a' * strlen``. Each of the 10,000 base rows is then
    selected with a probability of ``replace_pct`` percent (always selected at 100,
    never at 0) and has four of its characters overwritten by "xxxy". The base
    rows are finally repeated ten times.

    Parameters
    ----------
    strlen
        Length of each generated string. Strings shorter than the marker end up
        as just the marker when they are selected for replacement.
    replace_pct
        Percentage of rows that should contain the marker.
    replace_position
        Where the marker is written: 'front', 'back' or 'random'. Any other
        value leaves the selected rows unchanged.

    Returns
    -------
    list
        100,000 strings, regardless of ``replace_pct``.
    """
    marker = "xxxy"
    n_marker = len( marker )
    data = ['a' * strlen] * 10_000

    # At 0% nothing can change, so skip the loop (and draw no random numbers).
    if replace_pct != 0:
        for idx, d in enumerate( data ):
            if replace_pct == 100 or randint( 1, 100 ) <= replace_pct:
                if replace_position == 'front':
                    data[idx] = marker + d[n_marker:]
                elif replace_position == 'back':
                    # Drop exactly as many trailing chars as the marker adds, so
                    # the string length stays at strlen.
                    data[idx] = d[:-n_marker] + marker
                elif replace_position == 'random':
                    # Clamp the upper bound so short strings don't make randint raise.
                    pos = randint( 0, max( 0, strlen - n_marker ) )
                    data[idx] = d[:pos] + marker + d[pos + n_marker:]

    # Repeat on every path so that 0% is measured on the same number of rows
    # as all other percentages.
    return data * 10

def create_test_dataframe( ncols: int, strlen: int, replace_pct: int, replace_position: str ):
    """
    Assemble a DataFrame of ``ncols`` string columns named "c0", "c1", ...

    Each column comes from its own call to `create_test_data` with the given
    ``strlen``, ``replace_pct`` and ``replace_position``.
    """
    columns = {}
    for col_idx in range( ncols ):
        columns[f"c{col_idx}"] = create_test_data( strlen, replace_pct, replace_position )
    return pl.DataFrame( data=columns )

def benchmark(
    strlens: list[int] = None,
    replace_all: bool = False,
    replace_position: str = 'random',
    n_repeat: int = 10,
):
    """
    Time polars string replacement over a grid of string lengths and replace percentages.

    For every replace percentage in 0, 5, ..., 100 and every string length, a
    fresh 5-column test frame is built and the replacement is run ``n_repeat``
    times under a timer; the mean time per run is recorded. A progress line is
    printed after each percentage and the accumulated total at the end.

    Parameters
    ----------
    strlens
        String lengths to test. ``None`` (or an empty list) selects the default
        set from 8 to 4096.
    replace_all
        If True, benchmark ``str.replace_all`` of "x"; otherwise benchmark
        ``str.replace`` of "xxxy". Both use literal matching and "???" as the
        replacement value.
    replace_position
        Passed through to `create_test_dataframe`.
    n_repeat
        Number of timed repetitions per grid point; must be at least 1.

    Returns
    -------
    DataFrame with columns "strlen", "replace_pct" and "time_taken", sorted by
    "strlen" and then "replace_pct".

    Raises
    ------
    ValueError
        If ``n_repeat`` is less than 1.
    """
    if n_repeat < 1:
        raise ValueError( f"n_repeat must be >= 1, got {n_repeat}" )

    # The expression does not depend on the data, so build it once instead of
    # reconstructing it inside the timed loop.
    replace_expr = (
        pl.all().str.replace_all( "x", "???", literal=True )
        if replace_all else
        pl.all().str.replace( "xxxy", "???", literal=True )
    )

    tm_total = 0.0
    bench_results = []
    for replace_pct in range( 0, 101, 5 ):
        for strlen in strlens or (8, 16, 32, 33, 64, 128, 256, 512, 1024, 2048, 4096):
            df = create_test_dataframe( 5, strlen, replace_pct, replace_position )
            # logger=None keeps the timer silent; only the elapsed time is used.
            with Timer( text=f"{{:0.3f}},{strlen},{replace_pct}%", logger=None ) as tm:
                for _ in range( n_repeat ):
                    df.select( replace_expr )
            tm_taken = tm.last / n_repeat
            tm_total += tm_taken
            bench_results.append( [strlen, replace_pct, tm_taken] )

        print( f"{replace_pct}%" )

    # Each entry of bench_results is one row; say so explicitly rather than
    # relying on orientation inference.
    df_bench = pl.DataFrame(
        data=bench_results,
        schema={ "strlen":int, "replace_pct":int, "time_taken":float },
        orient="row",
    )
    # Fixed-point format, matching the 0.3f used for the per-run timer text.
    print( f"-------\nTotal: {tm_total:.3f} secs" )
    return df_bench.sort( by=["strlen", "replace_pct"] )


# Run the benchmark with its default settings and print the timing table;
# the result stays bound to the module-level name `df`.
print( df := benchmark() )
	from polars.testing.parametric import column, dataframes
	from hypothesis.strategies import sampled_from
	from random import randint, shuffle
	from codetiming import Timer
	import polars as pl


	def create_test_data( strlen: int, replace_pct: int, replace_position: str ) -> list:
	    """
	    Build one column of benchmark strings, some of which carry an "xxxy" marker.

	    Every value starts as ``'a' * strlen``. Each of the 10,000 base rows is then
	    selected with a probability of ``replace_pct`` percent (always selected at 100,
	    never at 0) and has four of its characters overwritten by "xxxy". The base
	    rows are finally repeated ten times.

	    Parameters
	    ----------
	    strlen
	        Length of each generated string. Strings shorter than the marker end up
	        as just the marker when they are selected for replacement.
	    replace_pct
	        Percentage of rows that should contain the marker.
	    replace_position
	        Where the marker is written: 'front', 'back' or 'random'. Any other
	        value leaves the selected rows unchanged.

	    Returns
	    -------
	    list
	        100,000 strings, regardless of ``replace_pct``.
	    """
	    marker = "xxxy"
	    n_marker = len( marker )
	    data = ['a' * strlen] * 10_000

	    # At 0% nothing can change, so skip the loop (and draw no random numbers).
	    if replace_pct != 0:
	        for idx, d in enumerate( data ):
	            if replace_pct == 100 or randint( 1, 100 ) <= replace_pct:
	                if replace_position == 'front':
	                    data[idx] = marker + d[n_marker:]
	                elif replace_position == 'back':
	                    # Drop exactly as many trailing chars as the marker adds, so
	                    # the string length stays at strlen.
	                    data[idx] = d[:-n_marker] + marker
	                elif replace_position == 'random':
	                    # Clamp the upper bound so short strings don't make randint raise.
	                    pos = randint( 0, max( 0, strlen - n_marker ) )
	                    data[idx] = d[:pos] + marker + d[pos + n_marker:]

	    # Repeat on every path so that 0% is measured on the same number of rows
	    # as all other percentages.
	    return data * 10

	def create_test_dataframe( ncols: int, strlen: int, replace_pct: int, replace_position: str ):
	    """
	    Assemble a DataFrame of ``ncols`` string columns named "c0", "c1", ...

	    Each column comes from its own call to `create_test_data` with the given
	    ``strlen``, ``replace_pct`` and ``replace_position``.
	    """
	    columns = {}
	    for col_idx in range( ncols ):
	        columns[f"c{col_idx}"] = create_test_data( strlen, replace_pct, replace_position )
	    return pl.DataFrame( data=columns )

	def benchmark(
	    strlens: list[int] = None,
	    replace_all: bool = False,
	    replace_position: str = 'random',
	    n_repeat: int = 10,
	):
	    """
	    Time polars string replacement over a grid of string lengths and replace percentages.

	    For every replace percentage in 0, 5, ..., 100 and every string length, a
	    fresh 5-column test frame is built and the replacement is run ``n_repeat``
	    times under a timer; the mean time per run is recorded. A progress line is
	    printed after each percentage and the accumulated total at the end.

	    Parameters
	    ----------
	    strlens
	        String lengths to test. ``None`` (or an empty list) selects the default
	        set from 8 to 4096.
	    replace_all
	        If True, benchmark ``str.replace_all`` of "x"; otherwise benchmark
	        ``str.replace`` of "xxxy". Both use literal matching and "???" as the
	        replacement value.
	    replace_position
	        Passed through to `create_test_dataframe`.
	    n_repeat
	        Number of timed repetitions per grid point; must be at least 1.

	    Returns
	    -------
	    DataFrame with columns "strlen", "replace_pct" and "time_taken", sorted by
	    "strlen" and then "replace_pct".

	    Raises
	    ------
	    ValueError
	        If ``n_repeat`` is less than 1.
	    """
	    if n_repeat < 1:
	        raise ValueError( f"n_repeat must be >= 1, got {n_repeat}" )

	    # The expression does not depend on the data, so build it once instead of
	    # reconstructing it inside the timed loop.
	    replace_expr = (
	        pl.all().str.replace_all( "x", "???", literal=True )
	        if replace_all else
	        pl.all().str.replace( "xxxy", "???", literal=True )
	    )

	    tm_total = 0.0
	    bench_results = []
	    for replace_pct in range( 0, 101, 5 ):
	        for strlen in strlens or (8, 16, 32, 33, 64, 128, 256, 512, 1024, 2048, 4096):
	            df = create_test_dataframe( 5, strlen, replace_pct, replace_position )
	            # logger=None keeps the timer silent; only the elapsed time is used.
	            with Timer( text=f"{{:0.3f}},{strlen},{replace_pct}%", logger=None ) as tm:
	                for _ in range( n_repeat ):
	                    df.select( replace_expr )
	            tm_taken = tm.last / n_repeat
	            tm_total += tm_taken
	            bench_results.append( [strlen, replace_pct, tm_taken] )

	        print( f"{replace_pct}%" )

	    # Each entry of bench_results is one row; say so explicitly rather than
	    # relying on orientation inference.
	    df_bench = pl.DataFrame(
	        data=bench_results,
	        schema={ "strlen":int, "replace_pct":int, "time_taken":float },
	        orient="row",
	    )
	    # Fixed-point format, matching the 0.3f used for the per-run timer text.
	    print( f"-------\nTotal: {tm_total:.3f} secs" )
	    return df_bench.sort( by=["strlen", "replace_pct"] )


	# Run the benchmark with its default settings and print the timing table;
	# the result stays bound to the name `df`.
	print( df := benchmark() )