Skip to content

Instantly share code, notes, and snippets.

@alexhornbake
Created December 7, 2023 08:54
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save alexhornbake/89b2b67037640fa8377bfab95303b7ec to your computer and use it in GitHub Desktop.
Save alexhornbake/89b2b67037640fa8377bfab95303b7ec to your computer and use it in GitHub Desktop.
Cost of converting pandas DataFrames to and from polars in order to join them
# mprof run polars_test.py
# mprof: Sampling memory every 0.1s
# running new process
# running as a Python program...
# 1701938865.155861 - starting
# 1701938930.560897 - time to generate dataframes: 65.40504503250122
# 1701938930.5609741 - starting pandas join
# Filename: polars_test.py
# Line # Mem usage Increment Occurrences Line Contents
# =============================================================
# 20 743.5 MiB 743.5 MiB 1 @profile
# 21 def pandas_join(df1, df2):
# 22 1242.6 MiB 499.1 MiB 1 return df1.merge(df2, left_on=['string1', 'string2'], right_on=['string1', 'string2'], how='left')
# 1701938932.990345 - time to pandas join: 2.4293649196624756
# 1701938932.9903831 - starting polars convert
# Filename: polars_test.py
# Line # Mem usage Increment Occurrences Line Contents
# =============================================================
# 25 1204.7 MiB 1204.7 MiB 1 @profile
# 26 def polars_convert(df1, df2):
# 27 1190.5 MiB -14.2 MiB 1 pl_df1 = pl.from_pandas(df1)
# 28 1272.8 MiB 82.2 MiB 1 pl_df2 = pl.from_pandas(df2)
# 29 1272.8 MiB 0.0 MiB 1 return pl_df1, pl_df2
# 1701938933.733557 - time to polars convert: 0.7436940670013428
# 1701938933.734706 - starting polars join
# Filename: polars_test.py
# Line # Mem usage Increment Occurrences Line Contents
# =============================================================
# 32 1264.8 MiB 1264.8 MiB 1 @profile
# 33 def polars_join(pl_df1, pl_df2):
# 34 1237.6 MiB -27.2 MiB 1 return pl_df1.join(pl_df2, on=['string1', 'string2'], how='left')
# 1701938933.9606578 - time to polars join: 0.22594881057739258
# 1701938933.960665 - starting pandas convert
# Filename: polars_test.py
# Line # Mem usage Increment Occurrences Line Contents
# =============================================================
# 37 1237.7 MiB 1237.7 MiB 1 @profile
# 38 def pandas_convert(pl_df1, pl_df2):
# 39 1381.2 MiB 143.5 MiB 1 df1 = pl_df1.to_pandas()
# 40 1158.6 MiB -222.6 MiB 1 df2 = pl_df2.to_pandas()
# 41 1158.6 MiB 0.0 MiB 1 return df1, df2
# 1701938935.944544 - time to pandas convert: 1.9838781356811523
import pandas as pd
import polars as pl
from faker import Faker
from memory_profiler import memory_usage, profile
import time
def generate_dataframe(num_rows, seed=None):
    """Build a shuffled pandas DataFrame of fake string and integer columns.

    The frame has five columns ``string0``..``string4`` holding 16-character
    random strings and five columns ``int0``..``int4`` holding random
    integers requested with ``min=0, max=1000``.

    Args:
        num_rows: Number of rows to generate.
        seed: Optional seed. When given, it seeds both the Faker instance and
            the row shuffle, so repeated calls with the same seed produce the
            same frame. When ``None``, the output is not reproducible.

    Returns:
        A ``pandas.DataFrame`` with ``num_rows`` rows in shuffled order.
    """
    # Faker's first positional parameter is the locale, not a seed, so
    # ``Faker(seed)`` leaves the generator unseeded. Seed the instance
    # explicitly so that equal seeds really yield equal data.
    fake = Faker()
    if seed is not None:
        fake.seed_instance(seed)

    data = {
        f'string{i}': [fake.pystr(min_chars=16, max_chars=16) for _ in range(num_rows)]
        for i in range(5)
    }
    data.update({
        f'int{i}': [fake.random_int(min=0, max=1000) for _ in range(num_rows)]
        for i in range(5)
    })
    df = pd.DataFrame(data)
    return df.sample(frac=1, random_state=seed)  # shuffle rows
@profile
def pandas_join(df1, df2):
    """Left-merge ``df2`` into ``df1`` on the ``string1`` and ``string2`` columns.

    Returns the result of ``df1.merge(...)``; neither input is reassigned here.
    """
    key_columns = ('string1', 'string2')
    return df1.merge(
        df2,
        how='left',
        left_on=list(key_columns),
        right_on=list(key_columns),
    )
@profile
def polars_convert(df1, df2):
    """Convert both inputs with ``pl.from_pandas`` and return them as a pair.

    Each conversion stays on its own statement so the line-by-line memory
    profile reports them separately.
    """
    first_polars = pl.from_pandas(df1)
    second_polars = pl.from_pandas(df2)
    return first_polars, second_polars
@profile
def polars_join(pl_df1, pl_df2):
    """Left-join ``pl_df2`` onto ``pl_df1`` on the ``string1`` and ``string2`` columns."""
    key_columns = ['string1', 'string2']
    joined = pl_df1.join(pl_df2, on=key_columns, how='left')
    return joined
@profile
def pandas_convert(pl_df1, pl_df2):
    """Call ``to_pandas()`` on both inputs and return the results as a pair.

    Each conversion stays on its own statement so the line-by-line memory
    profile reports them separately.
    """
    first_pandas = pl_df1.to_pandas()
    second_pandas = pl_df2.to_pandas()
    return first_pandas, second_pandas
if __name__ == "__main__":
    # Benchmark driver: generate two frames, then time (1) a pandas merge,
    # (2) pandas -> polars conversion, (3) a polars join and
    # (4) polars -> pandas conversion.
    #
    # Each printed line starts with a wall-clock stamp from time.time().
    # Elapsed durations are taken from time.perf_counter() instead, because
    # it is monotonic and high resolution, whereas differences of
    # time.time() values can be skewed by system clock adjustments.
    start = time.time()
    print(start, " - starting")
    tick = time.perf_counter()
    # Generate the DataFrames with the same seed
    df1 = generate_dataframe(1_000_000, seed=42)
    df2 = generate_dataframe(1_000_000, seed=42)
    print(time.time(), " - time to generate dataframes: ", time.perf_counter() - tick)

    print(time.time(), " - starting pandas join")
    tick = time.perf_counter()
    pandas_join(df1, df2)  # result intentionally discarded; only cost is measured
    print(time.time(), " - time to pandas join: ", time.perf_counter() - tick)

    print(time.time(), " - starting polars convert")
    tick = time.perf_counter()
    pl_df1, pl_df2 = polars_convert(df1, df2)
    print(time.time(), " - time to polars convert: ", time.perf_counter() - tick)

    print(time.time(), " - starting polars join")
    tick = time.perf_counter()
    polars_join(pl_df1, pl_df2)  # result intentionally discarded; only cost is measured
    print(time.time(), " - time to polars join: ", time.perf_counter() - tick)

    print(time.time(), " - starting pandas convert")
    tick = time.perf_counter()
    df1, df2 = pandas_convert(pl_df1, pl_df2)
    print(time.time(), " - time to pandas convert: ", time.perf_counter() - tick)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment