Skip to content

Instantly share code, notes, and snippets.

@adamsardar
Last active March 8, 2024 16:40
Show Gist options
  • Save adamsardar/068f0c00e4bac64ffb7ea40cfc109f4d to your computer and use it in GitHub Desktop.
Save adamsardar/068f0c00e4bac64ffb7ea40cfc109f4d to your computer and use it in GitHub Desktop.
Column creation and copies
#%%
import copy
import timeit
import pandas as pd
import polars as pl
#%%
# Use pandas, but no need to make a defensive copy
# Benchmark 1: add a column to a small pandas DataFrame, comparing
# DataFrame.assign() against an explicit df.copy() plus item assignment.
pd.options.mode.copy_on_write = False
df = pd.DataFrame({"student_id": [1, 2, 3], "grade": ["A", "C", "D"]})
# timeit.timeit returns the total seconds for all `number` runs; with
# number=1000, total_seconds * 1000 is numerically the microseconds per run.
time_assign_col = timeit.timeit("df.assign(new_col = ['a', 'b', 'c'])", globals=globals(), number=1000)
print(f'{time_assign_col*1000:.3f} microseconds/run')
# 325.690 microseconds/run
# Same measurement for an explicit copy followed by column assignment
# (the copy is thrown away after each run; only the cost is of interest).
time_assign_col = timeit.timeit("df.copy()['new_col'] = ['a', 'b', 'c']", globals=globals(), number=1000)
print(f'{time_assign_col*1000:.3f} microseconds/run')
# 359.848 microseconds/run
# This is about 10-20% slower
# Safety check: build df2 via assign(), edit one cell of df2, then print both
# frames to see whether the edit leaked back into df.
df2 = df.assign(new_col = ['a', 'b', 'c'])
df2.at[2,'grade'] = 'F'
print(df)
# student_id grade
# 0 1 A
# 1 2 C
# 2 3 D
print(df2)
# student_id grade new_col
# 0 1 A a
# 1 2 C b
# 2 3 F c
# Even with copy-on-write turned off, .assign() is safe and takes a defensive copy (df is not updated, just df2)
#%%
# Use a dict, but don't make a defensive copy
def assign_col(unmodified_dict: dict) -> dict:
    """Add a ``new_col`` entry to ``unmodified_dict`` in place and return it.

    Despite the parameter name, no copy is taken: the caller's dict is
    mutated and the very same object is returned. This is deliberate -- the
    function is the "no defensive copy" baseline of the benchmark, used to
    show that the original is updated by reference.

    Args:
        unmodified_dict: Column-name -> list-of-values mapping to extend.

    Returns:
        The same dict object that was passed in, now containing ``new_col``.
    """
    unmodified_dict['new_col'] = ['a', 'b', 'c']
    return unmodified_dict
# Benchmark 2: the same column addition on a plain dict, with no copy taken.
dd = {"student_id": [1, 2, 3], "grade": ["A", "C", "D"]}
# With number=1000000 the total time in seconds is numerically the
# microseconds per run, so it is printed without rescaling.
time_assign_col = timeit.timeit('dd2 = assign_col(dd)', globals=globals(), number=1000000)
print(f'{time_assign_col:.3f} microseconds/run')
#0.253 microseconds/run
# WOW - so fast!!
# assign_col writes into the dict it is given, so by this point `dd` has
# already gained 'new_col' from the timed calls above; the prints below show it.
dd2 = assign_col(dd)
print(dd)
# {'student_id': [1, 2, 3], 'grade': ['A', 'C', 'D'], 'new_col': ['a', 'b', 'c']}
print(dd2)
# {'student_id': [1, 2, 3], 'grade': ['A', 'C', 'D'], 'new_col': ['a', 'b', 'c']}
# OH NO!!! I've updated the original by reference. I'll have to make a defensive copy
#%%
# Use a dict, but make a defensive copy
def assign_col_copy(unmodified_dict: dict) -> dict:
    """Return a deep copy of ``unmodified_dict`` with a ``new_col`` entry added.

    The input is left untouched: ``copy.deepcopy`` duplicates the dict and
    the value lists inside it, so later edits to the result (including edits
    to its nested lists) cannot leak back into the caller's data.

    Args:
        unmodified_dict: Column-name -> list-of-values mapping to copy.

    Returns:
        A new, independent dict containing every original entry plus
        ``new_col``.
    """
    copy_of_dict = copy.deepcopy(unmodified_dict)
    copy_of_dict['new_col'] = ['a', 'b', 'c']
    return copy_of_dict
# Benchmark 3: the dict version again, this time paying for a deep copy.
# Start from a fresh dict that has no 'new_col' key.
dd = {"student_id": [1, 2, 3], "grade": ["A", "C", "D"]}
# number=1000000 runs: total seconds is numerically microseconds per run.
time_assign_col = timeit.timeit('dd2 = assign_col_copy(dd)', globals=globals(), number=1000000)
print(f'{time_assign_col:.3f} microseconds/run')
#7.664 microseconds/run
dd2 = assign_col_copy(dd)
print(dd)
# {'student_id': [1, 2, 3], 'grade': ['A', 'C', 'D']}
print(dd2)
# {'student_id': [1, 2, 3], 'grade': ['A', 'C', 'D'], 'new_col': ['a', 'b', 'c']}
# This is some 30x slower than the no-copy version (7.664 vs 0.253 microseconds/run), but at least we don't modify the original
#%%
# Use polars - win
# Benchmark 4: add a column to a polars DataFrame with hstack().
pdf = pl.DataFrame({"student_id": [1, 2, 3], "grade": ["A", "C", "D"]})
# number=1000000 runs: total seconds is numerically microseconds per run.
# Note the timed statement also includes constructing the pl.Series.
time_assign_col = timeit.timeit("pdf.hstack([pl.Series('new_col', ['a', 'b', 'c'])])", globals=globals(), number=1000000)
print(f'{time_assign_col:.3f} microseconds/run')
# 10.355 microseconds/run
# Safety check: build pdf2 via hstack(), edit one cell of pdf2, then print both
# frames to see whether the edit leaked back into pdf.
pdf2 = pdf.hstack([pl.Series("new_col", ['a', 'b', 'c'])])
pdf2[2, 'grade'] = 'F'
print(pdf)
# shape: (3, 2)
# ┌────────────┬───────┐
# │ student_id ┆ grade │
# │ ---        ┆ ---   │
# │ i64        ┆ str   │
# ╞════════════╪═══════╡
# │ 1          ┆ A     │
# │ 2          ┆ C     │
# │ 3          ┆ D     │
# └────────────┴───────┘
print(pdf2)
# shape: (3, 3)
# ┌────────────┬───────┬─────────┐
# │ student_id ┆ grade ┆ new_col │
# │ ---        ┆ ---   ┆ ---     │
# │ i64        ┆ str   ┆ str     │
# ╞════════════╪═══════╪═════════╡
# │ 1          ┆ A     ┆ a       │
# │ 2          ┆ C     ┆ b       │
# │ 3          ┆ F     ┆ c       │
# └────────────┴───────┴─────────┘
# pdf is unchanged (grade is still 'D' in row 2); only pdf2 shows the edit.
# The deep-copying dict version (7.664 microseconds/run) is around 30% faster than polars (10.355),
# but the two timings are still comparable, and polars is 30+x quicker than pandas (~326 microseconds/run)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment