Last active
March 8, 2024 16:40
-
-
Save adamsardar/068f0c00e4bac64ffb7ea40cfc109f4d to your computer and use it in GitHub Desktop.
Column creation and copies
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#%% | |
import copy | |
import timeit | |
import pandas as pd | |
import polars as pl | |
#%%
# Use pandas, but no need to make a defensive copy
pd.options.mode.copy_on_write = False
df = pd.DataFrame({"student_id": [1, 2, 3], "grade": ["A", "C", "D"]})

# timeit.timeit returns the TOTAL seconds for all `number` runs. With
# number=1000: total_s / 1000 runs * 1e6 us/s == total_s * 1000, which is why
# the printed figure is simply `time_assign_col * 1000`.
time_assign_col = timeit.timeit("df.assign(new_col = ['a', 'b', 'c'])", globals=globals(), number=1000)
print(f'{time_assign_col*1000:.3f} microseconds/run')
# 325.690 microseconds/run

# Same measurement for an explicit copy followed by a column assignment on
# that (temporary) copy.
time_assign_col = timeit.timeit("df.copy()['new_col'] = ['a', 'b', 'c']", globals=globals(), number=1000)
print(f'{time_assign_col*1000:.3f} microseconds/run')
# 359.848 microseconds/run
# This is about 10-20% slower

# Check that .assign() did not hand back something that aliases `df`:
# modify the result and confirm the original is untouched.
df2 = df.assign(new_col = ['a', 'b', 'c'])
df2.at[2,'grade'] = 'F'
print(df)
#    student_id grade
# 0           1     A
# 1           2     C
# 2           3     D
print(df2)
#    student_id grade new_col
# 0           1     A       a
# 1           2     C       b
# 2           3     F       c
# Even with copy-on-write turned off, .assign() is safe and takes a defensive copy (df is not updated, just df2)
#%%
# Use a dict, but don't make a defensive copy
def assign_col(unmodified_dict: dict) -> dict:
    """Add a ``new_col`` entry to the given dict IN PLACE and return it.

    No copy is taken: the returned object is the very same dict that was
    passed in, so the caller's dict gains the ``new_col`` key as well.
    This is deliberate - the function is the "no defensive copy" baseline
    of the benchmark.

    Args:
        unmodified_dict: Column-name -> list-of-values mapping. Despite the
            parameter name, it IS modified by this call.

    Returns:
        The same dict object that was passed in, now containing
        ``'new_col': ['a', 'b', 'c']``.
    """
    unmodified_dict['new_col'] = ['a', 'b', 'c']
    return unmodified_dict
dd = {"student_id": [1, 2, 3], "grade": ["A", "C", "D"]}

# timeit.timeit returns the TOTAL seconds for all `number` runs. With
# number=1000000, total seconds is numerically equal to microseconds per run,
# so the raw value is printed without any scaling.
# Note: assign_col mutates `dd`, so every run after the first just overwrites
# the existing 'new_col' key.
time_assign_col = timeit.timeit('dd2 = assign_col(dd)', globals=globals(), number=1000000)
print(f'{time_assign_col:.3f} microseconds/run')
#0.253 microseconds/run
# WOW - so fast!!

dd2 = assign_col(dd)
print(dd)
# {'student_id': [1, 2, 3], 'grade': ['A', 'C', 'D'], 'new_col': ['a', 'b', 'c']}
print(dd2)
# {'student_id': [1, 2, 3], 'grade': ['A', 'C', 'D'], 'new_col': ['a', 'b', 'c']}
# OH NO!!! I've updated the original by reference. I'll have to make a defensive copy
#%%
# Use a dict, but make a defensive copy
def assign_col_copy(unmodified_dict: dict) -> dict:
    """Return a deep copy of the given dict with a ``new_col`` entry added.

    The input is copied with ``copy.deepcopy`` before the new key is set, so
    neither the caller's dict nor the lists stored in it are shared with the
    returned value.

    Args:
        unmodified_dict: Column-name -> list-of-values mapping. It is left
            untouched by this call.

    Returns:
        A new dict holding deep copies of the original entries plus
        ``'new_col': ['a', 'b', 'c']``.
    """
    copy_of_dict = copy.deepcopy(unmodified_dict)
    copy_of_dict['new_col'] = ['a', 'b', 'c']
    return copy_of_dict
dd = {"student_id": [1, 2, 3], "grade": ["A", "C", "D"]}

# timeit.timeit returns the TOTAL seconds for all `number` runs. With
# number=1000000, total seconds is numerically equal to microseconds per run,
# so the raw value is printed without any scaling.
time_assign_col = timeit.timeit('dd2 = assign_col_copy(dd)', globals=globals(), number=1000000)
print(f'{time_assign_col:.3f} microseconds/run')
#7.664 microseconds/run

dd2 = assign_col_copy(dd)
print(dd)
# {'student_id': [1, 2, 3], 'grade': ['A', 'C', 'D']}
print(dd2)
# {'student_id': [1, 2, 3], 'grade': ['A', 'C', 'D'], 'new_col': ['a', 'b', 'c']}
# This is some 30x slower (7.664 vs 0.253 microseconds/run), but at least we don't modify the original
#%%
# Use polars - win
pdf = pl.DataFrame({"student_id": [1, 2, 3], "grade": ["A", "C", "D"]})

# timeit.timeit returns the TOTAL seconds for all `number` runs. With
# number=1000000, total seconds is numerically equal to microseconds per run,
# so the raw value is printed without any scaling.
time_assign_col = timeit.timeit("pdf.hstack([pl.Series('new_col', ['a', 'b', 'c'])])", globals=globals(), number=1000000)
print(f'{time_assign_col:.3f} microseconds/run')
# 10.355 microseconds/run

# Check that the frame returned by hstack can be modified without touching
# the original `pdf`.
pdf2 = pdf.hstack([pl.Series("new_col", ['a', 'b', 'c'])])
pdf2[2, 'grade'] = 'F'
print(pdf)
# shape: (3, 2)
# ┌────────────┬───────┐
# │ student_id ┆ grade │
# │ ---        ┆ ---   │
# │ i64        ┆ str   │
# ╞════════════╪═══════╡
# │ 1          ┆ A     │
# │ 2          ┆ C     │
# │ 3          ┆ D     │
# └────────────┴───────┘
print(pdf2)
# shape: (3, 3)
# ┌────────────┬───────┬─────────┐
# │ student_id ┆ grade ┆ new_col │
# │ ---        ┆ ---   ┆ ---     │
# │ i64        ┆ str   ┆ str     │
# ╞════════════╪═══════╪═════════╡
# │ 1          ┆ A     ┆ a       │
# │ 2          ┆ C     ┆ b       │
# │ 3          ┆ F     ┆ c       │
# └────────────┴───────┴─────────┘
# The base dict with a defensive deepcopy (7.664 microseconds/run) is around 30% faster than polars
# (10.355 microseconds/run), but the timings are still comparable, and polars is 30+x quicker than
# pandas (325.690 microseconds/run)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment