# adamsardar/CoW_assign.py

## CoW_assign.py
#%%

import copy
import timeit
import pandas as pd
import polars as pl

#%%

# Use pandas, but no need to make a defensive copy

# Switch the pandas copy-on-write option off for this experiment.
pd.options.mode.copy_on_write = False

df = pd.DataFrame({"student_id": [1, 2, 3], "grade": ["A", "C", "D"]})

# timeit.timeit returns the total time in seconds for `number` executions.
# With number=1000, total_seconds * 1000 is the mean time per run in microseconds.
time_assign_col = timeit.timeit("df.assign(new_col = ['a', 'b', 'c'])", globals=globals(), number=1000)
print(f'{time_assign_col*1000:.3f} microseconds/run')
# 325.690 microseconds/run

# Same measurement for an explicit df.copy() followed by a column assignment.
time_assign_col = timeit.timeit("df.copy()['new_col'] = ['a', 'b', 'c']", globals=globals(), number=1000)
print(f'{time_assign_col*1000:.3f} microseconds/run')
# 359.848 microseconds/run
# This is about 10-20% slower than .assign()

# Build df2 from df with .assign(), then edit one cell of df2 only.
df2 = df.assign(new_col = ['a', 'b', 'c'])

df2.at[2,'grade'] = 'F'

print(df)
#    student_id grade
# 0           1     A
# 1           2     C
# 2           3     D

print(df2)
#    student_id grade new_col
# 0           1     A       a
# 1           2     C       b
# 2           3     F       c

# Even with copy-on-write turned off, .assign() is safe and takes a defensive copy:
# the edit to df2 shows up only in df2, df still has grade 'D' in row 2.

#%%

# Use a dict, but don't make a defensive copy

def assign_col(unmodified_dict: dict) -> dict:
    """Add a ``'new_col'`` entry to the given dict and hand the same dict back.

    No copy is taken: the caller's dict is modified in place, and the
    returned object is that very dict (despite the parameter's name).
    """
    new_values = ['a', 'b', 'c']
    unmodified_dict['new_col'] = new_values
    return unmodified_dict


dd = {"student_id": [1, 2, 3], "grade": ["A", "C", "D"]}

# timeit.timeit returns total seconds for `number` executions; with
# number=1000000 that total is numerically the mean microseconds per run,
# so no scaling is applied before printing.
# Note that assign_col writes into the dict it is given, so dd already
# contains 'new_col' once this timing loop has run.
time_assign_col = timeit.timeit('dd2 = assign_col(dd)', globals=globals(), number=1000000)
print(f'{time_assign_col:.3f} microseconds/run')
# 0.253 microseconds/run
# WOW - so fast!!

dd2 = assign_col(dd)

print(dd)
# {'student_id': [1, 2, 3], 'grade': ['A', 'C', 'D'], 'new_col': ['a', 'b', 'c']}
print(dd2)
# {'student_id': [1, 2, 3], 'grade': ['A', 'C', 'D'], 'new_col': ['a', 'b', 'c']}

# OH NO!!! I've updated the original by reference (dd gained 'new_col' too).
# I'll have to make a defensive copy


#%%

# Use a dict, but make a defensive copy

def assign_col_copy(unmodified_dict: dict) -> dict:
    """Return a deep copy of the given dict with a ``'new_col'`` entry added.

    The input dict and the objects nested inside it are left untouched;
    all changes are made on the copy, which is what gets returned.
    """
    duplicate = copy.deepcopy(unmodified_dict)
    duplicate['new_col'] = ['a', 'b', 'c']
    return duplicate

# Start again from a fresh dict without 'new_col'.
dd = {"student_id": [1, 2, 3], "grade": ["A", "C", "D"]}

# number=1000000, so the total in seconds reads directly as microseconds per run.
time_assign_col = timeit.timeit('dd2 = assign_col_copy(dd)', globals=globals(), number=1000000)
print(f'{time_assign_col:.3f} microseconds/run')
# 7.664 microseconds/run

dd2 = assign_col_copy(dd)

print(dd)
# {'student_id': [1, 2, 3], 'grade': ['A', 'C', 'D']}
print(dd2)
# {'student_id': [1, 2, 3], 'grade': ['A', 'C', 'D'], 'new_col': ['a', 'b', 'c']}

# This is roughly 30x slower than the in-place version (7.664 vs 0.253 microseconds/run),
# but at least we don't modify the original

#%%

# Use polars - win

pdf = pl.DataFrame({"student_id": [1, 2, 3], "grade": ["A", "C", "D"]})

# number=1000000, so the total in seconds reads directly as microseconds per run.
time_assign_col = timeit.timeit("pdf.hstack([pl.Series('new_col', ['a', 'b', 'c'])])", globals=globals(), number=1000000)
print(f'{time_assign_col:.3f} microseconds/run')
# 10.355 microseconds/run

# Build pdf2 from pdf by stacking on a new column, then edit one cell of pdf2 only.
pdf2 = pdf.hstack([pl.Series("new_col", ['a', 'b', 'c'])])

pdf2[2, 'grade'] = 'F'

# pdf is unchanged by the edit to pdf2 (row 2 still has grade 'D').
print(pdf)
# shape: (3, 2)
# ┌────────────┬───────┐
# │ student_id ┆ grade │
# │ ---        ┆ ---   │
# │ i64        ┆ str   │
# ╞════════════╪═══════╡
# │ 1          ┆ A     │
# │ 2          ┆ C     │
# │ 3          ┆ D     │
# └────────────┴───────┘
print(pdf2)
# shape: (3, 3)
# ┌────────────┬───────┬─────────┐
# │ student_id ┆ grade ┆ new_col │
# │ ---        ┆ ---   ┆ ---     │
# │ i64        ┆ str   ┆ str     │
# ╞════════════╪═══════╪═════════╡
# │ 1          ┆ A     ┆ a       │
# │ 2          ┆ C     ┆ b       │
# │ 3          ┆ F     ┆ c       │
# └────────────┴───────┴─────────┘


# The deep-copying dict version (7.664 microseconds/run) is around 30% faster than
# polars (10.355 microseconds/run), but the timings are still comparable, and polars
# is 30+x quicker than pandas (325.690 microseconds/run)
	#%%

	import copy
	import timeit
	import pandas as pd
	import polars as pl

	#%%

	# Use pandas, but no need to make a defensive copy
	# NOTE(review): every line of this cell carries a leading tab and repeats an
	# earlier cell of this file - looks like a paste artifact; confirm whether
	# the duplicate should be removed.

	# Switch the pandas copy-on-write option off for this experiment.
	pd.options.mode.copy_on_write = False

	df = pd.DataFrame({"student_id": [1, 2, 3], "grade": ["A", "C", "D"]})

	# timeit.timeit returns the total time in seconds for `number` executions.
	# With number=1000, total_seconds * 1000 is the mean time per run in microseconds.
	time_assign_col = timeit.timeit("df.assign(new_col = ['a', 'b', 'c'])", globals=globals(), number=1000)
	print(f'{time_assign_col*1000:.3f} microseconds/run')
	# 325.690 microseconds/run

	# Same measurement for an explicit df.copy() followed by a column assignment.
	time_assign_col = timeit.timeit("df.copy()['new_col'] = ['a', 'b', 'c']", globals=globals(), number=1000)
	print(f'{time_assign_col*1000:.3f} microseconds/run')
	# 359.848 microseconds/run
	# This is about 10-20% slower than .assign()

	# Build df2 from df with .assign(), then edit one cell of df2 only.
	df2 = df.assign(new_col = ['a', 'b', 'c'])

	df2.at[2,'grade'] = 'F'

	print(df)
	#    student_id grade
	# 0           1     A
	# 1           2     C
	# 2           3     D

	print(df2)
	#    student_id grade new_col
	# 0           1     A       a
	# 1           2     C       b
	# 2           3     F       c

	# Even with copy-on-write turned off, .assign() is safe and takes a defensive copy:
	# the edit to df2 shows up only in df2, df still has grade 'D' in row 2.

	#%%

	# Use a dict, but don't make a defensive copy

	def assign_col(unmodified_dict: dict) -> dict:
		"""Add a ``'new_col'`` entry to the given dict and return that same dict.

		No copy is taken: the caller's dict is modified in place, and the
		returned object is that very dict (despite the parameter's name).
		"""
		unmodified_dict['new_col'] = ['a', 'b', 'c']
		return unmodified_dict


	dd = {"student_id": [1, 2, 3], "grade": ["A", "C", "D"]}

	# timeit.timeit returns total seconds for `number` executions; with
	# number=1000000 that total is numerically the mean microseconds per run,
	# so no scaling is applied before printing.
	time_assign_col = timeit.timeit('dd2 = assign_col(dd)', globals=globals(), number=1000000)
	print(f'{time_assign_col:.3f} microseconds/run')
	# 0.253 microseconds/run
	# WOW - so fast!!

	dd2 = assign_col(dd)

	print(dd)
	# {'student_id': [1, 2, 3], 'grade': ['A', 'C', 'D'], 'new_col': ['a', 'b', 'c']}
	print(dd2)
	# {'student_id': [1, 2, 3], 'grade': ['A', 'C', 'D'], 'new_col': ['a', 'b', 'c']}

	# OH NO!!! I've updated the original by reference (dd gained 'new_col' too).
	# I'll have to make a defensive copy


	#%%

	# Use a dict, but make a defensive copy

	def assign_col_copy(unmodified_dict: dict) -> dict:
		"""Return a deep copy of the given dict with a ``'new_col'`` entry added.

		The input dict and the objects nested inside it are left untouched;
		all changes are made on the copy, which is what gets returned.
		"""
		copy_of_dict = copy.deepcopy(unmodified_dict)
		copy_of_dict['new_col'] = ['a', 'b', 'c']
		return copy_of_dict

	# Start again from a fresh dict without 'new_col'.
	dd = {"student_id": [1, 2, 3], "grade": ["A", "C", "D"]}

	# number=1000000, so the total in seconds reads directly as microseconds per run.
	time_assign_col = timeit.timeit('dd2 = assign_col_copy(dd)', globals=globals(), number=1000000)
	print(f'{time_assign_col:.3f} microseconds/run')
	# 7.664 microseconds/run

	dd2 = assign_col_copy(dd)

	print(dd)
	# {'student_id': [1, 2, 3], 'grade': ['A', 'C', 'D']}
	print(dd2)
	# {'student_id': [1, 2, 3], 'grade': ['A', 'C', 'D'], 'new_col': ['a', 'b', 'c']}

	# This is roughly 30x slower than the in-place version (7.664 vs 0.253 microseconds/run),
	# but at least we don't modify the original

	#%%

	# Use polars - win

	pdf = pl.DataFrame({"student_id": [1, 2, 3], "grade": ["A", "C", "D"]})

	# number=1000000, so the total in seconds reads directly as microseconds per run.
	time_assign_col = timeit.timeit("pdf.hstack([pl.Series('new_col', ['a', 'b', 'c'])])", globals=globals(), number=1000000)
	print(f'{time_assign_col:.3f} microseconds/run')
	# 10.355 microseconds/run

	# Build pdf2 from pdf by stacking on a new column, then edit one cell of pdf2 only.
	pdf2 = pdf.hstack([pl.Series("new_col", ['a', 'b', 'c'])])

	pdf2[2, 'grade'] = 'F'

	# pdf is unchanged by the edit to pdf2 (row 2 still has grade 'D').
	print(pdf)
	# shape: (3, 2)
	# ┌────────────┬───────┐
	# │ student_id ┆ grade │
	# │ ---        ┆ ---   │
	# │ i64        ┆ str   │
	# ╞════════════╪═══════╡
	# │ 1          ┆ A     │
	# │ 2          ┆ C     │
	# │ 3          ┆ D     │
	# └────────────┴───────┘
	print(pdf2)
	# shape: (3, 3)
	# ┌────────────┬───────┬─────────┐
	# │ student_id ┆ grade ┆ new_col │
	# │ ---        ┆ ---   ┆ ---     │
	# │ i64        ┆ str   ┆ str     │
	# ╞════════════╪═══════╪═════════╡
	# │ 1          ┆ A     ┆ a       │
	# │ 2          ┆ C     ┆ b       │
	# │ 3          ┆ F     ┆ c       │
	# └────────────┴───────┴─────────┘


	# The deep-copying dict version (7.664 microseconds/run) is around 30% faster than
	# polars (10.355 microseconds/run), but the timings are still comparable, and polars
	# is 30+x quicker than pandas (325.690 microseconds/run)