Skip to content

Instantly share code, notes, and snippets.

@hughdbrown
Last active March 17, 2023 17:56
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save hughdbrown/6d8bd9b5e7a6918751cc77180d50b272 to your computer and use it in GitHub Desktop.
Save hughdbrown/6d8bd9b5e7a6918751cc77180d50b272 to your computer and use it in GitHub Desktop.
Test of numpy.vectorize versus pandas DataFrame.apply
#!/usr/bin/env python3
# Validation of code in Medium article:
# https://medium.com/the-modern-scientist/make-pandas-code-120x-faster-a-forbidden-mathematical-jutsu-87103030eb9c
import timeit
import pandas as pd
import numpy as np
# Row counts of the benchmark DataFrames; test_apply and test_vectorize
# each build and time one DataFrame per entry, in this order.
sizes = [1_000, 10_000, 100_000, 1_000_000]
def generate_df(size):
    """Build a DataFrame of ``size`` rows of random synthetic records.

    Columns, in order: ``age``, ``avg_sleeping``, ``gender``,
    ``annual_income``, ``phone_number``, ``favourite_food``.

    Values are drawn from NumPy's global random state, so call
    ``np.random.seed`` beforehand if reproducible data is needed.

    Args:
        size: Number of rows to generate.

    Returns:
        A new ``pandas.DataFrame`` with ``size`` rows.
    """
    df = pd.DataFrame()
    # randint's upper bound is exclusive: ages are 1..99, sleep hours 1..23.
    df['age'] = np.random.randint(1, 100, size)
    df['avg_sleeping'] = np.random.randint(1, 24, size)
    df['gender'] = np.random.choice(['Male', 'Female'], size)
    df['annual_income'] = np.random.randint(1000, 100000, size)
    # These bounds do not fit in int32. Request int64 explicitly so the call
    # does not raise ValueError on platforms whose default integer is 32-bit.
    df['phone_number'] = np.random.randint(
        1_111_111_111, 9_999_999_999, size, dtype=np.int64
    )
    df['favourite_food'] = np.random.choice(
        ['pizza', 'burger', 'chips', 'nachos'], size
    )
    return df
def reward_function(row):
    """Compute the reward for a single record.

    The bonus percentage starts at 10 and grows as follows:

    * +10 if ``avg_sleeping >= 6`` and ``5000 <= annual_income <= 10000``
    * +20 if ``60 <= age <= 90`` and gender is ``"Female"``; +18 for any
      other gender in that age range

    Args:
        row: Mapping-like object (e.g. a DataFrame row) providing the keys
            ``avg_sleeping``, ``annual_income``, ``age`` and ``gender``.

    Returns:
        ``annual_income`` multiplied by the bonus percentage / 100.
    """
    total_bonus = 10
    if (row['avg_sleeping'] >= 6) and (5000 <= row['annual_income'] <= 10000):
        total_bonus += 10
    if 60 <= row['age'] <= 90:
        total_bonus += 20 if (row["gender"] == "Female") else 18
    return row['annual_income'] * (total_bonus / 100)
def wrapper(func, *args, **kwargs):
    """Bind arguments to ``func`` and return a zero-argument callable.

    The benchmarks hand the returned callable to ``timeit.timeit``, which
    invokes it without arguments.

    Args:
        func: Callable to invoke.
        *args: Positional arguments forwarded to ``func`` on every call.
        **kwargs: Keyword arguments forwarded to ``func`` on every call.

    Returns:
        A function taking no arguments that returns
        ``func(*args, **kwargs)``.
    """
    def wrapped():
        return func(*args, **kwargs)

    return wrapped
def apply_function(df):
    """Add a ``reward`` column computed row by row via ``DataFrame.apply``.

    Args:
        df: DataFrame with the columns read by ``reward_function``.

    Returns:
        The same ``df`` object; it is modified in place, not copied.
    """
    df['reward'] = df.apply(reward_function, axis=1)
    return df
def test_apply():
    """Benchmark ``apply_function`` for every entry in ``sizes``.

    Prints one line per size with the total wall time in seconds.
    """
    for size in sizes:
        df = generate_df(size)
        wrap = wrapper(apply_function, df)
        # timeit returns the total for all 10 calls, not a per-call average.
        n = timeit.timeit(wrap, number=10)
        print(f'Size: {size} | Time: {n}')
def reward_function_part(avg_sleeping, annual_income, gender, age):
    """Compute the reward for one record from its individual field values.

    Scalar counterpart of ``reward_function`` intended to be wrapped with
    ``np.vectorize``. The bonus percentage starts at 10 and grows as follows:

    * +10 if ``avg_sleeping >= 6`` and ``5000 <= annual_income <= 10000``
    * +20 if ``60 <= age <= 90`` and gender is ``"Female"``; +18 for any
      other gender in that age range

    Args:
        avg_sleeping: Average hours slept.
        annual_income: Annual income.
        gender: Gender label; only ``"Female"`` is treated specially.
        age: Age in years.

    Returns:
        ``annual_income`` multiplied by the bonus percentage / 100.
    """
    total_bonus = 10
    if (avg_sleeping >= 6) and (5000 <= annual_income <= 10000):
        total_bonus += 10
    if 60 <= age <= 90:
        # Add to the accumulated bonus. Assigning here instead would discard
        # the base and sleep/income bonuses and disagree with reward_function.
        total_bonus += 20 if gender == "Female" else 18
    return annual_income * (total_bonus / 100)
def vectorize_function(df):
    """Add a ``reward`` column using ``np.vectorize`` over the input columns.

    Args:
        df: DataFrame with ``avg_sleeping``, ``annual_income``, ``gender``
            and ``age`` columns.

    Returns:
        The same ``df`` object; it is modified in place, not copied.
    """
    # Declaring the output type lets np.vectorize skip its extra probing call
    # on the first element and makes it accept an empty DataFrame.
    vectorized = np.vectorize(reward_function_part, otypes=[float])
    df["reward"] = vectorized(
        df['avg_sleeping'], df['annual_income'], df['gender'], df['age']
    )
    return df
def test_vectorize():
    """Benchmark ``vectorize_function`` for every entry in ``sizes``.

    Prints one line per size with the total wall time in seconds.
    """
    for size in sizes:
        df = generate_df(size)
        wrap = wrapper(vectorize_function, df)
        # timeit returns the total for all 10 calls, not a per-call average.
        n = timeit.timeit(wrap, number=10)
        print(f'Size: {size} | Time: {n}')
# Run both benchmarks only when executed as a script, not on import.
if __name__ == '__main__':
    test_apply()
    test_vectorize()
@hughdbrown
Copy link
Author

The numbers I get with the fourth revision are:

apply
Size: 1000 | Time: 0.158108046
Size: 10000 | Time: 1.460299413
Size: 100000 | Time: 15.09842858
Size: 1000000 | Time: 152.397236201

vectorize
Size: 1000 | Time: 0.02020495899998309
Size: 10000 | Time: 0.1365422229999922
Size: 100000 | Time: 1.258238623000011
Size: 1000000 | Time: 12.476223318999985

That is roughly an 8-12 times speedup for vectorize over apply (7.8x at 1,000 rows, 12.2x at 1,000,000 rows).

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment