Skip to content

Instantly share code, notes, and snippets.

@hughdbrown
Last active July 5, 2023 04:28
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save hughdbrown/2f125de201ef70e75c8ca6af95834e8b to your computer and use it in GitHub Desktop.
Save hughdbrown/2f125de201ef70e75c8ca6af95834e8b to your computer and use it in GitHub Desktop.
Fast way to do lookup conversion in pandas
#!/usr/bin/env python3
from datetime import datetime
from functools import wraps
from itertools import islice
from pprint import pprint
from random import randint
from time import perf_counter
from uuid import uuid4

import pandas as pd
# Number of ten-wide ranges placed in the sparse lookup built by main().
NUMS = 10_000
def time_decorator(func):
    """Wrap *func* so that each call prints how long it took.

    The wrapper forwards all positional and keyword arguments to *func*
    and returns its result unchanged.  The timing line is printed only
    after *func* returns; if *func* raises, the exception propagates and
    nothing is printed.
    """
    @wraps(func)  # keep func's __name__ / __doc__ on the wrapper
    def wrapper_function(*args, **kwargs):
        # perf_counter is monotonic and high-resolution, so the interval
        # cannot be skewed by wall-clock adjustments as datetime.now() can.
        start = perf_counter()
        result = func(*args, **kwargs)
        elapsed = perf_counter() - start
        # elapsed is a float number of seconds, matching the printed unit.
        print(f"{'-' * 30} {func.__name__!r}: {elapsed:.6f} seconds")
        return result
    return wrapper_function
@time_decorator
def create_raw_lookup(n):
    """Build the sparse lookup: *n* inclusive ten-wide ranges mapped to UUIDs.

    Keys are ``(lower, upper)`` tuples -- (0, 9), (10, 19), ... -- and each
    value is a freshly generated ``uuid4`` rendered as a string.
    """
    sparse = {}
    for bucket in range(n):
        lower = 10 * bucket
        sparse[(lower, lower + 9)] = str(uuid4())
    return sparse
@time_decorator
def create_expanded_lookup(raw_lookup):
    """Flatten a range-keyed mapping into one entry per integer.

    Each ``(lower, upper)`` key of *raw_lookup* is expanded so that every
    integer from ``lower`` through ``upper`` (inclusive) maps to that key's
    value.
    """
    dense = {}
    for bounds, label in raw_lookup.items():
        first, last = bounds
        # range() excludes its stop value, hence the + 1 for an inclusive end.
        for number in range(first, last + 1):
            dense[number] = label
    return dense
@time_decorator
def create_df(n_rows=1_000_000, n_zips=100_000):
    """Return a DataFrame with a single ``zip`` column of random integers.

    Args:
        n_rows: Number of rows to generate (default 1,000,000).
        n_zips: Size of the code space; values are drawn uniformly from
            ``0`` through ``n_zips - 1`` (default 100,000).

    Returns:
        A ``pandas.DataFrame`` with one column named ``zip``.
    """
    # random.randint(x, y) creates values between x and y inclusive,
    # so we subtract 1 from the end.
    highest = n_zips - 1
    # One column list instead of one dict per row: same data, far fewer
    # temporary objects for pandas to walk through.
    return pd.DataFrame({"zip": [randint(0, highest) for _ in range(n_rows)]})
@time_decorator
def apply_conversion(df, lookup):
    """Return a new DataFrame: *df* plus a ``value`` column mapped from ``zip``.

    Each entry of the ``zip`` column is translated through *lookup* with
    ``Series.map`` and the result is attached via ``DataFrame.assign``.
    """
    # for DataFrame.assign:
    #   https://tomaugspurger.net/posts/method-chaining/
    # for Series.map with DataFrame.assign:
    #   https://www.sharpsightlabs.com/blog/pandas-assign/
    converted = df["zip"].map(lookup)
    return df.assign(value=converted)
def main():
    """Run the benchmark end to end and print previews of each stage.

    Builds the sparse and dense lookups, creates the random DataFrame,
    maps the lookup onto it, and prints a summary and the first rows.
    """
    # Make a sparse representation
    raw_lookup = create_raw_lookup(NUMS)
    # islice previews the first items without copying the whole dict
    # into a list just to slice a handful of entries off the front.
    pprint(list(islice(raw_lookup.items(), 5)))

    # Convert to a dense representation
    lookup = create_expanded_lookup(raw_lookup)
    pprint(list(islice(lookup.items(), 20)))

    # Make a DataFrame with random zipcodes
    df = create_df()

    # Use the dense representation to add a column to the DataFrame
    df = apply_conversion(df, lookup)
    print(df.describe())
    print(df.head())
# Run the benchmark only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment