Parallelizing the creation of large fake data sets: 1 million records in 60 seconds on 4 CPUs, 16 GB RAM
import numpy as np
import pandas as pd
from faker import Factory
from multiprocessing import Pool, cpu_count

# Old-style faker API; recent releases expose the same generator as
# `from faker import Faker; faker = Faker()`.
faker = Factory.create()


def create_fake_data(data):
    # Each lambda ignores the id value; mapping over the 'id' column is
    # just a way to call faker once per row.
    data['full name'] = data['id'].map(lambda _: faker.name())
    data['date of birth'] = data['id'].map(lambda _: faker.date_between('-100y', 'now'))
    data['nationality'] = data['id'].map(lambda _: faker.country())
    data['reference'] = data['id'].map(lambda _: faker.random_letter().upper() + str(faker.random_number(digits=7)))
    return data
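
# A quick single-process sanity check (illustrative; not in the
# original gist):
# create_fake_data(pd.DataFrame({'id': range(3)}))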


def parallelize_faking(df, func, partitions, core_count):
    # Deep-copy the data so we don't overwrite the original.
    df_copy = df.copy()
    # Split the frame into `partitions` chunks and fake each chunk in a
    # pool of `core_count` worker processes, then stitch them back together.
    df_split = np.array_split(df_copy, partitions)
    pool = Pool(core_count)
    df_copy = pd.concat(pool.map(func, df_split))
    pool.close()
    pool.join()
    return df_copy
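
# Note: `Pool.map` requires the worker function to be picklable (hence
# the module-level `create_fake_data`), and on spawn-based platforms
# such as Windows the pool must be created under an
# `if __name__ == '__main__':` guard; the top-level calls below assume
# a fork-based platform such as Linux.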


def get_fake_df(nrows):
    # Split into up to 4 partitions per core (capped at nrows so no
    # chunk is empty) and run one worker per core.
    data = parallelize_faking(pd.DataFrame({'id': range(nrows)}),
                              create_fake_data,
                              min(cpu_count() * 4, nrows),
                              cpu_count())
    data.drop(['id'], axis=1, inplace=True)
    return data


# Examples:
get_fake_df(10)
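
# A usage sketch: persist a larger fake data set to CSV (the file name
# 'fake_people.csv' is illustrative, not part of the original gist):
get_fake_df(1000).to_csv('fake_people.csv', index=False)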

# Profiling example (runtime appears to scale linearly with row count)
from datetime import datetime

timings = []
tests = [1, 10, 100, 1000, 10000, 100000, 1000000]
for i in tests:
    starttime = datetime.now()
    get_fake_df(i)
    timings.append((datetime.now() - starttime).total_seconds())
print(timings)
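
# To eyeball the linearity claim, a minimal plotting sketch assuming
# matplotlib is installed (not part of the original gist):
import matplotlib.pyplot as plt

plt.plot(tests, timings, marker='o')
plt.xscale('log')
plt.yscale('log')
plt.xlabel('rows generated')
plt.ylabel('seconds')
plt.show()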