Last active
March 1, 2018 13:15
-
-
Save ghandic/de8b73db6d09c24415b31ef33554681c to your computer and use it in GitHub Desktop.
Parallelizing the creation of large fake data sets: 1 million records in 60 seconds on 4 CPUs with 16 GB RAM
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np | |
import pandas as pd | |
from faker import Factory | |
from multiprocessing import Pool, cpu_count | |
faker = Factory.create() | |
def create_fake_data(data): | |
data['full name'] = data['id'].map(lambda _:faker.name()) | |
data['date of birth'] = data['id'].map(lambda _:faker.date_between('-100y', 'now')) | |
data['nationality'] = data['id'].map(lambda _:faker.country()) | |
data['reference'] = data['id'].map(lambda _:faker.random_letter().upper() + str(faker.random_number(digits=7))) | |
return data | |
def parallelize_faking(df, func, partitions, core_count): | |
# Creating a deep copy of the data so we dont overwite original | |
df_copy = df.copy() | |
df_split = np.array_split(df_copy, partitions) | |
pool = Pool(core_count) | |
df_copy = pd.concat(pool.map(func, df_split)) | |
pool.close() | |
pool.join() | |
return df_copy | |
def get_fake_df(nrows): | |
data = parallelize_faking(pd.DataFrame({'id': range(nrows)}), create_fake_data, cpu_count(), min(cpu_count()*4, nrows)) | |
data.drop(['id'], axis=1, inplace = True) | |
return data | |
# Examples: | |
get_fake_df(10) | |
# Profiling example (appears to be linear) | |
from datetime import datetime | |
timings = [] | |
tests = [1,10,100,1000,10000, 100000, 1000000] | |
for i in tests: | |
starttime = datetime.now() | |
get_fake_df(i) | |
print(timings) | |
timings.append((datetime.now() - starttime).total_seconds()) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment