Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
import pandas as pd
import numpy as np
import multiprocessing
from functools import partial
def _df_split(tup_arg, **kwargs):
split_ind, df_split, df_f_name = tup_arg
return (split_ind, getattr(df_split, df_f_name)(**kwargs))
def df_multi_core(df, df_f_name, subset=None, njobs=-1, **kwargs):
if njobs == -1:
njobs = multiprocessing.cpu_count()
pool = multiprocessing.Pool(processes=njobs)
try:
splits = np.array_split(df[subset], njobs)
except ValueError:
splits = np.array_split(df, njobs)
pool_data = [(split_ind, df_split, df_f_name) for split_ind, df_split in enumerate(splits)]
results = pool.map(partial(_df_split, **kwargs), pool_data)
pool.close()
pool.join()
results = sorted(results, key=lambda x:x[0])
results = pd.concat([split[1] for split in results])
return results
# testing on "apply" and "isin" pandas functions
# you can also use tqdm's progress_apply (a pandas plugin function), to get nice progrees bars per each core (but slower..)
from time import time
from tqdm import tqdm
if __name__ == '__main__':
sep = '-' * 50
# isin test
N = 10000000
df = pd.DataFrame({'c1': np.random.randint(low=1, high=N, size=N), 'c2': np.arange(N)})
lookfor = np.random.randint(low=1, high=N, size=1000000)
print('{}\ntesting pandas isin on {}\n{}'.format(sep, df.shape, sep))
t1 = time()
print('result\n{}'.format(df.isin(lookfor).sum()))
t2 = time()
print('time for native implementation {}\n{}'.format(round(t2 - t1, 2), sep))
t3 = time()
res = df_multi_core(df=df, df_f_name='isin', subset=['c1'], njobs=-1, values=lookfor)
print('result\n{}'.format(res.sum()))
t4 = time()
print('time for multi core implementation {}\n{}'.format(round(t4 - t3, 2), sep))
def apply_f(row):
return row['c1'] + 0.1
# apply / tqdm apply test
N = 1000000
np.random.seed(0)
df = pd.DataFrame({'c1': np.arange(N), 'c2': np.arange(N)})
print('testing pandas apply on {}\n{}'.format(df.shape, sep))
t1 = time()
# res = df.progress_apply(apply_f, axis=1)
# tqdm.pandas()
res = df.apply(apply_f, axis=1)
t2 = time()
print('result random sample\n{}'.format(res.sample(n=3, random_state=0)))
print('time for native implementation {}\n{}'.format(round(t2 - t1, 2), sep))
t3 = time()
res = df_multi_core(df=df, df_f_name='apply', subset=['c1'], njobs=-1, func=apply_f, axis=1)
t4 = time()
print('result random sample\n{}'.format(res.sample(n=3, random_state=0)))
print('time for multi core implementation {}\n{}'.format(round(t4 - t3, 2), sep))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.