Last active
September 21, 2021 14:22
-
-
Save morkrispil/3944242494e08de4643fd42a76cb37ee to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
import numpy as np | |
import multiprocessing | |
from functools import partial | |
def _df_split(tup_arg, **kwargs):
    """Call a named method on one chunk and tag the result with its index.

    ``tup_arg`` is a ``(split_ind, df_split, df_f_name)`` tuple. The method
    called ``df_f_name`` is looked up on ``df_split`` and invoked with
    ``**kwargs``; the return value is ``(split_ind, method_result)``.
    """
    chunk_no, chunk, method_name = tup_arg
    bound_method = getattr(chunk, method_name)
    outcome = bound_method(**kwargs)
    return chunk_no, outcome
def df_multi_core(df, df_f_name, subset=None, njobs=-1, **kwargs):
    """Run a pandas method on consecutive row chunks of ``df`` in parallel.

    The frame (optionally restricted to ``subset``) is cut into row chunks,
    ``getattr(chunk, df_f_name)(**kwargs)`` is evaluated for every chunk in a
    ``multiprocessing.Pool`` worker, and the per-chunk results are
    concatenated in chunk order.

    Args:
        df: DataFrame to process.
        df_f_name: Name of the method to call on every chunk, e.g.
            ``'apply'`` or ``'isin'``.
        subset: Column label or list of labels selected with ``df[subset]``
            before splitting; ``None`` processes the whole frame.
        njobs: Number of worker processes; ``-1`` uses
            ``multiprocessing.cpu_count()``.
        **kwargs: Keyword arguments forwarded to the method. They are sent to
            the workers through ``pool.map`` together with the chunks.

    Returns:
        ``pd.concat`` of the per-chunk results.

    Raises:
        ValueError: If ``njobs`` is neither ``-1`` nor a positive number.
    """
    if njobs == -1:
        njobs = multiprocessing.cpu_count()
    if njobs < 1:
        raise ValueError(
            'njobs must be -1 or a positive integer, got {!r}'.format(njobs))

    # Explicit None check instead of relying on which exception df[None]
    # happens to raise.
    data = df if subset is None else df[subset]

    # Never create more chunks than rows, so workers are not handed empty
    # frames; an empty input still yields a single (empty) chunk.
    n_chunks = max(1, min(njobs, len(data)))

    # Split row positions rather than the frame itself, then slice with
    # .iloc so every chunk keeps its pandas type.
    row_groups = np.array_split(np.arange(len(data)), n_chunks)
    pool_data = [
        (split_ind, data.iloc[rows], df_f_name)
        for split_ind, rows in enumerate(row_groups)
    ]

    # The pool is created only once the input is known to be valid and is
    # always shut down, even when a worker raises.
    pool = multiprocessing.Pool(processes=n_chunks)
    try:
        results = pool.map(partial(_df_split, **kwargs), pool_data)
    finally:
        pool.close()
        pool.join()

    # pool.map returns results in the order of pool_data, so no re-sorting
    # by split index is needed.
    return pd.concat([chunk_result for _, chunk_result in results])
# testing on "apply" and "isin" pandas functions
# you can also use tqdm's progress_apply (a pandas plugin function) to get nice progress bars for each core (but it is slower)
from time import time | |
from tqdm import tqdm | |
def apply_f(row):
    """Row-wise function for the ``apply`` benchmark: ``row['c1'] + 0.1``."""
    return row['c1'] + 0.1


# apply_f is defined at module level, ahead of the __main__ guard, so that
# worker processes which re-import this module can still resolve it by name
# when pool.map ships it to them.
if __name__ == '__main__':
    sep = '-' * 50

    # isin test: native DataFrame.isin vs. the multi-core wrapper
    N = 10000000
    df = pd.DataFrame({'c1': np.random.randint(low=1, high=N, size=N), 'c2': np.arange(N)})
    lookfor = np.random.randint(low=1, high=N, size=1000000)

    print('{}\ntesting pandas isin on {}\n{}'.format(sep, df.shape, sep))
    t1 = time()
    print('result\n{}'.format(df.isin(lookfor).sum()))
    t2 = time()
    print('time for native implementation {}\n{}'.format(round(t2 - t1, 2), sep))

    t3 = time()
    res = df_multi_core(df=df, df_f_name='isin', subset=['c1'], njobs=-1, values=lookfor)
    print('result\n{}'.format(res.sum()))
    t4 = time()
    print('time for multi core implementation {}\n{}'.format(round(t4 - t3, 2), sep))

    # apply / tqdm apply test
    N = 1000000
    np.random.seed(0)
    df = pd.DataFrame({'c1': np.arange(N), 'c2': np.arange(N)})

    print('testing pandas apply on {}\n{}'.format(df.shape, sep))
    t1 = time()
    # For a progress bar, call tqdm.pandas() once and then use
    # df.progress_apply(apply_f, axis=1) in place of df.apply below.
    res = df.apply(apply_f, axis=1)
    t2 = time()
    print('result random sample\n{}'.format(res.sample(n=3, random_state=0)))
    print('time for native implementation {}\n{}'.format(round(t2 - t1, 2), sep))

    t3 = time()
    res = df_multi_core(df=df, df_f_name='apply', subset=['c1'], njobs=-1, func=apply_f, axis=1)
    t4 = time()
    print('result random sample\n{}'.format(res.sample(n=3, random_state=0)))
    print('time for multi core implementation {}\n{}'.format(round(t4 - t3, 2), sep))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment