Skip to content

Instantly share code, notes, and snippets.

@ma7555
Created June 3, 2021 09:36
Show Gist options
  • Save ma7555/ccba29a0ccf9f2bd5a73d3dd6fafae00 to your computer and use it in GitHub Desktop.
Save ma7555/ccba29a0ccf9f2bd5a73d3dd6fafae00 to your computer and use it in GitHub Desktop.
Parallel Pandas Apply
import functools
import os
import time
from multiprocessing import Pool, cpu_count, freeze_support

import numpy as np
import pandas as pd
def timeit(method):
    """Decorate *method* so that every call reports its wall-clock duration.

    The duration is always expressed in milliseconds.

    * If the call receives a ``log_time`` keyword argument (a mutable
      mapping), the duration is stored there as an ``int`` under the key
      given by the ``log_name`` keyword argument, defaulting to the
      upper-cased function name.
    * Otherwise the duration is printed as ``"<name> <ms> ms"``.

    ``log_time`` / ``log_name`` are forwarded to *method* unchanged, so a
    function that wants to use them must accept them (e.g. via ``**kw``).

    Returns the wrapped function; its return value is passed through.
    """
    @functools.wraps(method)  # keep __name__/__doc__ of the wrapped function
    def timed(*args, **kw):
        ts = time.time()
        result = method(*args, **kw)
        te = time.time()
        # Convert once so both reporting branches agree on the unit.
        elapsed_ms = (te - ts) * 1000
        if 'log_time' in kw:
            name = kw.get('log_name', method.__name__.upper())
            kw['log_time'][name] = int(elapsed_ms)
        else:
            print('{} {:.2f} ms'.format(method.__name__, elapsed_ms))
        return result
    return timed
@timeit
def parallelize_dataframe(df, func, n_cores=16):
    """Apply *func* to row-wise chunks of *df* in parallel worker processes.

    Args:
        df: The DataFrame to process.
        func: A picklable callable (defined at module level) that takes a
            DataFrame chunk and returns a DataFrame.
        n_cores: Number of chunks and of worker processes to use.

    Returns:
        The concatenation, in original row order, of the DataFrames returned
        by *func* for each chunk.
    """
    # Split row *positions* rather than the DataFrame itself:
    # np.array_split(df, ...) goes through the deprecated DataFrame.swapaxes.
    # The resulting chunk sizes are the same as before.
    row_groups = np.array_split(np.arange(len(df)), n_cores)
    df_split = [df.iloc[rows] for rows in row_groups]

    pool = Pool(n_cores)
    try:
        processed = pool.map(func, df_split)
    finally:
        # Always release the worker processes, even if func raised.
        pool.close()
        pool.join()
    return pd.concat(processed)
def to_array(df):
    """Add an ``array`` column holding each ``list`` entry as a NumPy array.

    The column is written onto *df* itself, and that same object is returned.
    """
    as_arrays = df.list.apply(np.array)
    df['array'] = as_arrays
    return df
if __name__ == '__main__':
    # The guard is required: worker processes may re-import this module, and
    # without it each worker would try to start its own pool.
    # freeze_support() only matters when the script is bundled into a frozen
    # Windows executable; otherwise it does nothing.
    freeze_support()

    # Demo: 500 rows, each holding its own [0, 0] list (no shared aliasing).
    df = pd.DataFrame(data={'list': [[0, 0] for _ in range(500)]})
    df = parallelize_dataframe(df, to_array)
    print(df.head())
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment