Skip to content

Instantly share code, notes, and snippets.

Last active Oct 20, 2017
What would you like to do?
import pandas
from dask import dataframe
from dask.diagnostics import ProgressBar
def parallel_apply(df, func, progress=True, chunkrows=100, scheduler_address=None, *args, **kwargs):
    """Apply ``func`` to a pandas DataFrame in parallel through dask.

    The frame is converted to a dask DataFrame split into roughly
    ``len(df) / chunkrows`` partitions, ``func`` is applied with dask's
    ``apply``, and the computed result is returned.

    Args:
        df: pandas DataFrame to process.
        func: Callable forwarded to ``apply``.
        progress: When true, the computation runs inside a
            ``dask.diagnostics.ProgressBar`` context.
        chunkrows: Target number of rows per partition.
        scheduler_address: Optional address of a dask.distributed scheduler.
            When given, a ``Client`` is connected for the duration of the
            call and closed afterwards.
        *args: Extra positional arguments forwarded to ``apply``. Because they
            follow the defaulted parameters, they are only reached once
            ``progress``, ``chunkrows`` and ``scheduler_address`` have been
            passed positionally.
        **kwargs: Extra keyword arguments forwarded to ``apply``
            (for example ``axis=1``).

    Returns:
        The result of ``apply(func, *args, **kwargs).compute()``.
    """
    client = None
    if scheduler_address:
        from dask.distributed import Client
        # The client is never referenced again: by default a new Client
        # registers itself as the scheduler used by later compute() calls.
        client = Client(scheduler_address)
    try:
        # int(len / chunkrows) truncates to 0 for frames shorter than
        # chunkrows (or empty ones), which is not a usable partition count,
        # so always ask for at least one partition.
        partition_count = max(1, int(len(df) / chunkrows))
        sd = dataframe.from_pandas(df, npartitions=partition_count)
        lazy_result = sd.apply(func, *args, **kwargs)
        if progress:
            with ProgressBar():
                return lazy_result.compute()
        return lazy_result.compute()
    finally:
        # Release the scheduler connection instead of leaking one per call.
        if client is not None:
            client.close()
# Smoke test for the progress bar: compute a row-wise product of two
# million-element columns, one ascending and one descending.
test_df = pandas.DataFrame({
    'x': range(1000000),
    'y': range(999999, -1, -1),
})
test_df['z'] = parallel_apply(
    test_df,
    lambda record: record['x'] * record['y'],
    axis=1,
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment