Skip to content

Instantly share code, notes, and snippets.

@gilbertfrancois
Last active March 1, 2022 13:30
Show Gist options
  • Save gilbertfrancois/559d23dc59b63e8eaa4442130d371e58 to your computer and use it in GitHub Desktop.
Save gilbertfrancois/559d23dc59b63e8eaa4442130d371e58 to your computer and use it in GitHub Desktop.
Apply a function to a pandas DataFrame in parallel, splitting it into row batches that are processed across multiple cores.
import numpy as np
import pandas as pd
from functools import partial
from multiprocessing import Pool
def apply_multicore(df, n_workers, fn, **kwargs):
    """
    Apply a function to a pandas DataFrame in row batches, with multicore support.

    The DataFrame is split row-wise into contiguous, nearly equal batches.
    Each batch is passed to ``fn`` in a separate worker process and the
    returned objects are concatenated in the original batch order.

    Parameters
    ----------
    df: pd.DataFrame
        Input DataFrame
    n_workers: int
        Maximum number of worker processes (must be >= 1). No more
        processes than rows are started, so no worker receives an empty
        batch unless ``df`` itself is empty.
    fn: function
        Function reference. It is called as ``fn(batch, **kwargs)`` and must
        return an object accepted by ``pd.concat``. Because it is sent to
        worker processes, it must be picklable (e.g. a module-level function).
    **kwargs: dict
        Function arguments (optional)

    Returns
    -------
    pd.DataFrame
        Output DataFrame

    Raises
    ------
    ValueError
        If ``n_workers`` is smaller than 1.

    Example
    -------
    >>> import pandas as pd
    >>> import numpy as np
    >>> from functools import partial
    >>> from multiprocessing import Pool
    >>>
    >>> def test_fn(df, column_name, a):
    ...     # Some arbitrary function.
    ...     df[column_name] = df[column_name].apply(lambda x: x + a)
    ...     return df
    ...
    >>> # Fix random seed
    >>> np.random.seed(1)
    >>> # Create DataFrame and some random data.
    >>> df = pd.DataFrame()
    >>> df["col_a"] = np.random.rand(100)
    >>> df.head()
          col_a
    0  0.417022
    1  0.720324
    2  0.000114
    3  0.302333
    4  0.146756
    >>> # Add 100 to every value in the column 'col_a' using 4 workers.
    >>> df = apply_multicore(df, 4, test_fn, column_name="col_a", a=100)
    >>> df.head()
            col_a
    0  100.417022
    1  100.720324
    2  100.000114
    3  100.302333
    4  100.146756
    """
    if n_workers < 1:
        raise ValueError("n_workers must be a positive integer, got %r" % (n_workers,))

    n_rows = len(df)
    # Never create more batches than rows, but keep at least one batch so
    # that an empty DataFrame still goes through ``fn`` and ``pd.concat``.
    n_batches = max(1, min(n_workers, n_rows))

    # Split positionally with ``iloc`` instead of ``np.array_split``: the
    # latter relies on ``DataFrame.swapaxes``, which pandas has deprecated.
    # Batch sizes follow the ``np.array_split`` convention: the first
    # ``extra`` batches get one additional row.
    base_size, extra = divmod(n_rows, n_batches)
    df_batches = []
    start = 0
    for i in range(n_batches):
        stop = start + base_size + (1 if i < extra else 0)
        df_batches.append(df.iloc[start:stop])
        start = stop

    with Pool(n_batches) as pool:
        # ``pool.map`` returns results in input order, so the concatenated
        # output keeps the original row order.
        _df = pd.concat(pool.map(partial(fn, **kwargs), df_batches))
    return _df
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment