Skip to content

Instantly share code, notes, and snippets.

@rjurney
Created April 13, 2020 20:05
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save rjurney/5cb1fe12cab88ad16fb99c13aed8af72 to your computer and use it in GitHub Desktop.
Save rjurney/5cb1fe12cab88ad16fb99c13aed8af72 to your computer and use it in GitHub Desktop.
How to run a method on a field of a pandas DataFrame and set the result to another field
def process_split(df: pd.DataFrame, f: types.FunctionType, in_key: str, out_key: str):
    """Apply a function to one column of a DataFrame and store the result in another column.

    ``f`` is called once per row with that row's ``in_key`` value, in the
    frame's current row order. The input frame is not modified.

    Args:
        df: The DataFrame (or chunk of a larger DataFrame) to process.
        f: Callable taking a single ``in_key`` value and returning the value
            to store under ``out_key``.
        in_key: Name of the column whose values are passed to ``f``.
        out_key: Name of the column that receives the results; an existing
            column with this name is overwritten in the returned frame.

    Returns:
        A new DataFrame with every original column plus ``out_key``, sorted
        by index.

    Raises:
        KeyError: If ``in_key`` is not a column of ``df``.
    """
    # Work on a copy and assign a whole column instead of rebuilding the frame
    # from iterrows(): iterrows() packs each row into a single-dtype Series,
    # which discards the original column dtypes, and rebuilding from an empty
    # row list would drop every column.
    df_out = df.copy()
    df_out[out_key] = [f(value) for value in df[in_key]]
    return df_out.sort_index()
def parallel_apply(
    df: pd.DataFrame,
    f: types.FunctionType,
    in_key: str,
    out_key: str,
    n_cores: int = cpu_count
):
    """Apply a function to one column of a DataFrame using a pool of worker processes.

    The frame is split row-wise into up to ``n_cores`` contiguous chunks, each
    chunk is handed to ``process_split`` in a worker process, and the results
    are concatenated and sorted by index.

    Args:
        df: The DataFrame to process.
        f: Callable taking a single ``in_key`` value and returning the value
            to store under ``out_key``. It is sent to worker processes, so it
            must be picklable (e.g. a module-level function, not a lambda).
        in_key: Name of the column whose values are passed to ``f``.
        out_key: Name of the column that receives the results.
        n_cores: Number of worker processes. May also be a zero-argument
            callable returning a CPU count (the default, ``cpu_count``), in
            which case half of that count is used, with a minimum of one.

    Returns:
        A new DataFrame containing the processed rows, sorted by index.

    Raises:
        ValueError: If ``n_cores`` resolves to less than 1.
    """
    # Resolve the worker count from the *argument*; the default is the
    # cpu_count function itself so the count is read at call time.
    if callable(n_cores):
        n_workers = max(1, n_cores() // 2)
    else:
        n_workers = int(n_cores)
    if n_workers < 1:
        raise ValueError(f"n_cores must be at least 1, got {n_cores!r}")

    # Split row positions rather than the frame itself, then slice with iloc;
    # empty chunks (more workers than rows) are dropped.
    positions = np.array_split(np.arange(len(df)), n_workers)
    chunks = [df.iloc[idx] for idx in positions if len(idx)]
    if not chunks:
        # Nothing to distribute: handle the empty frame in-process.
        return process_split(df, f, in_key, out_key)

    # One argument tuple per chunk, so each worker receives a single DataFrame.
    tasks = [(chunk, f, in_key, out_key) for chunk in chunks]

    # The context manager releases the pool even if a worker raises; starmap
    # blocks until every result is back, so leaving the block is safe.
    with Pool(min(n_workers, len(chunks))) as pool:
        results = pool.starmap(process_split, tasks)

    return pd.concat(results).sort_index()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment