-
-
Save geosmart/28d313d5d1d468ffe72f0390ed900fb0 to your computer and use it in GitHub Desktop.
Parallelize apply after pandas groupby using PySpark
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd

# Spark context: only one SparkContext may exist per process; it is created
# here at import time so applyParallel below can distribute work through it.
import pyspark

sc = pyspark.SparkContext()
# apply parallel | |
def applyParallel(dfGrouped, func):
    """Apply ``func`` to every group of a pandas GroupBy in parallel via Spark.

    Parameters
    ----------
    dfGrouped : pandas GroupBy (or any iterable of ``(name, group)`` pairs,
        where each ``group`` is a pandas DataFrame).
    func : callable
        Takes one group DataFrame and returns a pandas Series
        (one row of the final result).

    Returns
    -------
    pandas.DataFrame
        One row per group, indexed by the group names.
    """
    # Single pass over the GroupBy: collect names and group frames together
    # (the original iterated dfGrouped twice, once per list).
    pairs = list(dfGrouped)
    names = [name for name, _ in pairs]
    groups = [group for _, group in pairs]
    # Ship the per-group DataFrames to the cluster and apply func to each.
    # NOTE: relies on the module-level SparkContext `sc`.
    ret_list = sc.parallelize(groups).map(func).collect()
    # Each result is a Series; transpose each into a one-row frame and stack.
    result = pd.concat([s.to_frame().transpose() for s in ret_list])
    result.index = names
    return result
# Example: | |
########## | |
def f(g):
    """Example per-group reducer: report the group's row and column counts."""
    nrows, ncols = g.shape
    return pd.Series({'nrows': nrows, 'ncols': ncols})
# Demo: group a small frame by column 'a' and compute per-group shapes in parallel.
pepe = pd.DataFrame({'a': ['q1', 'q1', 'q2', 'q3', 'q4', 'q4', 'q4', 'q3'],
                     'b': [3, 5, 3, 6, 2, 4, 3, 5]})
# juan has one row per distinct value of 'a', indexed by that value.
juan = applyParallel(pepe.groupby('a'), f)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment