-
-
Save geosmart/28d313d5d1d468ffe72f0390ed900fb0 to your computer and use it in GitHub Desktop.
Parallelize apply after pandas groupby using PySpark
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd

# Spark context: only one SparkContext may exist per process; it is created
# here at import time so applyParallel below can distribute work through it.
import pyspark

sc = pyspark.SparkContext()
# apply parallel | |
def applyParallel(dfGrouped, func):
    """Apply ``func`` to every group of a pandas GroupBy in parallel via Spark.

    Parameters
    ----------
    dfGrouped : pandas GroupBy (or any iterable of ``(name, group)`` pairs,
        where each ``group`` is a pandas DataFrame).
    func : callable
        Takes one group DataFrame and returns a pandas Series
        (one row of the final result).

    Returns
    -------
    pandas.DataFrame
        One row per group, indexed by the group names.
    """
    # Single pass over the GroupBy: collect names and group frames together
    # (the original iterated dfGrouped twice, once per list).
    pairs = list(dfGrouped)
    names = [name for name, _ in pairs]
    groups = [group for _, group in pairs]
    # Ship the per-group DataFrames to the cluster and apply func to each.
    # NOTE: relies on the module-level SparkContext `sc`.
    ret_list = sc.parallelize(groups).map(func).collect()
    # Each result is a Series; transpose each into a one-row frame and stack.
    result = pd.concat([s.to_frame().transpose() for s in ret_list])
    result.index = names
    return result
# Example: | |
########## | |
def f(g):
    """Example per-group reducer: report the group's row and column counts."""
    nrows, ncols = g.shape
    return pd.Series({'nrows': nrows, 'ncols': ncols})
# Demo: group a small frame by column 'a' and compute per-group shapes in parallel.
pepe = pd.DataFrame({'a': ['q1', 'q1', 'q2', 'q3', 'q4', 'q4', 'q4', 'q3'],
                     'b': [3, 5, 3, 6, 2, 4, 3, 5]})
# juan has one row per distinct value of 'a', indexed by that value.
juan = applyParallel(pepe.groupby('a'), f)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment