# frank-leap/yelp_pyspark_df.py

## yelp_pyspark_df.py
# copy the Hive configuration file hive-site.xml to the spark configuration folder
# sudo cp /etc/hive/conf.dist/hive-site.xml /usr/lib/spark/conf/

# launch pyspark with the spark-csv package (note: version 1.2.0 has some issues, so 1.3.0 is recommended)
# PYSPARK_DRIVER_PYTHON=ipython pyspark --packages com.databricks:spark-csv_2.10:1.3.0

# sanity check: build a tiny one-row DataFrame to confirm DataFrames are working
sample_rows = [("somekey", 1)]
sqlCtx.createDataFrame(sample_rows)

# load the yelp dataset from the local CSV file through the spark-csv data source,
# with the "header" and "inferSchema" options switched on
yelp_csv_path = 'file:////usr/lib/hue/apps/search/examples/collections/solr_configs_yelp_demo/index_data.csv'
yelp_df = sqlCtx.load(
    source='com.databricks.spark.csv',
    header='true',
    inferSchema='true',
    path=yelp_csv_path
)

# 1) mean of the "cool" column over the entire dataset
cool_only = yelp_df.select('cool')
cool_only.agg({'cool': 'mean'}).collect()

# 2) average of the "cool" column for venues with 4 "stars",
#    restricted to records with a "review count" of 10 or more.
#    The query prints one average per "stars" value; read the answer off the stars=4 row.
min_10_reviews = yelp_df.filter(yelp_df.review_count >= 10)
min_10_reviews.groupBy(yelp_df.stars).avg('cool').show()

# 3) average of the "cool" column for venues with 5 "stars",
#    restricted to records with a "review count" of 10 or more whose venue is still open.
#    The query prints one average per "stars" value; read the answer off the stars=5 row.
has_10_reviews = yelp_df.review_count >= 10
still_open = yelp_df.open == 'True'
open_min_10_reviews = yelp_df.filter(has_10_reviews).filter(still_open)
open_min_10_reviews.groupBy(yelp_df.stars).avg('cool').show()

# 4) calculate state with 3rd highest number of reviews for venues with 5 "stars"
#    for records with a "review count" of 10 or more and records for which the venue is still open
from pyspark.sql.functions import asc, desc
from pyspark.sql.functions import sum as sum_  # aliased so the builtin sum() is not shadowed

# Keep only the records the task asks about; the stars == 5 restriction was
# previously missing, so the totals covered venues of every rating.
five_star_open = (
    yelp_df
    .filter(yelp_df.stars == 5)
    .filter(yelp_df.review_count >= 10)
    .filter(yelp_df.open == 'True')
)

# Give the aggregate an explicit name. The previous sort key,
# 'SUM(review_count#16)', hard-coded the auto-generated column name (including
# what looks like an internal '#16' id), which is not guaranteed to be the same
# in another session or Spark version.
reviews_by_state = five_star_open.groupBy(yelp_df.state).agg(
    sum_('review_count').alias('total_reviews')
)

# States are listed from most to fewest reviews; the answer is the third row.
reviews_by_state.orderBy(desc('total_reviews')).show()

# 5) maximum number of "reviews" per venue for the whole dataset:
#    count the rows for every (business_id, review_count) pair and return them
#    sorted by that count (orderBy's default order), so the largest count comes last
per_venue = yelp_df.groupBy(yelp_df.business_id, yelp_df.review_count)
rows_per_venue = per_venue.count()
rows_per_venue.orderBy('count').collect()
	# copy the Hive configuration file hive-site.xml to the spark configuration folder
	# sudo cp /etc/hive/conf.dist/hive-site.xml /usr/lib/spark/conf/

	# launch pyspark with the spark-csv package (note: version 1.2.0 has some issues, so 1.3.0 is recommended)
	# PYSPARK_DRIVER_PYTHON=ipython pyspark --packages com.databricks:spark-csv_2.10:1.3.0

	# sanity check: build a tiny one-row DataFrame to confirm DataFrames are working
	sample_rows = [("somekey", 1)]
	sqlCtx.createDataFrame(sample_rows)

	# load the yelp dataset from the local CSV file through the spark-csv data source,
	# with the "header" and "inferSchema" options switched on
	yelp_csv_path = 'file:////usr/lib/hue/apps/search/examples/collections/solr_configs_yelp_demo/index_data.csv'
	yelp_df = sqlCtx.load(source='com.databricks.spark.csv', header='true', inferSchema='true', path=yelp_csv_path)

	# 1) mean of the "cool" column over the entire dataset
	cool_only = yelp_df.select('cool')
	cool_only.agg({'cool': 'mean'}).collect()

	# 2) average of the "cool" column for venues with 4 "stars",
	# restricted to records with a "review count" of 10 or more.
	# The query prints one average per "stars" value; read the answer off the stars=4 row.
	min_10_reviews = yelp_df.filter(yelp_df.review_count >= 10)
	min_10_reviews.groupBy(yelp_df.stars).avg('cool').show()

	# 3) average of the "cool" column for venues with 5 "stars",
	# restricted to records with a "review count" of 10 or more whose venue is still open.
	# The query prints one average per "stars" value; read the answer off the stars=5 row.
	has_10_reviews = yelp_df.review_count >= 10
	still_open = yelp_df.open == 'True'
	open_min_10_reviews = yelp_df.filter(has_10_reviews).filter(still_open)
	open_min_10_reviews.groupBy(yelp_df.stars).avg('cool').show()

	# 4) calculate state with 3rd highest number of reviews for venues with 5 "stars"
	# for records with a "review count" of 10 or more and records for which the venue is still open
	from pyspark.sql.functions import asc, desc
	from pyspark.sql.functions import sum as sum_  # aliased so the builtin sum() is not shadowed

	# Keep only the records the task asks about; the stars == 5 restriction was
	# previously missing, so the totals covered venues of every rating.
	five_star_only = yelp_df.filter(yelp_df.stars == 5)
	five_star_open = five_star_only.filter(yelp_df.review_count >= 10).filter(yelp_df.open == 'True')

	# Give the aggregate an explicit name. The previous sort key,
	# 'SUM(review_count#16)', hard-coded the auto-generated column name (including
	# what looks like an internal '#16' id), which is not guaranteed to be the same
	# in another session or Spark version.
	total_reviews = sum_('review_count').alias('total_reviews')
	reviews_by_state = five_star_open.groupBy(yelp_df.state).agg(total_reviews)

	# States are listed from most to fewest reviews; the answer is the third row.
	reviews_by_state.orderBy(desc('total_reviews')).show()

	# 5) maximum number of "reviews" per venue for the whole dataset:
	# count the rows for every (business_id, review_count) pair and return them
	# sorted by that count (orderBy's default order), so the largest count comes last
	per_venue = yelp_df.groupBy(yelp_df.business_id, yelp_df.review_count)
	rows_per_venue = per_venue.count()
	rows_per_venue.orderBy('count').collect()