pramodvspk/sparkSQL.py

## sparkSQL.py
from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext, Row

#Setting the application name and creating the spark context and sql context objects.
conf = SparkConf().setAppName("SparkSQL")
sc = SparkContext(conf = conf)
sqlContext = SQLContext(sc)

#The URL where the mysql database is hosted
jdbcUrl = 'jdbcUrl'

#The list of all the dataframes which will be created
dataframes_list = ['df_departments', 'df_categories', 'df_products', 'df_orders', 'df_order_items']
#The list of all the tables from which the data has to be loaded from
sql_tables = ['departments', 'categories', 'products', 'orders', 'order_items']

for i in range (0,5):
  dataframes_list[i] = sqlContext.read.format('jdbc').options(url=jdbcUrl, user='user', password='password',dbtable=sql_tables[i]).load()
  dataframes_list[i].registerTempTable(sql_tables[i])


#Executing the query which returns the revenue generated by each department in a month
agg_results = sqlContext.sql("SELECT substr(o.order_date, 1,7) order_month, d.department_name, sum(oi.order_item_subtotal) FROM departments d join categories c ON d.department_id = c.category_department_id JOIN products p ON c.category_id = p.product_category_id JOIN order_items oi ON p.product_id = oi.order_item_product_id JOIN orders o ON o.order_id = oi.order_item_order_id GROUP BY substr(o.order_date, 1,7), d.department_name ORDER BY order_month, department_name")

for res in agg_results.collect():
	print(res)

#Sample results:
'''
Row(order_month=u'2013-07', department_name=u'Golf', _c2=101087.26000000005)
Row(order_month=u'2013-07', department_name=u'Outdoors', _c2=20877.519999999993)
Row(order_month=u'2013-07', department_name=u'Testing Merge', _c2=5622.149999999998)
Row(order_month=u'2013-07', department_name=u'data', _c2=163108.2700000002)
Row(order_month=u'2013-07', department_name=u'fanshoppy', _c2=389162.09)
Row(order_month=u'2013-07', department_name=u'footwear', _c2=84624.92000000001)
Row(order_month=u'2013-08', department_name=u'Golf', _c2=391535.7299999997)
Row(order_month=u'2013-08', department_name=u'Outdoors', _c2=82849.71999999997)
Row(order_month=u'2013-08', department_name=u'Testing Merge', _c2=23719.610000000008)
Row(order_month=u'2013-08', department_name=u'data', _c2=598981.2099999998)
Row(order_month=u'2013-08', department_name=u'fanshoppy', _c2=1410128.6399999997)
Row(order_month=u'2013-08', department_name=u'footwear', _c2=321443.78999999986)
Row(order_month=u'2013-09', department_name=u'Golf', _c2=388137.44)
Row(order_month=u'2013-09', department_name=u'Outdoors', _c2=84080.7)
Row(order_month=u'2013-09', department_name=u'Testing Merge', _c2=24377.450000000004)
Row(order_month=u'2013-09', department_name=u'data', _c2=615739.5500000002)
Row(order_month=u'2013-09', department_name=u'fanshoppy', _c2=1452927.6799999995)
Row(order_month=u'2013-09', department_name=u'footwear', _c2=369264.4499999999)

#The execution of the query using different engines has taken the following times
1. Executing the query using mySQL engine : 0.49 seconds
2. Executing the query on hive using Map Reduce engine which exececutes the query by running Map Reduce jobs in background: 52.085 seconds
3. Executing the query on hive using Tez as execution engine in background: 25.63 seconds
4. Executing the query using SparkSQL by loading the tables as Data Frames, it uses Apache Spark catalyst optimizer for running the query: 4.2 seconds
'''
	from pyspark import SparkConf, SparkContext
	from pyspark.sql import SQLContext, Row

	#Setting the application name and creating the spark context and sql context objects.
	conf = SparkConf().setAppName("SparkSQL")
	sc = SparkContext(conf = conf)
	sqlContext = SQLContext(sc)

	#The URL where the mysql database is hosted
	jdbcUrl = 'jdbcUrl'

	#The list of all the dataframes which will be created
	dataframes_list = ['df_departments', 'df_categories', 'df_products', 'df_orders', 'df_order_items']
	#The list of all the tables from which the data has to be loaded from
	sql_tables = ['departments', 'categories', 'products', 'orders', 'order_items']

	for i in range (0,5):
	dataframes_list[i] = sqlContext.read.format('jdbc').options(url=jdbcUrl, user='user', password='password',dbtable=sql_tables[i]).load()
	dataframes_list[i].registerTempTable(sql_tables[i])


	#Executing the query which returns the revenue generated by each department in a month
	agg_results = sqlContext.sql("SELECT substr(o.order_date, 1,7) order_month, d.department_name, sum(oi.order_item_subtotal) FROM departments d join categories c ON d.department_id = c.category_department_id JOIN products p ON c.category_id = p.product_category_id JOIN order_items oi ON p.product_id = oi.order_item_product_id JOIN orders o ON o.order_id = oi.order_item_order_id GROUP BY substr(o.order_date, 1,7), d.department_name ORDER BY order_month, department_name")

	for res in agg_results.collect():
	print(res)

	#Sample results:
	'''
	Row(order_month=u'2013-07', department_name=u'Golf', _c2=101087.26000000005)
	Row(order_month=u'2013-07', department_name=u'Outdoors', _c2=20877.519999999993)
	Row(order_month=u'2013-07', department_name=u'Testing Merge', _c2=5622.149999999998)
	Row(order_month=u'2013-07', department_name=u'data', _c2=163108.2700000002)
	Row(order_month=u'2013-07', department_name=u'fanshoppy', _c2=389162.09)
	Row(order_month=u'2013-07', department_name=u'footwear', _c2=84624.92000000001)
	Row(order_month=u'2013-08', department_name=u'Golf', _c2=391535.7299999997)
	Row(order_month=u'2013-08', department_name=u'Outdoors', _c2=82849.71999999997)
	Row(order_month=u'2013-08', department_name=u'Testing Merge', _c2=23719.610000000008)
	Row(order_month=u'2013-08', department_name=u'data', _c2=598981.2099999998)
	Row(order_month=u'2013-08', department_name=u'fanshoppy', _c2=1410128.6399999997)
	Row(order_month=u'2013-08', department_name=u'footwear', _c2=321443.78999999986)
	Row(order_month=u'2013-09', department_name=u'Golf', _c2=388137.44)
	Row(order_month=u'2013-09', department_name=u'Outdoors', _c2=84080.7)
	Row(order_month=u'2013-09', department_name=u'Testing Merge', _c2=24377.450000000004)
	Row(order_month=u'2013-09', department_name=u'data', _c2=615739.5500000002)
	Row(order_month=u'2013-09', department_name=u'fanshoppy', _c2=1452927.6799999995)
	Row(order_month=u'2013-09', department_name=u'footwear', _c2=369264.4499999999)

	#The execution of the query using different engines has taken the following times
	1. Executing the query using mySQL engine : 0.49 seconds
	2. Executing the query on hive using Map Reduce engine which exececutes the query by running Map Reduce jobs in background: 52.085 seconds
	3. Executing the query on hive using Tez as execution engine in background: 25.63 seconds
	4. Executing the query using SparkSQL by loading the tables as Data Frames, it uses Apache Spark catalyst optimizer for running the query: 4.2 seconds
	'''