Skip to content

Instantly share code, notes, and snippets.

@pramodvspk
Created October 9, 2016 01:47
Show Gist options
  • Save pramodvspk/ac6320ff6ad5df79b98788630aef4400 to your computer and use it in GitHub Desktop.
Save pramodvspk/ac6320ff6ad5df79b98788630aef4400 to your computer and use it in GitHub Desktop.
Benchmarking the run time of SQL queries on sql engine and Big Data systems
from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext, Row
#Setting the application name and creating the spark context and sql context objects.
conf = SparkConf().setAppName("SparkSQL")
sc = SparkContext(conf = conf)
sqlContext = SQLContext(sc)
#The URL where the mysql database is hosted
jdbcUrl = 'jdbcUrl'
#The list of all the dataframes which will be created
dataframes_list = ['df_departments', 'df_categories', 'df_products', 'df_orders', 'df_order_items']
#The list of all the tables from which the data has to be loaded from
sql_tables = ['departments', 'categories', 'products', 'orders', 'order_items']
for i in range (0,5):
dataframes_list[i] = sqlContext.read.format('jdbc').options(url=jdbcUrl, user='user', password='password',dbtable=sql_tables[i]).load()
dataframes_list[i].registerTempTable(sql_tables[i])
#Executing the query which returns the revenue generated by each department in a month
agg_results = sqlContext.sql("SELECT substr(o.order_date, 1,7) order_month, d.department_name, sum(oi.order_item_subtotal) FROM departments d join categories c ON d.department_id = c.category_department_id JOIN products p ON c.category_id = p.product_category_id JOIN order_items oi ON p.product_id = oi.order_item_product_id JOIN orders o ON o.order_id = oi.order_item_order_id GROUP BY substr(o.order_date, 1,7), d.department_name ORDER BY order_month, department_name")
for res in agg_results.collect():
print(res)
#Sample results:
'''
Row(order_month=u'2013-07', department_name=u'Golf', _c2=101087.26000000005)
Row(order_month=u'2013-07', department_name=u'Outdoors', _c2=20877.519999999993)
Row(order_month=u'2013-07', department_name=u'Testing Merge', _c2=5622.149999999998)
Row(order_month=u'2013-07', department_name=u'data', _c2=163108.2700000002)
Row(order_month=u'2013-07', department_name=u'fanshoppy', _c2=389162.09)
Row(order_month=u'2013-07', department_name=u'footwear', _c2=84624.92000000001)
Row(order_month=u'2013-08', department_name=u'Golf', _c2=391535.7299999997)
Row(order_month=u'2013-08', department_name=u'Outdoors', _c2=82849.71999999997)
Row(order_month=u'2013-08', department_name=u'Testing Merge', _c2=23719.610000000008)
Row(order_month=u'2013-08', department_name=u'data', _c2=598981.2099999998)
Row(order_month=u'2013-08', department_name=u'fanshoppy', _c2=1410128.6399999997)
Row(order_month=u'2013-08', department_name=u'footwear', _c2=321443.78999999986)
Row(order_month=u'2013-09', department_name=u'Golf', _c2=388137.44)
Row(order_month=u'2013-09', department_name=u'Outdoors', _c2=84080.7)
Row(order_month=u'2013-09', department_name=u'Testing Merge', _c2=24377.450000000004)
Row(order_month=u'2013-09', department_name=u'data', _c2=615739.5500000002)
Row(order_month=u'2013-09', department_name=u'fanshoppy', _c2=1452927.6799999995)
Row(order_month=u'2013-09', department_name=u'footwear', _c2=369264.4499999999)
#The execution of the query using different engines has taken the following times
1. Executing the query using mySQL engine : 0.49 seconds
2. Executing the query on hive using Map Reduce engine which exececutes the query by running Map Reduce jobs in background: 52.085 seconds
3. Executing the query on hive using Tez as execution engine in background: 25.63 seconds
4. Executing the query using SparkSQL by loading the tables as Data Frames, it uses Apache Spark catalyst optimizer for running the query: 4.2 seconds
'''
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment