Created
October 9, 2016 01:47
-
-
Save pramodvspk/ac6320ff6ad5df79b98788630aef4400 to your computer and use it in GitHub Desktop.
Benchmarking the run time of SQL queries on sql engine and Big Data systems
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext, Row

# Benchmark script: run the same revenue-per-department-per-month query via
# SparkSQL and compare its wall-clock time against MySQL, Hive-on-MapReduce
# and Hive-on-Tez (timings recorded at the bottom of this file).

# Set the application name and create the Spark context and SQL context objects.
conf = SparkConf().setAppName("SparkSQL")
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)

# The URL where the MySQL database is hosted (placeholder -- substitute a real
# JDBC URL such as jdbc:mysql://host:3306/retail_db before running).
jdbcUrl = 'jdbcUrl'

# The MySQL tables to load; each one is registered as a temp table under its
# own name so the SQL query below can reference it directly.
sql_tables = ['departments', 'categories', 'products', 'orders', 'order_items']

# Load each table over JDBC into a DataFrame and register it for Spark SQL.
# A dict keyed by table name replaces the original parallel list of
# placeholder strings, and iterating sql_tables directly removes the
# hard-coded range(0, 5) that would break if a table were added or removed.
dataframes = {}
for table in sql_tables:
    df = (sqlContext.read.format('jdbc')
          .options(url=jdbcUrl, user='user', password='password', dbtable=table)
          .load())
    # NOTE: registerTempTable is the pre-Spark-2.0 API kept for compatibility
    # with the SQLContext used above; in Spark >= 2.0 use createOrReplaceTempView.
    df.registerTempTable(table)
    dataframes[table] = df

# Execute the query which returns the revenue generated by each department in a month.
agg_results = sqlContext.sql("SELECT substr(o.order_date, 1,7) order_month, d.department_name, sum(oi.order_item_subtotal) FROM departments d join categories c ON d.department_id = c.category_department_id JOIN products p ON c.category_id = p.product_category_id JOIN order_items oi ON p.product_id = oi.order_item_product_id JOIN orders o ON o.order_id = oi.order_item_order_id GROUP BY substr(o.order_date, 1,7), d.department_name ORDER BY order_month, department_name")

# collect() pulls the aggregated rows to the driver; fine here because the
# grouped result set is small (one row per department per month).
for res in agg_results.collect():
    print(res)

#Sample results:
'''
Row(order_month=u'2013-07', department_name=u'Golf', _c2=101087.26000000005)
Row(order_month=u'2013-07', department_name=u'Outdoors', _c2=20877.519999999993)
Row(order_month=u'2013-07', department_name=u'Testing Merge', _c2=5622.149999999998)
Row(order_month=u'2013-07', department_name=u'data', _c2=163108.2700000002)
Row(order_month=u'2013-07', department_name=u'fanshoppy', _c2=389162.09)
Row(order_month=u'2013-07', department_name=u'footwear', _c2=84624.92000000001)
Row(order_month=u'2013-08', department_name=u'Golf', _c2=391535.7299999997)
Row(order_month=u'2013-08', department_name=u'Outdoors', _c2=82849.71999999997)
Row(order_month=u'2013-08', department_name=u'Testing Merge', _c2=23719.610000000008)
Row(order_month=u'2013-08', department_name=u'data', _c2=598981.2099999998)
Row(order_month=u'2013-08', department_name=u'fanshoppy', _c2=1410128.6399999997)
Row(order_month=u'2013-08', department_name=u'footwear', _c2=321443.78999999986)
Row(order_month=u'2013-09', department_name=u'Golf', _c2=388137.44)
Row(order_month=u'2013-09', department_name=u'Outdoors', _c2=84080.7)
Row(order_month=u'2013-09', department_name=u'Testing Merge', _c2=24377.450000000004)
Row(order_month=u'2013-09', department_name=u'data', _c2=615739.5500000002)
Row(order_month=u'2013-09', department_name=u'fanshoppy', _c2=1452927.6799999995)
Row(order_month=u'2013-09', department_name=u'footwear', _c2=369264.4499999999)
#The execution of the query using different engines has taken the following times
1. Executing the query using mySQL engine : 0.49 seconds
2. Executing the query on hive using Map Reduce engine which executes the query by running Map Reduce jobs in background: 52.085 seconds
3. Executing the query on hive using Tez as execution engine in background: 25.63 seconds
4. Executing the query using SparkSQL by loading the tables as Data Frames, it uses Apache Spark catalyst optimizer for running the query: 4.2 seconds
'''
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment