Last active
February 3, 2019 19:52
-
-
Save vkumar8282/9f3e04eaff116987fb44eb166000ba62 to your computer and use it in GitHub Desktop.
Setting up Spark for Jupyter notebook and benchmarking its performance
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"To import the necessary pyspark components type the following: " | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"from pyspark.sql import SparkSession, SQLContext" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Next, we create our Spark session." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"spark = SparkSession \\\n", | |
" .builder \\\n", | |
" .appName('Import Texas PUDF Data') \\\n", | |
" .getOrCreate() \\" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Now, we import the data into a DataFrame:" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"data_sp = spark.read.load(\n", | |
" \"C:\\\\Users\\\\Vikas\\\\PUDF_base1_1q2012_tab.txt\",\n", | |
" format=\"csv\",\n", | |
" sep=\"\\t\",\n", | |
" header=True\n", | |
")" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Let us now show the first two columns of the first five rows:" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 22, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"+------------+---------+\n", | |
"|RECORD_ID |DISCHARGE|\n", | |
"+------------+---------+\n", | |
"|120120107482|2012Q1 |\n", | |
"|120120107487|2012Q1 |\n", | |
"|120120107568|2012Q1 |\n", | |
"|120120107381|2012Q1 |\n", | |
"|120120107410|2012Q1 |\n", | |
"+------------+---------+\n", | |
"only showing top 5 rows\n", | |
"\n" | |
] | |
} | |
], | |
"source": [ | |
"data_sp.select('RECORD_ID','DISCHARGE').show(5, truncate=False)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Let's now measure how long it takes to import the data via Pandas." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 12, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"35.551525896599244" | |
] | |
}, | |
"execution_count": 12, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"import pandas as pd\n", | |
"import timeit\n", | |
"\n", | |
"stmt_pandas = 'import pandas as pd; ' + \\\n", | |
" 'pd.read_csv(\"C:\\\\\\\\Users\\\\\\\\Vikas\\\\\\\\PUDF_base1_1q2012_tab.txt\",' + \\\n", | |
" 'delimiter=\"\\t\",' + \\\n", | |
" 'header=0,' + \\\n", | |
" 'dtype=\"str\"' + \\\n", | |
" ')'\n", | |
"\n", | |
"timeit.timeit(stmt=stmt_pandas, number = 1)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"It takes 35 seconds in Pandas (on my laptop). \n", | |
"\n", | |
"We now benchmark the data import in Spark:" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 14, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"1.9205595757441074" | |
] | |
}, | |
"execution_count": 14, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"stmt_spark = 'from pyspark.sql import SparkSession; ' + \\\n", | |
" 'spark = SparkSession.builder.appName(\"test\").getOrCreate();' + \\\n", | |
" 'spark.read.load(\"C:\\\\\\\\Users\\\\\\\\Vikas\\\\\\\\PUDF_base1_1q2012_tab.txt\",' + \\\n", | |
" 'format=\"csv\",' + \\\n", | |
" 'sep=\"\\t\",' + \\\n", | |
" 'header=True' + \\\n", | |
" ')'\n", | |
"\n", | |
"timeit.timeit(stmt=stmt_spark, number = 1)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"It takes ~ 2 seconds using Spark." | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.6.5" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment