vkumar8282/texas_pudf_part2_spark.ipynb

## texas_pudf_part2_spark.ipynb
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "To import the necessary PySpark components, run the following:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "from pyspark.sql import SparkSession, SQLContext"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Next, we create our Spark session."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# getOrCreate() hands back the already-running session when there is one;\n",
    "# otherwise it starts a new session under this application name.\n",
    "# The chain is wrapped in parentheses so that no line depends on a\n",
    "# trailing backslash continuation.\n",
    "spark = (\n",
    "    SparkSession.builder\n",
    "    .appName('Import Texas PUDF Data')\n",
    "    .getOrCreate()\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Now, we import the data into a DataFrame:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read the tab-separated PUDF extract, taking column names from its first row.\n",
    "data_sp = (\n",
    "    spark.read\n",
    "    .format(\"csv\")\n",
    "    .options(sep=\"\\t\", header=True)\n",
    "    .load(\"C:\\\\Users\\\\Vikas\\\\PUDF_base1_1q2012_tab.txt\")\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Let us now show the first two columns of the first five rows:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+------------+---------+\n",
      "|RECORD_ID   |DISCHARGE|\n",
      "+------------+---------+\n",
      "|120120107482|2012Q1   |\n",
      "|120120107487|2012Q1   |\n",
      "|120120107568|2012Q1   |\n",
      "|120120107381|2012Q1   |\n",
      "|120120107410|2012Q1   |\n",
      "+------------+---------+\n",
      "only showing top 5 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Print five rows of the two selected columns; truncate=False keeps each\n",
    "# value at full width.\n",
    "(\n",
    "    data_sp\n",
    "    .select('RECORD_ID', 'DISCHARGE')\n",
    "    .show(n=5, truncate=False)\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Let's now measure how long it takes to import the data via Pandas."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "35.551525896599244"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import pandas as pd\n",
    "import timeit\n",
    "\n",
    "\n",
    "def load_with_pandas():\n",
    "    \"\"\"Parse the whole tab-delimited PUDF file with every column read as str.\"\"\"\n",
    "    return pd.read_csv(\n",
    "        \"C:\\\\Users\\\\Vikas\\\\PUDF_base1_1q2012_tab.txt\",\n",
    "        delimiter=\"\\t\",\n",
    "        header=0,\n",
    "        dtype=\"str\",\n",
    "    )\n",
    "\n",
    "\n",
    "# timeit accepts a callable directly, so the statement does not have to be\n",
    "# assembled as a string with nested escaping. number=1 times a single run;\n",
    "# the value returned is the elapsed time in seconds.\n",
    "timeit.timeit(load_with_pandas, number=1)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "It takes 35 seconds in Pandas (on my laptop).  \n",
    "\n",
    "We now benchmark the data import in Spark:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1.9205595757441074"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "def load_with_spark():\n",
    "    \"\"\"Obtain the Spark session and define a DataFrame over the PUDF file.\n",
    "\n",
    "    Only the DataFrame definition is timed: no action (such as count())\n",
    "    is run, so Spark is not forced to parse the whole file here.\n",
    "    \"\"\"\n",
    "    # Imported inside the function so the timed work matches a standalone run.\n",
    "    from pyspark.sql import SparkSession\n",
    "\n",
    "    session = SparkSession.builder.appName(\"test\").getOrCreate()\n",
    "    return session.read.load(\n",
    "        \"C:\\\\Users\\\\Vikas\\\\PUDF_base1_1q2012_tab.txt\",\n",
    "        format=\"csv\",\n",
    "        sep=\"\\t\",\n",
    "        header=True,\n",
    "    )\n",
    "\n",
    "\n",
    "# timeit accepts a callable directly, so the statement does not have to be\n",
    "# assembled as a string with nested escaping. number=1 times a single run.\n",
    "timeit.timeit(load_with_spark, number=1)"
   ]
  },
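  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "It takes ~ 2 seconds using Spark. Keep in mind that the two timings are not directly comparable: Pandas parses the entire file into memory, whereas Spark evaluates lazily, so `spark.read.load` only defines the DataFrame (reading the header for the column names) and defers the full parse until an action such as `count()` is run."
   ]
  }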
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
	{
	"cells": [
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"To import the necessary PySpark components, run the following:"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 2,
	"metadata": {},
	"outputs": [],
	"source": [
	"from pyspark.sql import SparkSession, SQLContext"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"Next, we create our Spark session."
	]
	},
	{
	"cell_type": "code",
	"execution_count": 4,
	"metadata": {},
	"outputs": [],
	"source": [
	"# getOrCreate() hands back the already-running session when there is one;\n",
	"# otherwise it starts a new session under this application name.\n",
	"# The chain is wrapped in parentheses so that no line depends on a\n",
	"# trailing backslash continuation.\n",
	"spark = (\n",
	"    SparkSession.builder\n",
	"    .appName('Import Texas PUDF Data')\n",
	"    .getOrCreate()\n",
	")"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"Now, we import the data into a DataFrame:"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 5,
	"metadata": {},
	"outputs": [],
	"source": [
	"# Read the tab-separated PUDF extract, taking column names from its first row.\n",
	"data_sp = (\n",
	"    spark.read\n",
	"    .format(\"csv\")\n",
	"    .options(sep=\"\\t\", header=True)\n",
	"    .load(\"C:\\\\Users\\\\Vikas\\\\PUDF_base1_1q2012_tab.txt\")\n",
	")"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"Let us now show the first two columns of the first five rows:"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 22,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"+------------+---------+\n",
	"|RECORD_ID   |DISCHARGE|\n",
	"+------------+---------+\n",
	"|120120107482|2012Q1   |\n",
	"|120120107487|2012Q1   |\n",
	"|120120107568|2012Q1   |\n",
	"|120120107381|2012Q1   |\n",
	"|120120107410|2012Q1   |\n",
	"+------------+---------+\n",
	"only showing top 5 rows\n",
	"\n"
	]
	}
	],
	"source": [
	"# Print five rows of the two selected columns; truncate=False keeps each\n",
	"# value at full width.\n",
	"(\n",
	"    data_sp\n",
	"    .select('RECORD_ID', 'DISCHARGE')\n",
	"    .show(n=5, truncate=False)\n",
	")"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"Let's now measure how long it takes to import the data via Pandas."
	]
	},
	{
	"cell_type": "code",
	"execution_count": 12,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"35.551525896599244"
	]
	},
	"execution_count": 12,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"import pandas as pd\n",
	"import timeit\n",
	"\n",
	"\n",
	"def load_with_pandas():\n",
	"    \"\"\"Parse the whole tab-delimited PUDF file with every column read as str.\"\"\"\n",
	"    return pd.read_csv(\n",
	"        \"C:\\\\Users\\\\Vikas\\\\PUDF_base1_1q2012_tab.txt\",\n",
	"        delimiter=\"\\t\",\n",
	"        header=0,\n",
	"        dtype=\"str\",\n",
	"    )\n",
	"\n",
	"\n",
	"# timeit accepts a callable directly, so the statement does not have to be\n",
	"# assembled as a string with nested escaping. number=1 times a single run;\n",
	"# the value returned is the elapsed time in seconds.\n",
	"timeit.timeit(load_with_pandas, number=1)"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"It takes 35 seconds in Pandas (on my laptop). \n",
	"\n",
	"We now benchmark the data import in Spark:"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 14,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"1.9205595757441074"
	]
	},
	"execution_count": 14,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"def load_with_spark():\n",
	"    \"\"\"Obtain the Spark session and define a DataFrame over the PUDF file.\n",
	"\n",
	"    Only the DataFrame definition is timed: no action (such as count())\n",
	"    is run, so Spark is not forced to parse the whole file here.\n",
	"    \"\"\"\n",
	"    # Imported inside the function so the timed work matches a standalone run.\n",
	"    from pyspark.sql import SparkSession\n",
	"\n",
	"    session = SparkSession.builder.appName(\"test\").getOrCreate()\n",
	"    return session.read.load(\n",
	"        \"C:\\\\Users\\\\Vikas\\\\PUDF_base1_1q2012_tab.txt\",\n",
	"        format=\"csv\",\n",
	"        sep=\"\\t\",\n",
	"        header=True,\n",
	"    )\n",
	"\n",
	"\n",
	"# timeit accepts a callable directly, so the statement does not have to be\n",
	"# assembled as a string with nested escaping. number=1 times a single run.\n",
	"timeit.timeit(load_with_spark, number=1)"
	]
	},
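	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"It takes ~ 2 seconds using Spark. Keep in mind that the two timings are not directly comparable: Pandas parses the entire file into memory, whereas Spark evaluates lazily, so `spark.read.load` only defines the DataFrame (reading the header for the column names) and defers the full parse until an action such as `count()` is run."
	]
	}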
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 3",
	"language": "python",
	"name": "python3"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.6.5"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 2
	}