parente/spark-csv.ipynb

## spark-csv.ipynb
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Use spark-csv from Jupyter Notebook\n",
    "\n",
    "Tested against the `jupyter/pyspark-notebook:2d125a7161b5` Docker image. The same trick should work for other languages with minor changes.\n",
    "\n",
    "Note: This will **not** work on try.jupyter.org / tmpnb.org, which disallow outgoing network connections. (The Spark driver needs to fetch the spark-csv package!)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Get a sample CSV to work with."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "converted 'https://raw.githubusercontent.com/jupyter/docker-demo-images/master/datasets/datasets/cars.csv' (ANSI_X3.4-1968) -> 'https://raw.githubusercontent.com/jupyter/docker-demo-images/master/datasets/datasets/cars.csv' (UTF-8)\n",
      "--2016-01-05 04:24:19--  https://raw.githubusercontent.com/jupyter/docker-demo-images/master/datasets/datasets/cars.csv\n",
      "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 23.235.46.133\n",
      "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|23.235.46.133|:443... connected.\n",
      "HTTP request sent, awaiting response... 200 OK\n",
      "Length: 552 [text/plain]\n",
      "Saving to: 'cars.csv.1'\n",
      "\n",
      "cars.csv.1          100%[=====================>]     552  --.-KB/s   in 0s     \n",
      "\n",
      "2016-01-05 04:24:20 (7.67 MB/s) - 'cars.csv.1' saved [552/552]\n",
      "\n"
     ]
    }
   ],
   "source": [
    "!wget https://raw.githubusercontent.com/jupyter/docker-demo-images/master/datasets/datasets/cars.csv"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Configure the pyspark submit args (the `PYSPARK_SUBMIT_ARGS` environment variable) to include the spark-csv package **before** creating the `SparkContext`. Don't forget the `pyspark-shell` arg at the end for Spark 1.4+."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import os"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages com.databricks:spark-csv_2.10:1.3.0 pyspark-shell'"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Now create the context and try to read a CSV."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import pyspark"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "sc = pyspark.SparkContext()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from pyspark.sql import SQLContext"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "sqlContext = SQLContext(sc)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/usr/local/spark/python/pyspark/sql/context.py:507: UserWarning: load is deprecated. Use read.load() instead.\n",
      "  warnings.warn(\"load is deprecated. Use read.load() instead.\")\n"
     ]
    }
   ],
   "source": [
    "df = sqlContext.load(source=\"com.databricks.spark.csv\", header='true', inferSchema='true', path='cars.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+---+-----+----+\n",
      "|   |speed|dist|\n",
      "+---+-----+----+\n",
      "|  1|    4|   2|\n",
      "|  2|    4|  10|\n",
      "|  3|    7|   4|\n",
      "|  4|    7|  22|\n",
      "|  5|    8|  16|\n",
      "|  6|    9|  10|\n",
      "|  7|   10|  18|\n",
      "|  8|   10|  26|\n",
      "|  9|   10|  34|\n",
      "| 10|   11|  17|\n",
      "| 11|   11|  28|\n",
      "| 12|   12|  14|\n",
      "| 13|   12|  20|\n",
      "| 14|   12|  24|\n",
      "| 15|   12|  28|\n",
      "| 16|   13|  26|\n",
      "| 17|   13|  34|\n",
      "| 18|   13|  34|\n",
      "| 19|   13|  46|\n",
      "| 20|   14|  26|\n",
      "+---+-----+----+\n",
      "only showing top 20 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "df.show()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.4.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
	{
	"cells": [
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"# Use spark-csv from Jupyter Notebook\n",
	"\n",
	"Tested against the `jupyter/pyspark-notebook:2d125a7161b5` Docker image. The same trick should work for other languages with minor changes.\n",
	"\n",
	"Note: This will **not** work on try.jupyter.org / tmpnb.org, which disallow outgoing network connections. (The Spark driver needs to fetch the spark-csv package!)"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"Get a sample CSV to work with."
	]
	},
	{
	"cell_type": "code",
	"execution_count": 1,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"converted 'https://raw.githubusercontent.com/jupyter/docker-demo-images/master/datasets/datasets/cars.csv' (ANSI_X3.4-1968) -> 'https://raw.githubusercontent.com/jupyter/docker-demo-images/master/datasets/datasets/cars.csv' (UTF-8)\n",
	"--2016-01-05 04:24:19--  https://raw.githubusercontent.com/jupyter/docker-demo-images/master/datasets/datasets/cars.csv\n",
	"Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 23.235.46.133\n",
	"Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|23.235.46.133|:443... connected.\n",
	"HTTP request sent, awaiting response... 200 OK\n",
	"Length: 552 [text/plain]\n",
	"Saving to: 'cars.csv.1'\n",
	"\n",
	"cars.csv.1          100%[=====================>]     552  --.-KB/s   in 0s     \n",
	"\n",
	"2016-01-05 04:24:20 (7.67 MB/s) - 'cars.csv.1' saved [552/552]\n",
	"\n"
	]
	}
	],
	"source": [
	"!wget https://raw.githubusercontent.com/jupyter/docker-demo-images/master/datasets/datasets/cars.csv"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"Configure the pyspark submit args (the `PYSPARK_SUBMIT_ARGS` environment variable) to include the spark-csv package **before** creating the `SparkContext`. Don't forget the `pyspark-shell` arg at the end for Spark 1.4+."
	]
	},
	{
	"cell_type": "code",
	"execution_count": 2,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"import os"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 3,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages com.databricks:spark-csv_2.10:1.3.0 pyspark-shell'"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"Now create the context and try to read a CSV."
	]
	},
	{
	"cell_type": "code",
	"execution_count": 4,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"import pyspark"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 5,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"sc = pyspark.SparkContext()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 6,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"from pyspark.sql import SQLContext"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 7,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"sqlContext = SQLContext(sc)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 8,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"name": "stderr",
	"output_type": "stream",
	"text": [
	"/usr/local/spark/python/pyspark/sql/context.py:507: UserWarning: load is deprecated. Use read.load() instead.\n",
	"  warnings.warn(\"load is deprecated. Use read.load() instead.\")\n"
	]
	}
	],
	"source": [
	"df = sqlContext.load(source=\"com.databricks.spark.csv\", header='true', inferSchema='true', path='cars.csv')"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 9,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"+---+-----+----+\n",
	"|   |speed|dist|\n",
	"+---+-----+----+\n",
	"|  1|    4|   2|\n",
	"|  2|    4|  10|\n",
	"|  3|    7|   4|\n",
	"|  4|    7|  22|\n",
	"|  5|    8|  16|\n",
	"|  6|    9|  10|\n",
	"|  7|   10|  18|\n",
	"|  8|   10|  26|\n",
	"|  9|   10|  34|\n",
	"| 10|   11|  17|\n",
	"| 11|   11|  28|\n",
	"| 12|   12|  14|\n",
	"| 13|   12|  20|\n",
	"| 14|   12|  24|\n",
	"| 15|   12|  28|\n",
	"| 16|   13|  26|\n",
	"| 17|   13|  34|\n",
	"| 18|   13|  34|\n",
	"| 19|   13|  46|\n",
	"| 20|   14|  26|\n",
	"+---+-----+----+\n",
	"only showing top 20 rows\n",
	"\n"
	]
	}
	],
	"source": [
	"df.show()"
	]
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 3",
	"language": "python",
	"name": "python3"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.4.3"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 0
	}