@tsailiming
Last active July 19, 2022 22:33
Jupyter kernel.json configuration for pyspark
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"There are a few ways to use a Jupyter notebook with PySpark:\n",
"\n",
"1. Invoke pyspark's `shell.py` via `PYTHONSTARTUP`, which creates a SparkContext and HiveContext automatically\n",
"1. Hardcode SPARK_HOME and PYTHONPATH in the kernelspec, but create your own context in the notebook (my preferred method)\n",
"1. Define everything in the notebook (UNTESTED!)"
]
},
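{
"cell_type": "markdown",
"metadata": {},
"source": [
"Whichever option you use, the `kernel.json` must live in a kernelspec directory that Jupyter searches, e.g. `~/.ipython/kernels/<name>/kernel.json` for IPython 3 or `~/.local/share/jupyter/kernels/<name>/kernel.json` on Jupyter 4+ (Linux). A minimal sketch of writing one out, not part of the original recipe; the base path and the `pyspark` directory name are assumptions for an IPython 3-era setup:\n",
"\n",
"````\n",
"# Write a bare-bones kernel.json into a per-user kernelspec directory.\n",
"# Adjust kernel_dir for your platform/Jupyter version; `jupyter kernelspec list`\n",
"# shows the directories Jupyter actually searches.\n",
"import json\n",
"import os\n",
"\n",
"kernel = {\n",
"    'display_name': 'PySpark',\n",
"    'language': 'python',\n",
"    'argv': ['/path/to/python', '-m', 'IPython.kernel', '-f', '{connection_file}'],\n",
"}\n",
"\n",
"kernel_dir = os.path.expanduser('~/.ipython/kernels/pyspark')  # assumed location\n",
"if not os.path.exists(kernel_dir):\n",
"    os.makedirs(kernel_dir)\n",
"with open(os.path.join(kernel_dir, 'kernel.json'), 'w') as f:\n",
"    json.dump(kernel, f, indent=1)\n",
"````"
]
},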
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 1\n",
"\n",
"````\n",
"{\n",
"  \"display_name\": \"PySpark (Python X.Y.Z)\",\n",
"  \"language\": \"python\",\n",
"  \"argv\": [\n",
"    \"/path/to/python\",\n",
"    \"-m\",\n",
"    \"IPython.kernel\",\n",
"    \"-f\",\n",
"    \"{connection_file}\"\n",
"  ],\n",
"  \"env\": {\n",
"    \"SPARK_HOME\": \"<spark_dir>\",\n",
"    \"PYTHONPATH\": \"<spark_dir>/python/:<spark_dir>/python/lib/py4j-<version>-src.zip\",\n",
"    \"PYTHONSTARTUP\": \"<spark_dir>/python/pyspark/shell.py\",\n",
"    \"PYSPARK_SUBMIT_ARGS\": \"--master local[2] pyspark-shell\"\n",
"  }\n",
"}\n",
"````"
]
},
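{
"cell_type": "markdown",
"metadata": {},
"source": [
"With option 1, `PYTHONSTARTUP` runs `shell.py` when the kernel starts, so `sc` and `sqlContext` already exist in any notebook opened with this kernel. A quick smoke test (assumes the kernel above is installed and selected):\n",
"\n",
"````\n",
"# sc is created by shell.py at kernel startup; nothing to import here\n",
"print(sc.version)\n",
"sc.parallelize(range(100)).sum()  # expected: 4950\n",
"````"
]
},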
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 2\n",
"\n",
"* The following example hardcodes SPARK_HOME and PYTHONPATH\n",
"* Change the path in `argv` to point to the correct python interpreter\n",
"````\n",
"{\n",
"  \"display_name\": \"PySpark (Spark 1.6.0)\",\n",
"  \"language\": \"python\",\n",
"  \"argv\": [\n",
"    \"/Users/ltsai/Documents/workspace/venv/smu/bin/python\",\n",
"    \"-m\",\n",
"    \"IPython.kernel\",\n",
"    \"-f\",\n",
"    \"{connection_file}\"\n",
"  ],\n",
"  \"env\": {\n",
"    \"SPARK_HOME\": \"/Users/ltsai/Documents/workspace/spark-1.6.0-bin-hadoop2.6\",\n",
"    \"PYTHONPATH\": \"/Users/ltsai/Documents/workspace/spark-1.6.0-bin-hadoop2.6/python:/Users/ltsai/Documents/workspace/spark-1.6.0-bin-hadoop2.6/python/lib/py4j-0.9-src.zip\"\n",
"  }\n",
"}\n",
"````\n",
"\n",
"````\n",
"# PYSPARK_SUBMIT_ARGS can be left out if you are not providing any Maven packages\n",
"import os\n",
"os.environ['PYSPARK_SUBMIT_ARGS'] = \"--packages com.databricks:spark-csv_2.10:1.3.0 pyspark-shell\"\n",
"\n",
"from pyspark import SparkContext, SparkConf\n",
"from pyspark.sql import SQLContext, HiveContext\n",
"\n",
"import py4j\n",
"\n",
"conf = SparkConf().setAppName(\"SparkJupyter\") \\\n",
"                  .setMaster(\"local[2]\")\n",
"# conf = SparkConf().setAppName(\"SparkJupyter\") \\\n",
"#                   .setMaster(\"yarn-client\") \\\n",
"#                   .set(\"spark.executor.memory\", \"512m\") \\\n",
"#                   .set(\"spark.executor.cores\", 1) \\\n",
"#                   .set(\"spark.executor.instances\", 2)\n",
"sc = SparkContext(conf=conf)\n",
"\n",
"try:\n",
"    # Accessing HiveConf raises an exception if Spark was built without Hive support\n",
"    sc._jvm.org.apache.hadoop.hive.conf.HiveConf()\n",
"    sqlContext = HiveContext(sc)\n",
"except (py4j.protocol.Py4JError, TypeError):\n",
"    sqlContext = SQLContext(sc)\n",
"sc\n",
"````"
]
},
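{
"cell_type": "markdown",
"metadata": {},
"source": [
"Once `sqlContext` exists, the spark-csv package pulled in through `PYSPARK_SUBMIT_ARGS` can be exercised as below. This is a sketch, not part of the original gist; `data.csv` is a hypothetical input file with a header row:\n",
"\n",
"````\n",
"# Read a CSV through com.databricks:spark-csv (requested in PYSPARK_SUBMIT_ARGS)\n",
"df = sqlContext.read.format('com.databricks.spark.csv') \\\n",
"                    .options(header='true', inferSchema='true') \\\n",
"                    .load('data.csv')\n",
"df.printSchema()\n",
"df.show(5)\n",
"````"
]
},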
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 3\n",
"````\n",
"{\n",
"  \"display_name\": \"PySpark\",\n",
"  \"language\": \"python\",\n",
"  \"argv\": [\n",
"    \"/path/to/python\",\n",
"    \"-m\",\n",
"    \"IPython.kernel\",\n",
"    \"-f\",\n",
"    \"{connection_file}\"\n",
"  ]\n",
"}\n",
"````\n",
"\n",
"````\n",
"# PYSPARK_SUBMIT_ARGS can be left out if you are not providing any Maven packages\n",
"import os\n",
"import sys\n",
"os.environ['SPARK_HOME'] = '/path/to/spark'\n",
"\n",
"# sys.path.insert needs an index, and the joined components must be relative\n",
"# (a leading '/' would make os.path.join discard SPARK_HOME)\n",
"sys.path.insert(0, os.path.join(os.environ['SPARK_HOME'], 'python'))\n",
"sys.path.insert(0, os.path.join(os.environ['SPARK_HOME'], 'python/lib/py4j-0.9-src.zip'))\n",
"os.environ['PYSPARK_SUBMIT_ARGS'] = \"--packages com.databricks:spark-csv_2.10:1.3.0 pyspark-shell\"\n",
"\n",
"from pyspark import SparkContext, SparkConf\n",
"from pyspark.sql import SQLContext, HiveContext\n",
"\n",
"import py4j\n",
"\n",
"conf = SparkConf().setAppName(\"SparkJupyter\") \\\n",
"                  .setMaster(\"local[2]\")\n",
"# conf = SparkConf().setAppName(\"SparkJupyter\") \\\n",
"#                   .setMaster(\"yarn-client\") \\\n",
"#                   .set(\"spark.executor.memory\", \"512m\") \\\n",
"#                   .set(\"spark.executor.cores\", 1) \\\n",
"#                   .set(\"spark.executor.instances\", 2)\n",
"sc = SparkContext(conf=conf)\n",
"\n",
"try:\n",
"    # Accessing HiveConf raises an exception if Spark was built without Hive support\n",
"    sc._jvm.org.apache.hadoop.hive.conf.HiveConf()\n",
"    sqlContext = HiveContext(sc)\n",
"except (py4j.protocol.Py4JError, TypeError):\n",
"    sqlContext = SQLContext(sc)\n",
"sc\n",
"````"
]
},
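{
"cell_type": "markdown",
"metadata": {},
"source": [
"Since option 3 is untested, it is worth sanity-checking the wiring before relying on it. A small check, not from the original recipe; it assumes the cell above ran without errors:\n",
"\n",
"````\n",
"# Confirm pyspark was imported from under SPARK_HOME and see which context we got\n",
"import pyspark\n",
"print(pyspark.__file__)           # should point under SPARK_HOME/python\n",
"print(type(sqlContext).__name__)  # HiveContext if Hive was detected, else SQLContext\n",
"````"
]
}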
],
"metadata": {
"kernelspec": {
"display_name": "Python 2",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.11"
}
},
"nbformat": 4,
"nbformat_minor": 0
}