nmvega/INTERACTIVE.PYSPARK.SHELL.README.txt

## INTERACTIVE.PYSPARK.SHELL.README.txt
How to use this code.

NOTE: You can place all of the following in your .profile (or shell equivalent).

(1) linux$ export MASTER='yarn-client | local[NN] | spark://host:port'  # Choose exactly ONE of these values.
(2) linux$ export SPARK_HOME=/usr/lib/spark  # Yours will vary.
(3) linux$ export JAVA_HOME=/usr/java/latest # Yours will vary.
(4) linux$ export NAMENODE='vps00' # Yours will vary.
(5) linux$ export PYSTARTUP=${PYTHONSTARTUP} # See the in-line comments in the script for why this alias to PYTHONSTARTUP is needed.
(6) linux$ export HADOOP_CONF_DIR=/etc/hadoop/conf # Yours will vary. This one may not be necessary to set. Try and see.
(7) linux$ export HADOOP_HOME=/usr/lib/hadoop      # Yours will vary. This one may not be necessary to set. Try and see.

(8) bpython -i /path/to/script/below  # The moment of truth. Note that this is 'bpython' (not just plain 'python', which would not give the code completion you desire).
>>> sc
<pyspark.context.SparkContext object at 0x2798110>
>>>

## pythonstartup.for.spark.shell.py
#! /usr/bin/env python
# -*- coding: utf-8 -*-
#
# ===========================================================================
# Author: Noel Milton Vega (PRISMALYTICS, LLC.)
# ===========================================================================
# Start-up script for 'python(1)', 'bpython(1)', and Python IDE interpreters
# when you want a 'client-mode' SPARK Shell (i.e. interactive SPARK shell)
# environment either LOCALLY, on a SPARK Standalone Cluster, or on SPARK
# YARN cluster. The code-sense/intelligence of bpython(1) and IDEs, in
# particular will aid in learning the SPARK core API.
#
# This script basically (1) first sets up an environment to launch a SPARK
# Shell, then (2) launches the SPARK Shell using the 'shell.py' python script
# provided in the distribution's SPARK_HOME; and finally (3) imports our
# favorite Python modules (for convenience; e.g. numpy, scipy; etc.).
#
# IMPORTANT:
#  DON'T RUN THIS SCRIPT DIRECTLY. It is meant to be read in by interpreters
#  (similar, in that respect, to a PYTHONSTARTUP script).
#
#  Thus, there are two ways to use this file:
#  # We can't refer to PYTHONSTARTUP inside this file b/c that causes a recursion loop
#  # when calling this from within IDEs. So in step (0) we alias PYTHONSTARTUP to
#  # PYSTARTUP at the O/S level, and use that alias here (since no conflict with that).
#  (0): user$ export PYSTARTUP=${PYTHONSTARTUP} # We can't use PYTHONSTARTUP in this file
#  (1): user$ export MASTER='yarn-client | local[NN] | spark://host:port'
#       user$ bpython|python -i /path/to/this/file
#
#  (2): From within your favorite IDE, specify it as your python startup
#       script. For example, from within a WINGIDE project, set the following
#       variables within a WING Project: 'Project -> Project Properties':
#          'PYTHONSTARTUP=/path/to/this/very/file'
#          'MASTER=yarn-client | local[NN] | spark://host:port'
# ===========================================================================
import glob
import os
import random
import subprocess
import sys

# Hostname of the HDFS namenode. Honours the NAMENODE environment variable that
# the usage notes tell you to export; falls back to the historical hard-coded
# default ('vps00') when that variable is unset or empty.
namenode = os.getenv('NAMENODE') or 'vps00'
# ===========================================================================
# Root of the local SPARK installation. This is None when SPARK_HOME is not
# exported; the path concatenations further down need it to be set.
SPARK_HOME = os.getenv('SPARK_HOME')
# ===========================================================================


# =================================================================================
# This function emulates the action of "source" or '.' that exists in bash(1),
# and can be used to set environment variables in this Python process (os.environ).
# =================================================================================
def source(script, update=True):
    """Emulate the shell's ``source``/``.`` and capture the resulting environment.

    Runs ``. <script>; env -0`` in a child shell (``shell=True``) and parses the
    NUL-separated ``NAME=value`` records that ``env -0`` prints.

    script -- path of the shell file to source. It is interpolated into the
              shell command line unquoted, so pass trusted paths only.
    update -- when true (the default), also merge the captured variables
              into ``os.environ``.

    Returns a dict mapping variable names to their values, as seen by the
    child shell after sourcing ``script``.
    """
    # Anything the sourced script prints is sent to /dev/null; otherwise it
    # would be glued onto the first record of the 'env -0' output and corrupt it.
    proc = subprocess.Popen(". %s >/dev/null; env -0" % script,
                            stdout=subprocess.PIPE, shell=True)
    output = proc.communicate()[0]
    # Python 3 hands back bytes here; on Python 2 the output already is a str.
    if not isinstance(output, str):
        output = os.fsdecode(output)
    env = {}
    for record in output.split('\x00'):
        name, sep, value = record.partition('=')
        if sep:  # Skip empty/malformed records instead of failing on them.
            env[name] = value
    if update:
        os.environ.update(env)
    return env
# ================================================================================


# ================================================================================
# Here, we get the name of our current SPARK Assembly JAR file name (locally). We
# use that to create a HDFS URL that points to its location in HDFS when using
# YARN (i.e. when 'export MASTER=yarn-client'; we ignore it otherwise).
# ================================================================================
# Remember to always upload/update your distribution's current SPARK Assembly JAR
# to HDFS like this:
#   $ hdfs dfs -mkdir -p /user/spark/share/lib # Only necessary to do once!
#   $ hdfs dfs -rm "/user/spark/share/lib/spark-assembly-*.jar" # Remove old version.
#   $ hdfs dfs -put ${SPARK_HOME}/assembly/lib/spark-assembly-[0-9]*.jar /user/spark/share/lib/
# ================================================================================
# Fail early, with a readable message, instead of a TypeError on None + str.
if not SPARK_HOME:
    raise EnvironmentError('SPARK_HOME is not set; export it before loading this script.')

# HDFS URL of the SPARK Assembly JAR, derived from the file name of the local
# copy. It is None when no local assembly JAR exists, which only matters for
# YARN (checked below). If several JARs match, the lexicographically last name
# is used (glob order is otherwise arbitrary).
_assembly_jars = sorted(glob.glob(SPARK_HOME + '/lib/' + 'spark-assembly-[0-9]*.jar'))
if _assembly_jars:
    SPARK_JAR_LOCATION = ('hdfs://' + namenode + ':8020/user/spark/share/lib/'
                          + os.path.basename(_assembly_jars[-1]))
else:
    SPARK_JAR_LOCATION = None
# ================================================================================


# ================================================================================
# Update the process environment with the variables that the SPARK Shell will be
# looking for. Some we set explicitly via an in-line dictionary, as shown below.
# And the rest are set by 'source'ing the global SPARK environment file (although
# we could have included those explicitly here too, if we preferred not to touch
# that system-wide file -- and leave it as FCS).
# ================================================================================
spark_jar_opt = None
# An unset or empty MASTER means an 8-thread local run.
MASTER = os.getenv('MASTER') or 'local[8]'
if MASTER.startswith('yarn-'):
    # Only YARN needs the assembly JAR, so only YARN fails when it is missing.
    if SPARK_JAR_LOCATION is None:
        raise EnvironmentError('No spark-assembly-[0-9]*.jar found under ' + SPARK_HOME
                               + '/lib/; it is required when MASTER=' + MASTER)
    spark_jar_opt = ' -Dspark.yarn.jar=' + SPARK_JAR_LOCATION
elif MASTER.startswith('spark://'):
    pass
else:
    # NOTE(review): this binds a Python global only; it does not change
    # os.environ['HADOOP_HOME']. Confirm which of the two was intended.
    HADOOP_HOME = ''
# ================================================================================


# ================================================================================
# Build '--driver-java-options' options for spark-shell, pyspark, or spark-submit.
# Many of these are set in '/etc/spark/conf/spark-defaults.conf' (and thus
# commented out here, but left here for reference completeness).
# ================================================================================
# Default UI port is 4040. The next statement allows us to run multiple SPARK shells.
DRIVER_JAVA_OPTIONS = ''.join([
    '-Dspark.ui.port=',
    str(random.randint(1025, 65535)),  # Random port in [1025, 65535].
    spark_jar_opt or '',               # ' -Dspark.yarn.jar=...' if one was built, else nothing.
])
# ================================================================================

# ================================================================================
# Build PYSPARK_SUBMIT_ARGS (i.e. the same ones shown in 'pyspark --help'), and
# apply them to the O/S environment.
# ================================================================================
# Wrap the JVM options in single quotes (presumably so that they are passed on
# as a single '--driver-java-options' value -- verify against your pyspark).
DRIVER_JAVA_OPTIONS = "'%s'" % DRIVER_JAVA_OPTIONS
# Remember to set MASTER on the UNIX CLI or in the IDE!
PYSPARK_SUBMIT_ARGS = ' --master %s --driver-java-options %s' % (MASTER, DRIVER_JAVA_OPTIONS)
# ================================================================================
# Merge the variables of the system-wide SPARK environment file into os.environ
# (source() does the merge itself by default), then publish the submit arguments.
source('/etc/spark/conf/spark-env.sh')
os.environ['PYSPARK_SUBMIT_ARGS'] = PYSPARK_SUBMIT_ARGS
# ================================================================================


# ================================================================================
# Next, adjust 'sys.path' so SPARK Shell has the python modules it needs.
# ================================================================================
SPARK_PYTHON_DIR = SPARK_HOME + '/python'
# File name (not the full path) of the first Py4J source archive that matches.
PY4J = glob.glob(SPARK_PYTHON_DIR + '/lib/py4j-*-src.zip')[0].rsplit('/', 1)[-1]
# Put SPARK's python directory and the Py4J archive ahead of everything else.
sys.path = [
    SPARK_PYTHON_DIR,
    '/'.join([SPARK_PYTHON_DIR, 'lib', PY4J]),
] + sys.path
# ================================================================================


# ================================================================================
# With our environment set, we start the SPARK Shell; and then to that, we add
# our favorite Python imports (e.g. numpy, scipy; etc).
# ================================================================================
print('PYSPARK_SUBMIT_ARGS:' + PYSPARK_SUBMIT_ARGS) # For visual debug.


def _run_file(path, namespace):
    """Execute the Python file at `path` inside the dict `namespace`.

    Stand-in for execfile(), which exists on Python 2 only; this form works on
    Python 2 and Python 3 alike.
    """
    with open(path, 'rb') as handle:
        code = compile(handle.read(), path, 'exec')
    exec(code, namespace)


_run_file(SPARK_HOME + '/python/pyspark/shell.py', globals()) # Start the SPARK Shell.

# Next, load our favorite Python modules from the user's own start-up file.
# PYSTARTUP is the documented alias for PYTHONSTARTUP; PYSTART is accepted too
# because the usage notes have also used that spelling. A missing alias no
# longer aborts the shell.
_pystartup = os.getenv('PYSTARTUP') or os.getenv('PYSTART')
if _pystartup and os.path.isfile(_pystartup):
    _run_file(_pystartup, globals())
elif _pystartup:
    print('Skipping PYSTARTUP (not a file): ' + _pystartup)
# ================================================================================


# ================================================================================
# Finally, import pyspark specific modules so that, once we're dropped into the
# interactive PySpark shell environment, these are already imported. =:). Modify
# as desired. This step couldn't be performed until 'sys.path' modification above.
# ================================================================================
import pyspark
import pyspark.context
from   pyspark.sql import * # SQLContext; ROW; etc.
import pyspark.storagelevel
import pyspark.serializers
# ================================================================================
	How to use this code.

	NOTE: You can place all of the following in your .profile (or shell equivalent).

	(1) linux$ export MASTER='yarn-client | local[NN] | spark://host:port' # Choose exactly ONE of these values.
	(2) linux$ export SPARK_HOME=/usr/lib/spark # Yours will vary.
	(3) linux$ export JAVA_HOME=/usr/java/latest # Yours will vary.
	(4) linux$ export NAMENODE='vps00' # Yours will vary.
	(5) linux$ export PYSTARTUP=${PYTHONSTARTUP} # See the in-line comments in the script for why this alias to PYTHONSTARTUP is needed.
	(6) linux$ export HADOOP_CONF_DIR=/etc/hadoop/conf # Yours will vary. This one may not be necessary to set. Try and see.
	(7) linux$ export HADOOP_HOME=/usr/lib/hadoop # Yours will vary. This one may not be necessary to set. Try and see.

	(8) bpython -i /path/to/script/below # The moment of truth. Note that this is 'bpython' (not just plain 'python', which would not give the code completion you desire).
	>>> sc
	<pyspark.context.SparkContext object at 0x2798110>
	>>>
	#! /usr/bin/env python
	# -*- coding: utf-8 -*-
	#
	# ===========================================================================
	# Author: Noel Milton Vega (PRISMALYTICS, LLC.)
	# ===========================================================================
	# Start-up script for 'python(1)', 'bpython(1)', and Python IDE interpreters
	# when you want a 'client-mode' SPARK Shell (i.e. interactive SPARK shell)
	# environment either LOCALLY, on a SPARK Standalone Cluster, or on SPARK
	# YARN cluster. The code-sense/intelligence of bpython(1) and IDEs, in
	# particular will aid in learning the SPARK core API.
	#
	# This script basically (1) first sets up an environment to launch a SPARK
	# Shell, then (2) launches the SPARK Shell using the 'shell.py' python script
	# provided in the distribution's SPARK_HOME; and finally (3) imports our
	# favorite Python modules (for convenience; e.g. numpy, scipy; etc.).
	#
	# IMPORTANT:
	# DON'T RUN THIS SCRIPT DIRECTLY. It is meant to be read in by interpreters
	# (similar, in that respect, to a PYTHONSTARTUP script).
	#
	# Thus, there are two ways to use this file:
	# # We can't refer to PYTHONSTARTUP inside this file b/c that causes a recursion loop
	# # when calling this from within IDEs. So in step (0) we alias PYTHONSTARTUP to
	# # PYSTARTUP at the O/S level, and use that alias here (since no conflict with that).
	# (0): user$ export PYSTARTUP=${PYTHONSTARTUP} # We can't use PYTHONSTARTUP in this file
	# (1): user$ export MASTER='yarn-client | local[NN] | spark://host:port'
	# user$ bpython|python -i /path/to/this/file
	#
	# (2): From within your favorite IDE, specify it as your python startup
	# script. For example, from within a WINGIDE project, set the following
	# variables within a WING Project: 'Project -> Project Properties':
	# 'PYTHONSTARTUP=/path/to/this/very/file'
	# 'MASTER=yarn-client | local[NN] | spark://host:port'
	# ===========================================================================
	import glob
	import os
	import random
	import subprocess
	import sys

	# Hostname of the HDFS namenode. Honours the NAMENODE environment variable that
	# the usage notes tell you to export; falls back to the historical hard-coded
	# default ('vps00') when that variable is unset or empty.
	namenode = os.getenv('NAMENODE') or 'vps00'
	# ===========================================================================
	# Root of the local SPARK installation. This is None when SPARK_HOME is not
	# exported; the path concatenations further down need it to be set.
	SPARK_HOME = os.getenv('SPARK_HOME')
	# ===========================================================================


	# =================================================================================
	# This function emulates the action of "source" or '.' that exists in bash(1),
	# and can be used to set environment variables in this Python process (os.environ).
	# =================================================================================
	def source(script, update=True):
	    """Emulate the shell's ``source``/``.`` and capture the resulting environment.

	    Runs ``. <script>; env -0`` in a child shell (``shell=True``) and parses the
	    NUL-separated ``NAME=value`` records that ``env -0`` prints.

	    script -- path of the shell file to source. It is interpolated into the
	              shell command line unquoted, so pass trusted paths only.
	    update -- when true (the default), also merge the captured variables
	              into ``os.environ``.

	    Returns a dict mapping variable names to their values, as seen by the
	    child shell after sourcing ``script``.
	    """
	    # Anything the sourced script prints is sent to /dev/null; otherwise it
	    # would be glued onto the first record of the 'env -0' output and corrupt it.
	    proc = subprocess.Popen(". %s >/dev/null; env -0" % script,
	                            stdout=subprocess.PIPE, shell=True)
	    output = proc.communicate()[0]
	    # Python 3 hands back bytes here; on Python 2 the output already is a str.
	    if not isinstance(output, str):
	        output = os.fsdecode(output)
	    env = {}
	    for record in output.split('\x00'):
	        name, sep, value = record.partition('=')
	        if sep:  # Skip empty/malformed records instead of failing on them.
	            env[name] = value
	    if update:
	        os.environ.update(env)
	    return env
	# ================================================================================


	# ================================================================================
	# Here, we get the name of our current SPARK Assembly JAR file name (locally). We
	# use that to create a HDFS URL that points to its location in HDFS when using
	# YARN (i.e. when 'export MASTER=yarn-client'; we ignore it otherwise).
	# ================================================================================
	# Remember to always upload/update your distribution's current SPARK Assembly JAR
	# to HDFS like this:
	# $ hdfs dfs -mkdir -p /user/spark/share/lib # Only necessary to do once!
	# $ hdfs dfs -rm "/user/spark/share/lib/spark-assembly-*.jar" # Remove old version.
	# $ hdfs dfs -put ${SPARK_HOME}/assembly/lib/spark-assembly-[0-9]*.jar /user/spark/share/lib/
	# ================================================================================
	# Fail early, with a readable message, instead of a TypeError on None + str.
	if not SPARK_HOME:
	    raise EnvironmentError('SPARK_HOME is not set; export it before loading this script.')

	# HDFS URL of the SPARK Assembly JAR, derived from the file name of the local
	# copy. It is None when no local assembly JAR exists, which only matters for
	# YARN (checked below). If several JARs match, the lexicographically last name
	# is used (glob order is otherwise arbitrary).
	_assembly_jars = sorted(glob.glob(SPARK_HOME + '/lib/' + 'spark-assembly-[0-9]*.jar'))
	if _assembly_jars:
	    SPARK_JAR_LOCATION = ('hdfs://' + namenode + ':8020/user/spark/share/lib/'
	                          + os.path.basename(_assembly_jars[-1]))
	else:
	    SPARK_JAR_LOCATION = None
	# ================================================================================


	# ================================================================================
	# Update the process environment with the variables that the SPARK Shell will be
	# looking for. Some we set explicitly via an in-line dictionary, as shown below.
	# And the rest are set by 'source'ing the global SPARK environment file (although
	# we could have included those explicitly here too, if we preferred not to touch
	# that system-wide file -- and leave it as FCS).
	# ================================================================================
	spark_jar_opt = None
	# An unset or empty MASTER means an 8-thread local run.
	MASTER = os.getenv('MASTER') or 'local[8]'
	if MASTER.startswith('yarn-'):
	    # Only YARN needs the assembly JAR, so only YARN fails when it is missing.
	    if SPARK_JAR_LOCATION is None:
	        raise EnvironmentError('No spark-assembly-[0-9]*.jar found under ' + SPARK_HOME
	                               + '/lib/; it is required when MASTER=' + MASTER)
	    spark_jar_opt = ' -Dspark.yarn.jar=' + SPARK_JAR_LOCATION
	elif MASTER.startswith('spark://'):
	    pass
	else:
	    # NOTE(review): this binds a Python global only; it does not change
	    # os.environ['HADOOP_HOME']. Confirm which of the two was intended.
	    HADOOP_HOME = ''
	# ================================================================================


	# ================================================================================
	# Build '--driver-java-options' options for spark-shell, pyspark, or spark-submit.
	# Many of these are set in '/etc/spark/conf/spark-defaults.conf' (and thus
	# commented out here, but left here for reference completeness).
	# ================================================================================
	# Default UI port is 4040. The next statement allows us to run multiple SPARK shells.
	DRIVER_JAVA_OPTIONS = ''.join([
	    '-Dspark.ui.port=',
	    str(random.randint(1025, 65535)),  # Random port in [1025, 65535].
	    spark_jar_opt or '',               # ' -Dspark.yarn.jar=...' if one was built, else nothing.
	])
	# ================================================================================

	# ================================================================================
	# Build PYSPARK_SUBMIT_ARGS (i.e. the same ones shown in 'pyspark --help'), and
	# apply them to the O/S environment.
	# ================================================================================
	# Wrap the JVM options in single quotes (presumably so that they are passed on
	# as a single '--driver-java-options' value -- verify against your pyspark).
	DRIVER_JAVA_OPTIONS = "'%s'" % DRIVER_JAVA_OPTIONS
	# Remember to set MASTER on the UNIX CLI or in the IDE!
	PYSPARK_SUBMIT_ARGS = ' --master %s --driver-java-options %s' % (MASTER, DRIVER_JAVA_OPTIONS)
	# ================================================================================
	# Merge the variables of the system-wide SPARK environment file into os.environ
	# (source() does the merge itself by default), then publish the submit arguments.
	source('/etc/spark/conf/spark-env.sh')
	os.environ['PYSPARK_SUBMIT_ARGS'] = PYSPARK_SUBMIT_ARGS
	# ================================================================================


	# ================================================================================
	# Next, adjust 'sys.path' so SPARK Shell has the python modules it needs.
	# ================================================================================
	SPARK_PYTHON_DIR = SPARK_HOME + '/python'
	# File name (not the full path) of the first Py4J source archive that matches.
	PY4J = glob.glob(SPARK_PYTHON_DIR + '/lib/py4j-*-src.zip')[0].rsplit('/', 1)[-1]
	# Put SPARK's python directory and the Py4J archive ahead of everything else.
	sys.path = [
	    SPARK_PYTHON_DIR,
	    '/'.join([SPARK_PYTHON_DIR, 'lib', PY4J]),
	] + sys.path
	# ================================================================================


	# ================================================================================
	# With our environment set, we start the SPARK Shell; and then to that, we add
	# our favorite Python imports (e.g. numpy, scipy; etc).
	# ================================================================================
	print('PYSPARK_SUBMIT_ARGS:' + PYSPARK_SUBMIT_ARGS) # For visual debug.


	def _run_file(path, namespace):
	    """Execute the Python file at `path` inside the dict `namespace`.

	    Stand-in for execfile(), which exists on Python 2 only; this form works on
	    Python 2 and Python 3 alike.
	    """
	    with open(path, 'rb') as handle:
	        code = compile(handle.read(), path, 'exec')
	    exec(code, namespace)


	_run_file(SPARK_HOME + '/python/pyspark/shell.py', globals()) # Start the SPARK Shell.

	# Next, load our favorite Python modules from the user's own start-up file.
	# PYSTARTUP is the documented alias for PYTHONSTARTUP; PYSTART is accepted too
	# because the usage notes have also used that spelling. A missing alias no
	# longer aborts the shell.
	_pystartup = os.getenv('PYSTARTUP') or os.getenv('PYSTART')
	if _pystartup and os.path.isfile(_pystartup):
	    _run_file(_pystartup, globals())
	elif _pystartup:
	    print('Skipping PYSTARTUP (not a file): ' + _pystartup)
	# ================================================================================


	# ================================================================================
	# Finally, import pyspark specific modules so that, once we're dropped into the
	# interactive PySpark shell environment, these are already imported. =:). Modify
	# as desired. This step couldn't be performed until 'sys.path' modification above.
	# ================================================================================
	import pyspark
	import pyspark.context
	from pyspark.sql import * # SQLContext; ROW; etc.
	import pyspark.storagelevel
	import pyspark.serializers
	# ================================================================================