# Step I: Start an AWS EMR cluster. Choose Spark among the applications to install and provide s3://elasticmapreduce.bootstrapactions/ipython-notebook/install-ipython-notebook as the bootstrap action.
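# The cluster can also be started programmatically from your local machine instead of the console.
# Below is a minimal sketch using boto3 (not part of the original instructions); the region, cluster
# name, EMR release label, instance types/count, key pair name, and IAM role names are assumptions --
# adjust them to your account. Requires AWS credentials to be configured locally.
import boto3

emr = boto3.client('emr', region_name='us-east-1')  # assumed region
response = emr.run_job_flow(
    Name='ipython-notebook-cluster',                # hypothetical cluster name
    ReleaseLabel='emr-4.7.2',                       # assumed EMR release with Spark support
    Applications=[{'Name': 'Spark'}],               # install Spark, as in Step I
    BootstrapActions=[{
        'Name': 'Install IPython Notebook',
        'ScriptBootstrapAction': {
            'Path': 's3://elasticmapreduce.bootstrapactions/ipython-notebook/install-ipython-notebook'
        }
    }],
    Instances={
        'MasterInstanceType': 'm3.xlarge',          # assumed instance type
        'SlaveInstanceType': 'm3.xlarge',
        'InstanceCount': 3,                         # assumed cluster size
        'Ec2KeyName': 'amazon-key-pair',            # your EC2 key pair name
        'KeepJobFlowAliveWhenNoSteps': True,        # keep the cluster running after launch
    },
    JobFlowRole='EMR_EC2_DefaultRole',              # default EMR IAM roles
    ServiceRole='EMR_DefaultRole',
)
print(response['JobFlowId'])                        # cluster id, useful for later API calls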
# Step II: Log in to the master node of the cluster and execute "sudo pip install jupyter databricks_test_helper matplotlib seaborn".
# Step III: Execute "jupyter notebook list", which should show an IPython notebook URL ending in something like "?token=e52347f8d93d122519e4b9d8df7e34b38d7074f76539e225". Copy just the token, which looks like "e52347f8d93d122519e4b9d8df7e34b38d7074f76539e225"; you will need it to log in to the notebook server.
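# If you prefer to grab the token programmatically, a small sketch you can run on the master node
# (not part of the original instructions; assumes the notebook server is already running):
import re
import subprocess

output = subprocess.check_output(['jupyter', 'notebook', 'list']).decode('utf-8')
match = re.search(r'token=([0-9a-f]+)', output)
if match:
    print(match.group(1))  # the token to paste into the login page later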
# Step IV: On your local machine, such as your laptop, execute: "ssh -o ServerAliveInterval=10 -i amazon-key-pair.pem -N -L 8192:ec2-52-86-172-186.compute-1.amazonaws.com:8192 hadoop@ec2-52-86-172-186.compute-1.amazonaws.com". This opens an SSH tunnel from your machine to port 8192 on the cluster's master node; "amazon-key-pair.pem" is your key pair file, and "ec2-52-86-172-186.compute-1.amazonaws.com" is an example master node address.
# Step V: Open http://localhost:8192 in a browser on your local machine. Thanks to the SSH tunnel created earlier, your browser can reach the notebook server running on the AWS cluster. On your first login it will ask for a token to authorize you; provide the token you saved from running "jupyter notebook list" on the cluster.
# Step VI: Execute the code below. It sets up the Python path so you have access to any Python packages you installed on the AWS cluster, and it makes the SparkContext object "sc" available to this notebook by executing "python/pyspark/shell.py" from SPARK_HOME.
import os
import sys

# Make packages installed via "sudo pip install" on the cluster importable
sys.path.insert(0, '/usr/local/lib/python2.7/site-packages')

# Point at the Spark installation on EMR and fail early if it is missing
os.environ['SPARK_HOME'] = '/usr/lib/spark'
spark_home = os.environ.get('SPARK_HOME', None)
if not spark_home:
    raise ValueError('SPARK_HOME environment variable is not set')

# Put PySpark and its bundled py4j on the Python path
sys.path.insert(0, os.path.join(spark_home, 'python'))
sys.path.insert(0, os.path.join(spark_home, 'python/lib/py4j-src.zip'))

# Run pyspark/shell.py (as the pyspark REPL does) to create the SparkContext
# "sc" in this notebook, unless one already exists
if 'sc' not in vars() and 'sc' not in globals():
    execfile(os.path.join(spark_home, 'python/pyspark/shell.py'))
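# A quick smoke test (not in the original gist): if shell.py ran successfully, "sc" is
# defined and this trivial Spark job should print 45
print(sc.parallelize(range(10)).sum())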