# Building Spark 1.0.0 for EMR from scratch
# get the old EMR Spark build (we only need its AWS metrics config)
s3cmd get s3://elasticmapreduce/samples/spark/0.8.1/spark-0.8.1-emr.tgz
tar -xvf spark-0.8.1-emr.tgz
# get the newer prebuilt Spark
curl "http://d3kbcqa49mib13.cloudfront.net/spark-1.0.0-bin-hadoop1.tgz" -o "spark-1.0.0-bin-hadoop1.tgz" # grab the prebuilt Spark jars; we used Spark 1.0.0 with Hadoop 1, spark-1.0.0-bin-hadoop1.tgz, from the Spark downloads page
tar -xvf spark-1.0.0-bin-hadoop1.tgz
# copy over the old metrics properties file
cp spark-0.8.1-emr/conf/metrics.properties.aws spark-1.0.0-bin-hadoop1/conf/metrics.properties.aws # AWS requires the metrics.properties.aws file (ours is a copy of the metrics.properties.aws from the spark-0.8.1-emr.tgz posted by amazon at s3://elasticmapreduce/samples/spark/0.8.1/spark-0.8.1-emr.tgz)
# move the lib folder to the jars folder
mv spark-1.0.0-bin-hadoop1/lib spark-1.0.0-bin-hadoop1/jars # AWS looks for spark jars in the "jars" folder
# move the sbin folder contents into the bin folder
mv spark-1.0.0-bin-hadoop1/sbin/* spark-1.0.0-bin-hadoop1/bin # AWS looks for scripts in the "bin" folder
# remove the sbin and ec2 directories
rmdir spark-1.0.0-bin-hadoop1/sbin # AWS looks for scripts in the "bin" folder
rm -rf spark-1.0.0-bin-hadoop1/ec2 # The "ec2" directory is not needed
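# (optional sanity check) the top level should now show "bin" and "jars" and no
# "sbin" or "ec2" directories:
ls spark-1.0.0-bin-hadoop1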
# modify spark-1.0.0-bin-hadoop1/bin/compute-classpath.sh by removing the following lines:
###########
# # Build up classpath
# CLASSPATH="$SPARK_CLASSPATH:$SPARK_SUBMIT_CLASSPATH:$FWDIR/conf"
#
###########
# modify spark-1.0.0-bin-hadoop1/bin/compute-classpath.sh by adding the following lines in their place to include the lzo path
# (based on the answer here: http://grokbase.com/t/cloudera/cdh-user/144bv47zb0/cdh5-0-spark-shell-cannot-work-when-enable-lzo-in-core-site-xml)
###########
# # add hadoop home
# export HADOOP_HOME=/home/hadoop/
# # Build up classpath
# CLASSPATH="$SPARK_CLASSPATH:$SPARK_SUBMIT_CLASSPATH:$FWDIR/conf:$HADOOP_HOME/lib/hadoop-lzo.jar"
# export JAVA_LIBRARY_PATH=$JAVA_LIBRARY_PATH:$HADOOP_HOME/native/Linux-i386-32
# export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$HADOOP_HOME/native/Linux-i386-32
###########
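# (optional) a scripted sketch of the CLASSPATH swap above, assuming GNU sed
# and that the original line appears verbatim in the file; the "# Build up
# classpath" comment can stay as-is. Review the result by hand afterwards:
sed -i 's|^CLASSPATH="$SPARK_CLASSPATH:$SPARK_SUBMIT_CLASSPATH:$FWDIR/conf"$|export HADOOP_HOME=/home/hadoop/\nCLASSPATH="$SPARK_CLASSPATH:$SPARK_SUBMIT_CLASSPATH:$FWDIR/conf:$HADOOP_HOME/lib/hadoop-lzo.jar"\nexport JAVA_LIBRARY_PATH=$JAVA_LIBRARY_PATH:$HADOOP_HOME/native/Linux-i386-32\nexport LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$HADOOP_HOME/native/Linux-i386-32|' spark-1.0.0-bin-hadoop1/bin/compute-classpath.sh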
# add the following lines to spark-1.0.0-bin-hadoop1/bin/spark-class after the section marked "# Find the java binary"
###########
# # set spark_library_path
# export HADOOP_HOME=/home/hadoop/
# SPARK_LIBRARY_PATH=$SPARK_LIBRARY_PATH:$HADOOP_HOME/native/Linux-i386-32
###########
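# (optional) one way to script that insertion, assuming GNU sed and that adding
# the lines directly after the "# Find the java binary" marker line itself is
# acceptable (these exports do not depend on the java detection that follows):
sed -i '/^# Find the java binary/a # set spark_library_path\nexport HADOOP_HOME=/home/hadoop/\nSPARK_LIBRARY_PATH=$SPARK_LIBRARY_PATH:$HADOOP_HOME/native/Linux-i386-32' spark-1.0.0-bin-hadoop1/bin/spark-class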
tar -zcvf spark-1.0.0-bin-hadoop1-emr.tgz spark-1.0.0-bin-hadoop1 # tar up the modified directory
s3cmd put spark-1.0.0-bin-hadoop1-emr.tgz s3://[your-s3-bucket]/spark-1.0.0-bin-hadoop1-emr.tgz # copy the tgz file to your s3 bucket
s3cmd get s3://intentmedia-spark/install-spark-shark.sh install-spark-shark.sh
# modify install-spark-shark.sh to work with your lzo libraries on s3 and your spark jar on s3: remove the following lines
###########
# hadoop fs -copyToLocal s3://intentmedia-spark/hadoop-lzo.jar /home/hadoop/lib
# wget http://intentmedia-spark.s3.amazonaws.com/spark-1.0.0-bin-hadoop1.tgz
#
###########
# modify install-spark-shark.sh to work with your lzo libraries on s3 and your spark jar on s3: add the following lines
###########
# hadoop fs -copyToLocal s3://[your-s3-bucket]/hadoop-lzo.jar /home/hadoop/lib
# wget http://[your-s3-bucket].s3.amazonaws.com/spark-1.0.0-bin-hadoop1.tgz
###########
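# (optional) both edits amount to swapping the bucket name, so assuming
# "intentmedia-spark" appears only in those two lines, this one-liner does the
# same thing (substitute your actual bucket name for the placeholder):
sed -i 's|intentmedia-spark|[your-s3-bucket]|g' install-spark-shark.sh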
s3cmd put install-spark-shark.sh s3://[your-s3-bucket]/install-spark-shark.sh # Copy the install-spark-shark.sh script to your s3 bucket so it can be referenced when invoking the elastic-mapreduce command line command. Note: your s3 bucket may need to be public for your EMR instances to have permission to access it.
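# (example) making the uploaded objects public with s3cmd and then launching a
# cluster with the bootstrap action via the classic elastic-mapreduce ruby CLI;
# the name, instance type, and count below are placeholders, adjust to taste:
s3cmd setacl --acl-public s3://[your-s3-bucket]/install-spark-shark.sh
s3cmd setacl --acl-public s3://[your-s3-bucket]/spark-1.0.0-bin-hadoop1-emr.tgz
elastic-mapreduce --create --alive --name "spark-1.0.0-cluster" \
  --bootstrap-action s3://[your-s3-bucket]/install-spark-shark.sh \
  --instance-type m1.large --instance-count 3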
###########
# Steps for compiling lzo on AWS EMR machine images (not necessary if you have a hadoop-lzo.jar that works with the EMR AMIs)
###########
# The lzo codec depends on native libraries and the jar needs to be built against them. To do this, one option is to ssh into a running EMR cluster and follow the instructions here: https://github.com/kevinweil/hadoop-lzo#building-and-configuring.
# Specifically, do:
# git clone https://github.com/twitter/hadoop-lzo
# cd hadoop-lzo
# mvn clean install
# Then, scp the hadoop-lzo.jar file from the "target" directory to your local machine and copy it to [your-s3-bucket], maybe in a directory that reflects the architecture it was compiled for:
# s3cmd put hadoop-lzo.jar s3://[your-s3-bucket]/bootstrap/lzo/i386/hadoop-lzo.jar
# Note: when attempting to build the hadoop-lzo project you may run into an error with a jersey jar, which can be fixed by adding the following mirror in /etc/maven2/settings.xml:
# <mirror>
#   <id>glassfish-repository</id>
#   <mirrorOf>glassfish-repository</mirrorOf>
#   <name>Repository for Glassfish</name>
#   <url>https://maven.java.net/content/groups/glassfish/</url>
# </mirror>
# and also by deleting the files with the wrong content from "~/.m2/repository/net/java/jvnet-parent", based on the answer here: https://answers.atlassian.com/questions/174059/problems-in-plugin-tutorial
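# for example (assuming the whole cached jvnet-parent directory is safe to
# drop, since maven will re-fetch it from the mirror on the next build):
# rm -rf ~/.m2/repository/net/java/jvnet-parent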
# FYI (no action necessary): the lzo-related changes made to the install-spark-shark.sh script were based on an aws forums thread about installing lzo: https://forums.aws.amazon.com/thread.jspa?threadID=93856