Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save eyelove/394298e8d9d374051554 to your computer and use it in GitHub Desktop.
Save eyelove/394298e8d9d374051554 to your computer and use it in GitHub Desktop.
Bootstrap script for installing Apache Zeppelin on an Amazon EMR cluster. Verified on Amazon EMR release 4.x.
#!/bin/bash
#
# Amazon EMR bootstrap action: build and start Apache Zeppelin.
#
# All work happens on the master node only; on any other node (or when the
# instance metadata cannot be read) the script does nothing and succeeds.
#
# Steps performed on the master node:
#   1. Install Git (yum) and Apache Maven (binary tarball under /opt).
#   2. Clone Zeppelin into /home/hadoop/zeppelin and build it with Maven.
#   3. Append EMR-specific settings to conf/zeppelin-env.sh, carrying selected
#      Spark properties from spark-defaults.conf over as -D JVM options.
#   4. Apply a patch that adds Hadoop and EMRFS jars to the interpreter
#      classpath.
#   5. Start the Zeppelin daemon.

# -e: stop at the first failing command; -x: trace every command.
# Set here rather than in the shebang so the options also apply when the
# script is started as `bash <script>`.
set -ex

readonly INSTANCE_INFO=/mnt/var/lib/info/instance.json
readonly MAVEN_VERSION=3.3.3
readonly MAVEN_TARBALL="apache-maven-${MAVEN_VERSION}-bin.tar.gz"
# NOTE(review): third-party mirror over plain HTTP, no checksum verification.
# Consider the official Apache archive plus a checksum check — confirm the
# mirror is still reachable before relying on it.
readonly MAVEN_URL="http://apache.mirrors.spacedump.net/maven/maven-3/${MAVEN_VERSION}/binaries/${MAVEN_TARBALL}"
readonly MAVEN_PREFIX=/opt/apache-maven
readonly MAVEN_HOME_DIR="${MAVEN_PREFIX}/apache-maven-${MAVEN_VERSION}"
readonly ZEPPELIN_DIR=/home/hadoop/zeppelin
readonly SPARK_DEFAULTS=/usr/lib/spark/conf/spark-defaults.conf
readonly PATCH_FILE=0001-Add-Hadoop-libraries-and-EMRFS-to-Zeppelin-classpath.patch
# Spark properties that are forwarded to Zeppelin as -D JVM options.
readonly -a SPARK_KEYS=(
  spark.executor.instances
  spark.executor.cores
  spark.executor.memory
  spark.default.parallelism
  spark.yarn.executor.memoryOverhead
)

#######################################
# Tell whether this instance is the cluster's master node.
# Globals:   INSTANCE_INFO (read)
# Returns:   0 if the instance metadata says isMaster == true, non-zero
#            otherwise (including when the metadata cannot be read).
#######################################
is_master_node() {
  local flag
  flag=$(jq -r .isMaster "${INSTANCE_INFO}") || return 1
  [[ "${flag}" == "true" ]]
}

#######################################
# Print a "-D<key>=<value>" JVM option for every requested property that is
# set in a whitespace-separated properties file (spark-defaults.conf style).
# Only lines whose first field is exactly one of the requested keys count, so
# commented-out lines and keys that merely contain a requested key as a
# substring are ignored. If a key appears more than once the last value wins.
# Arguments: $1  - path to the properties file
#            $2+ - property names to look up
# Outputs:   one option per line on stdout, in the order the names were
#            given; nothing if the file does not exist.
#######################################
spark_defaults_to_java_opts() {
  local conf=$1
  shift
  [[ -f "${conf}" ]] || return 0
  awk -v wanted="$*" '
    BEGIN {
      n = split(wanted, keys, " ")
      for (i = 1; i <= n; i++) want[keys[i]] = 1
    }
    ($1 in want) && NF >= 2 { value[$1] = $2 }
    END {
      for (i = 1; i <= n; i++)
        if (keys[i] in value) print "-D" keys[i] "=" value[keys[i]]
    }
  ' "${conf}"
}

#######################################
# Print the EMR-specific lines to append to conf/zeppelin-env.sh.
# Arguments: JVM options to place in ZEPPELIN_JAVA_OPTS (may be none).
# Outputs:   shell `export` lines on stdout.
#######################################
render_zeppelin_env() {
  # PYTHONPATH is escaped on purpose: it must be evaluated when Zeppelin
  # sources zeppelin-env.sh, not while this bootstrap script runs. The :+ form
  # avoids a leading ":" (an empty path entry) when PYTHONPATH is unset.
  cat <<EOF
export MASTER=yarn-client
export HADOOP_HOME=/usr/lib/hadoop
export HADOOP_CONF_DIR=/etc/hadoop/conf
export ZEPPELIN_SPARK_USEHIVECONTEXT=false
export ZEPPELIN_JAVA_OPTS="$*"
export PYTHONPATH=\${PYTHONPATH:+\${PYTHONPATH}:}/usr/lib/spark/python
#export ZEPPELIN_NOTEBOOK_S3_BUCKET=<myZeppelinBucket>
#export ZEPPELIN_NOTEBOOK_USER=<myZeppelinUser>
EOF
}

#######################################
# Download and unpack Maven, register it in the hadoop user's .bashrc and
# make it available to the rest of this script.
# Globals:   MAVEN_* (read); MAVEN_HOME, PATH (exported)
#######################################
install_maven() {
  # -O overwrites a stale download instead of saving "<name>.1" next to it.
  wget -O "/tmp/${MAVEN_TARBALL}" "${MAVEN_URL}"
  sudo mkdir -p "${MAVEN_PREFIX}"
  sudo tar -xvzf "/tmp/${MAVEN_TARBALL}" -C "${MAVEN_PREFIX}"
  cat <<EOF >> /home/hadoop/.bashrc
# Maven
export MAVEN_HOME=/opt/apache-maven/apache-maven-3.3.3
export PATH=\$MAVEN_HOME/bin:\$PATH
EOF
  # Export directly instead of sourcing .bashrc: an interactive rc file is not
  # written to run under `set -e`, and any failing command in it would abort
  # the bootstrap.
  export MAVEN_HOME="${MAVEN_HOME_DIR}"
  export PATH="${MAVEN_HOME}/bin:${PATH}"
}

#######################################
# Write the classpath patch (git format-patch mail) to the given file.
# Arguments: $1 - destination path
#######################################
write_classpath_patch() {
  # NOTE(review): the patch text is reproduced unchanged. Its diff context
  # lines carry no leading space/indentation and the blank lines such a mail
  # normally contains are absent, which looks like whitespace lost in
  # copy/paste. Verify against a pristine copy; `git am` is likely to reject
  # the patch in this form.
  cat <<'EOF' > "$1"
From 2b0226e45207758d526522bd22d497c9def7c008 Mon Sep 17 00:00:00 2001
From: Anders Hammar <anders.hammar@gmail.com>
Date: Fri, 18 Sep 2015 10:24:18 +0000
Subject: [PATCH] Add Hadoop libraries and EMRFS to Zeppelin classpath
---
bin/interpreter.sh | 13 +++++++++++++
1 file changed, 13 insertions(+)
diff --git a/bin/interpreter.sh b/bin/interpreter.sh
index e03a13b..de458f2 100755
--- a/bin/interpreter.sh
+++ b/bin/interpreter.sh
@@ -89,8 +89,21 @@ if [[ "${INTERPRETER_ID}" == "spark" ]]; then
# CDH
addJarInDir "${HADOOP_HOME}"
addJarInDir "${HADOOP_HOME}/lib"
+
+ # Hadoop libraries
+ addJarInDir "${HADOOP_HOME}/../hadoop-hdfs"
+ addJarInDir "${HADOOP_HOME}/../hadoop-mapreduce"
+ addJarInDir "${HADOOP_HOME}/../hadoop-yarn"
+
+ # Hadoop LZO
+ addJarInDir "${HADOOP_HOME}/../hadoop-lzo/lib"
fi
+ # Add EMRFS libraries
+ addJarInDir "/usr/share/aws/emr/emrfs/conf"
+ addJarInDir "/usr/share/aws/emr/emrfs/lib"
+ addJarInDir "/usr/share/aws/emr/emrfs/auxlib"
+
addJarInDir "${INTERPRETER_DIR}/dep"
PYSPARKPATH="${ZEPPELIN_HOME}/interpreter/spark/pyspark/pyspark.zip:${ZEPPELIN_HOME}/interpreter/spark/pyspark/py4j-0.8.2.1-src.zip"
--
2.1.0
EOF
}

#######################################
# Entry point: run the whole installation on the master node.
# Returns:   0 immediately on non-master nodes.
#######################################
main() {
  if ! is_master_node; then
    return 0
  fi

  # Install Git
  sudo yum -y install git

  # Install Maven
  install_maven

  # Install Zeppelin
  # NOTE(review): this builds whatever the default branch currently holds,
  # while the patch applied below targets one specific revision of
  # bin/interpreter.sh — consider pinning a tag or commit.
  git clone https://github.com/apache/incubator-zeppelin.git "${ZEPPELIN_DIR}"
  cd "${ZEPPELIN_DIR}"
  mvn clean package -Pspark-1.4 -Dhadoop.version=2.6.0 -Phadoop-2.6 -Pyarn -DskipTests

  # Configure Zeppelin
  local -a java_opts=()
  mapfile -t java_opts < <(
    spark_defaults_to_java_opts "${SPARK_DEFAULTS}" "${SPARK_KEYS[@]}"
  )
  printf '%s\n' "${java_opts[*]}"
  cp conf/zeppelin-env.sh.template conf/zeppelin-env.sh
  render_zeppelin_env "${java_opts[@]}" >> conf/zeppelin-env.sh

  # Patch the interpreter launcher; `git am` needs a committer identity.
  write_classpath_patch "${PATCH_FILE}"
  git config user.email "eyelove@gmail.com"
  git config user.name "Kim DeokJung"
  git am "${PATCH_FILE}"

  # Start the Zeppelin daemon
  bin/zeppelin-daemon.sh start
}

main "$@"