Skip to content

Instantly share code, notes, and snippets.

@michaelmior
Created May 3, 2017 18:25
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save michaelmior/56e8a903d85862a6793870f0d99070af to your computer and use it in GitHub Desktop.
Spark installation on YARN
#!/bin/bash
# Install and configure Spark on YARN across a cluster of hosts.
# Options belong in `set`, not the shebang, so they survive `bash script.sh`.
# -e: abort on error; -u: error on unset vars; -x: trace; pipefail: fail pipelines.
set -euxo pipefail

readonly SPARK_VERSION="2.1.0-bin-hadoop2.7"
readonly HADOOP_VERSION="2.7.3"
# Strip a Windows-style DOMAIN\ prefix from the username, if present
# (cut passes the whole string through when no backslash is found).
SHORT_USER=$(printf '%s' "${USER-}" | cut -d '\' -f2)
readonly SHORT_USER
# Append a line to /etc/environment on a remote host.
# Arguments:
#   $1 - remote host
#   $2 - line to append, e.g. NAME="value" (interpolated into the remote
#        command; callers pass controlled strings only)
add_to_env() {
  local host=$1
  local value=$2
  ssh "$host" "echo '$value' | sudo tee -a /etc/environment > /dev/null"
}
# Add a <property><name/><value/></property> element to a Hadoop XML
# configuration file on a remote host, editing it in place with xmlstarlet.
# Arguments:
#   $1 - remote host
#   $2 - property name
#   $3 - property value
#   $4 - remote path of the XML config file
add_property() {
  local host=$1
  local name=$2
  local value=$3
  local file=$4
  # Single-quote name/value/file inside the remote command so values with
  # shell metacharacters (e.g. file:// URIs) reach xmlstarlet as one word.
  ssh "$host" "sudo xmlstarlet ed -L \
    -s '/configuration' -t elem -n property --var new-field '\$prev' \
    -s '\$new-field' -t elem -n name -v '$name' \
    -s '\$new-field' -t elem -n value -v '$value' \
    '$file'"
}
# Check for correct arguments: NameNode, ResourceManager, and at least one
# slave are required (the first two hosts are also installed as slaves).
if [ "$#" -lt 3 ]; then
  # >&2 is the portable way to write to stderr (> /dev/stderr is not).
  echo "Usage: $0 NameNode ResourceManager slave1 [slave2 ...]" >&2
  exit 1
fi
echo "Downloading tarballs" >&2
# -c resumes partial downloads so re-running the script is cheap.
wget -P ~ -c "http://apache.mirrors.spacedump.net/hadoop/common/stable/hadoop-${HADOOP_VERSION}.tar.gz"
wget -P ~ -c "http://d3kbcqa49mib13.cloudfront.net/spark-${SPARK_VERSION}.tgz"
# Get hostnames. The NameNode and ResourceManager are also installed as
# regular cluster members, so the loop below iterates over ALL arguments.
namenode="$1"
resourcemanager="$2"
for host in "$@"; do
  echo "Installing on $host..." >&2
  # Enable /etc/environment with sudo (pam_env makes sudo read it).
  ssh "$host" "echo 'session required pam_env.so readenv=1' | sudo tee -a /etc/pam.d/sudo > /dev/null"
  # Properly configure environment
  add_to_env "$host" "JAVA_HOME=\"/usr/lib/jvm/java-8-oracle/\""
  add_to_env "$host" "HADOOP_HOME=\"/opt/hadoop-$HADOOP_VERSION\""
  add_to_env "$host" "HADOOP_PREFIX=\"/opt/hadoop-$HADOOP_VERSION\""
  add_to_env "$host" "HADOOP_COMMON_HOME=\"/opt/hadoop-$HADOOP_VERSION\""
  add_to_env "$host" "HADOOP_CONF_DIR=\"/opt/hadoop-$HADOOP_VERSION/etc/hadoop\""
  add_to_env "$host" "HADOOP_HDFS_HOME=\"/opt/hadoop-$HADOOP_VERSION\""
  add_to_env "$host" "HADOOP_YARN_HOME=\"/opt/hadoop-$HADOOP_VERSION\""
  add_to_env "$host" "SPARK_HOME=\"/opt/spark-$SPARK_VERSION\""
  # Remove the host's own /etc/hosts entry so daemons listen on the
  # correct (non-loopback) interface. Quote the whole remote command so
  # the sed script is not re-split by the remote shell.
  ssh "$host" "sudo sed -i '/$host/d' /etc/hosts"
  # NOTE(review): extraction reads the tarballs from the remote ~ —
  # presumably home directories are shared (NFS) with the machine that
  # ran the wget above; confirm for your environment.
  ssh "$host" "sudo tar zxf spark-$SPARK_VERSION.tgz -C /opt"
  ssh "$host" "sudo tar zxf hadoop-$HADOOP_VERSION.tar.gz -C /opt"
  # Install xmlstarlet to make manipulating configs easier
  ssh "$host" "sudo apt-get update -qq" && ssh "$host" "sudo apt-get install -qq xmlstarlet"
  # Recreate HDFS and temp directories from scratch
  ssh "$host" "sudo rm -rf /ssd1/$SHORT_USER/hdfs"
  ssh "$host" "sudo mkdir -p /ssd1/$SHORT_USER/hdfs/datanode"
  ssh "$host" "sudo mkdir -p /ssd1/$SHORT_USER/hdfs/namenode"
  ssh "$host" "sudo mkdir -p /ssd1/$SHORT_USER/tmp"
  add_property "$host" \
    dfs.datanode.data.dir \
    "file:///ssd1/$SHORT_USER/hdfs/datanode" \
    "/opt/hadoop-$HADOOP_VERSION/etc/hadoop/hdfs-site.xml"
  add_property "$host" \
    dfs.namenode.name.dir \
    "file:///ssd1/$SHORT_USER/hdfs/namenode" \
    "/opt/hadoop-$HADOOP_VERSION/etc/hadoop/hdfs-site.xml"
  # Allow DataNodes whose IPs don't reverse-resolve to register.
  add_property "$host" \
    dfs.namenode.datanode.registration.ip-hostname-check \
    false \
    "/opt/hadoop-$HADOOP_VERSION/etc/hadoop/hdfs-site.xml"
  add_property "$host" \
    hadoop.tmp.dir \
    "/ssd1/$SHORT_USER/tmp" \
    "/opt/hadoop-$HADOOP_VERSION/etc/hadoop/core-site.xml"
  # Set the NameNode and ResourceManager
  add_property "$host" \
    fs.defaultFS \
    "hdfs://$namenode" \
    "/opt/hadoop-$HADOOP_VERSION/etc/hadoop/core-site.xml"
  add_property "$host" \
    yarn.resourcemanager.hostname \
    "$resourcemanager" \
    "/opt/hadoop-$HADOOP_VERSION/etc/hadoop/yarn-site.xml"
  # Configure YARN resource limits (48 GiB RAM, 10 vcores per NodeManager)
  add_property "$host" \
    yarn.nodemanager.resource.memory-mb \
    49152 \
    "/opt/hadoop-$HADOOP_VERSION/etc/hadoop/yarn-site.xml"
  add_property "$host" \
    yarn.nodemanager.resource.cpu-vcores \
    10 \
    "/opt/hadoop-$HADOOP_VERSION/etc/hadoop/yarn-site.xml"
  # Disable virtual-memory checking, which commonly kills JVM containers.
  add_property "$host" \
    yarn.nodemanager.vmem-check-enabled \
    false \
    "/opt/hadoop-$HADOOP_VERSION/etc/hadoop/yarn-site.xml"
done
# Remove the NameNode/ResourceManager from the arguments; the remaining
# positional parameters are the slaves.
shift 2
# Format HDFS and start the NameNode
echo "Starting NameNode on $namenode" >&2
ssh "$namenode" "sudo /opt/hadoop-$HADOOP_VERSION/bin/hdfs namenode -format"
ssh "$namenode" "sudo /opt/hadoop-$HADOOP_VERSION/sbin/hadoop-daemon.sh start namenode"
# Start DataNodes
for slave in "$@"; do
  echo "Starting DataNode on $slave" >&2
  ssh "$slave" "sudo /opt/hadoop-$HADOOP_VERSION/sbin/hadoop-daemon.sh start datanode"
done
# Create a directory for Spark event logs
ssh "$namenode" "sudo /opt/hadoop-$HADOOP_VERSION/bin/hdfs dfs -mkdir /spark-logs"
# Start ResourceManager
echo "Starting ResourceManager on $resourcemanager" >&2
ssh "$resourcemanager" "sudo /opt/hadoop-$HADOOP_VERSION/sbin/yarn-daemon.sh start resourcemanager"
# Start NodeManagers
for slave in "$@"; do
  # BUG FIX: this message previously printed $host (stale value from the
  # install loop) instead of the current loop variable $slave.
  echo "Starting NodeManager on $slave" >&2
  ssh "$slave" "sudo /opt/hadoop-$HADOOP_VERSION/sbin/yarn-daemon.sh start nodemanager"
done
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment