Skip to content

Instantly share code, notes, and snippets.

@michaelmior michaelmior/start-spark.sh
Last active May 10, 2017

Embed
What would you like to do?
#!/bin/bash
# Install and start a Hadoop/HDFS/YARN + Spark cluster on the given hosts.
# Usage: start-spark.sh NameNode ResourceManager slave1 [slave2 ...]
#
# Exit on error. Set via `set` rather than the shebang so the option
# survives being invoked as `bash start-spark.sh`.
set -e

# Versions of the tarballs downloaded and installed below.
SPARK_VERSION="2.1.0-bin-hadoop2.7"
HADOOP_VERSION="2.7.3"

# Strip a Windows-style DOMAIN\user prefix from the username, if any
# (cut returns the whole string unchanged when no backslash is present).
SHORT_USER=$(echo "$USER" | cut -d '\' -f2)

# Per-user scratch directory used for HDFS storage on every host.
DATA_DIR=/var/tmp/$SHORT_USER/hdfs

# HDFS disk-bandwidth throttle in bytes/sec (50 MiB/s); 0 disables it.
IO_BPS=$((50 * 1024 * 1024))
# Append a KEY="value" line to /etc/environment on a remote host so it
# is picked up by pam_env in future (sudo) sessions.
# Arguments: $1 - hostname, $2 - line to append (e.g. FOO="bar")
function add_to_env {
  local host=$1
  local value=$2
  # tee runs under sudo because /etc/environment is root-owned.
  ssh "$host" "echo '$value' | sudo tee -a /etc/environment > /dev/null"
}
# Function to add properties to a configuration file on a remote host.
# Appends <property><name>…</name><value>…</value></property> under the
# root <configuration> element of a Hadoop XML config, editing in place.
# Arguments: $1 - hostname, $2 - property name, $3 - value, $4 - config path
function add_property {
  local host=$1
  local name=$2
  local value=$3
  local file=$4
  # xmlstarlet: create a <property> element, remember it in $new-field,
  # then add <name> and <value> children. Name/value/file are quoted on
  # the remote side so values with shell metacharacters survive intact.
  ssh "$host" "sudo xmlstarlet ed -L \
    -s '/configuration' -t elem -n property --var new-field '\$prev' \
    -s '\$new-field' -t elem -n name -v '$name' \
    -s '\$new-field' -t elem -n value -v '$value' \
    '$file'"
}
# Check for correct arguments: a NameNode, a ResourceManager, and at
# least one slave are required ($# must be >= 3), so the usage string
# lists the first slave as mandatory.
if [ "$#" -lt 3 ]; then
  echo "Usage: $0 NameNode ResourceManager slave1 [slave2 ...]" >&2
  exit 1
fi
# Fetch the Hadoop and Spark tarballs into $HOME, resuming (-c) any
# partial downloads on re-runs.
# NOTE(review): both mirror URLs are time-sensitive — the "stable"
# directory may no longer contain $HADOOP_VERSION, and the cloudfront
# Spark mirror has been retired; consider archive.apache.org instead.
echo "Downloading tarballs" >&2
wget -P ~ -c "http://apache.mirrors.spacedump.net/hadoop/common/stable/hadoop-$HADOOP_VERSION.tar.gz"
wget -P ~ -c "http://d3kbcqa49mib13.cloudfront.net/spark-$SPARK_VERSION.tgz"
# The first two positional arguments name the HDFS NameNode and the
# YARN ResourceManager; the remaining arguments (consumed after the
# install loop) are the slave hosts.
namenode=$1
resourcemanager=$2
# Install and configure Hadoop + Spark on every host on the command
# line (NameNode, ResourceManager, and all slaves alike).
for host in "$@"; do
echo "Installing on $host..." > /dev/stderr
# Enable /etc/environment with sudo: have PAM read env vars so the
# HADOOP_*/SPARK_* settings below reach daemons started via sudo.
ssh $host "echo 'session required pam_env.so readenv=1' | sudo tee -a /etc/pam.d/sudo > /dev/null"
# Properly configure environment
add_to_env $host "JAVA_HOME=\"/usr/lib/jvm/java-8-oracle/\""
add_to_env $host "HADOOP_HOME=\"/opt/hadoop-$HADOOP_VERSION\""
add_to_env $host "HADOOP_PREFIX=\"/opt/hadoop-$HADOOP_VERSION\""
add_to_env $host "HADOOP_COMMON_HOME=\"/opt/hadoop-$HADOOP_VERSION\""
add_to_env $host "HADOOP_CONF_DIR=\"/opt/hadoop-$HADOOP_VERSION/etc/hadoop\""
add_to_env $host "HADOOP_HDFS_HOME=\"/opt/hadoop-$HADOOP_VERSION\""
add_to_env $host "HADOOP_YARN_HOME=\"/opt/hadoop-$HADOOP_VERSION\""
add_to_env $host "SPARK_HOME=\"/opt/spark-$SPARK_VERSION\""
# This step is required so daemons listen on the correct interface:
# delete the host's own /etc/hosts entry so its name resolves via DNS
# rather than to a loopback/management address.
ssh $host sudo sed -i "/$host/d" /etc/hosts
# Unpack the tarballs into /opt on the remote host.
# NOTE(review): the tarballs were downloaded into the LOCAL home dir;
# this assumes $HOME is shared (e.g. NFS) across hosts — confirm,
# otherwise an scp step is missing here.
ssh $host sudo tar zxf spark-$SPARK_VERSION.tgz -C /opt
ssh $host sudo tar zxf hadoop-$HADOOP_VERSION.tar.gz -C /opt
# Install xmlstarlet to make manipulating configs easier (cgroup-bin
# is needed for the optional I/O throttling below).
ssh $host sudo apt-get update -qq && ssh $host sudo apt-get install -qq cgroup-bin xmlstarlet
# Create HDFS directory tree from scratch (wipes any previous data!)
ssh $host sudo rm -rf $DATA_DIR
ssh $host sudo mkdir -p $DATA_DIR/hdfs/datanode
ssh $host sudo mkdir -p $DATA_DIR/hdfs/namenode
ssh $host sudo mkdir -p $DATA_DIR/tmp
# Point HDFS storage at the per-user scratch directories created above.
add_property $host \
dfs.datanode.data.dir \
file://$DATA_DIR/hdfs/datanode \
/opt/hadoop-$HADOOP_VERSION/etc/hadoop/hdfs-site.xml
add_property $host \
dfs.namenode.name.dir \
file://$DATA_DIR/hdfs/namenode \
/opt/hadoop-$HADOOP_VERSION/etc/hadoop/hdfs-site.xml
# Let DataNodes register by IP, since their /etc/hosts entries were
# removed above.
add_property $host \
dfs.namenode.datanode.registration.ip-hostname-check \
false \
/opt/hadoop-$HADOOP_VERSION/etc/hadoop/hdfs-site.xml
add_property $host \
hadoop.tmp.dir \
$DATA_DIR/tmp \
/opt/hadoop-$HADOOP_VERSION/etc/hadoop/core-site.xml
# Set the NameNode and ResourceManager
add_property $host \
fs.defaultFS \
hdfs://$namenode \
/opt/hadoop-$HADOOP_VERSION/etc/hadoop/core-site.xml
add_property $host \
yarn.resourcemanager.hostname \
$resourcemanager \
/opt/hadoop-$HADOOP_VERSION/etc/hadoop/yarn-site.xml
# Configure YARN resource limits (48 GiB RAM, 10 vcores per node)
add_property $host \
yarn.nodemanager.resource.memory-mb \
49152 \
/opt/hadoop-$HADOOP_VERSION/etc/hadoop/yarn-site.xml
add_property $host \
yarn.nodemanager.resource.cpu-vcores \
10 \
/opt/hadoop-$HADOOP_VERSION/etc/hadoop/yarn-site.xml
# Disable the virtual-memory check, which otherwise kills JVM
# containers whose virtual size (not RSS) exceeds the allocation.
add_property $host \
yarn.nodemanager.vmem-check-enabled \
false \
/opt/hadoop-$HADOOP_VERSION/etc/hadoop/yarn-site.xml
done
# Remove non-slaves from arguments; "$@" now holds only the slave hosts.
shift 2

# Format HDFS and start the NameNode.
# NOTE(review): `hdfs namenode -format` prompts for confirmation when a
# previous filesystem exists; the data dir is wiped during install, so
# this should run clean — confirm behavior on re-runs.
echo "Starting NameNode on $namenode" >&2
ssh "$namenode" sudo "/opt/hadoop-$HADOOP_VERSION/bin/hdfs" namenode -format
ssh "$namenode" sudo "/opt/hadoop-$HADOOP_VERSION/sbin/hadoop-daemon.sh" start namenode
# Start DataNodes on every slave.
for slave in "$@"; do
echo "Starting DataNode on $slave" > /dev/stderr
ssh $slave sudo /opt/hadoop-$HADOOP_VERSION/sbin/hadoop-daemon.sh start datanode
# Optionally introduce a bandwidth limit on HDFS via the blkio cgroup
# controller (skipped when IO_BPS is 0).
if [ "$IO_BPS" -gt 0 ]; then
# Derive the block device backing $DATA_DIR by stripping trailing
# partition digits from the df device name (e.g. /dev/sda1 -> /dev/sda).
# NOTE(review): [:digit:] is unquoted and could glob against files in
# the cwd — safer as '[:digit:]'; also breaks for NVMe names (nvme0n1p1).
device=$(ssh $slave df $DATA_DIR | tail -1 | awk '{ print $1 }' | tr -d [:digit:])
# Mount the blkio cgroup hierarchy if it is not already mounted.
ssh $slave "sudo sh -c '(mount | grep blkio > /dev/null) || (mkdir -p /cgroup/blkio && mount -t cgroup -o blkio none /cgroup/blkio)'"
ssh $slave sudo cgcreate -g blkio:/iothrottle
# blkio throttle entries take the form "MAJOR:MINOR BYTES_PER_SEC".
# `ls -ls` on a device node prints "MAJOR," and "MINOR" in fields 6-7;
# tr turns the comma into the required colon separator.
ssh $slave sudo "cgset -r blkio.throttle.read_bps_device=\"\$(ls -ls $device | awk '{ print \$6\$7 }' | tr , :) $IO_BPS\" iothrottle"
ssh $slave sudo "cgset -r blkio.throttle.write_bps_device=\"\$(ls -ls $device | awk '{ print \$6\$7 }' | tr , :) $IO_BPS\" iothrottle"
# Move the DataNode JVM (found via jps) into the throttled cgroup.
ssh $slave "sudo sh -c 'jps | grep DataNode | cut -d \" \" -f1 > /cgroup/blkio/iothrottle/tasks'"
fi
done
# Create a directory in HDFS for Spark event logs.
ssh "$namenode" sudo "/opt/hadoop-$HADOOP_VERSION/bin/hdfs" dfs -mkdir /spark-logs

# Start the YARN ResourceManager.
echo "Starting ResourceManager on $resourcemanager" >&2
ssh "$resourcemanager" sudo "/opt/hadoop-$HADOOP_VERSION/sbin/yarn-daemon.sh" start resourcemanager
# Start a YARN NodeManager on every slave.
for slave in "$@"; do
  # Bug fix: the log line previously echoed $host, a stale variable
  # left over from the install loop; report the actual slave instead.
  echo "Starting NodeManager on $slave" >&2
  ssh "$slave" sudo "/opt/hadoop-$HADOOP_VERSION/sbin/yarn-daemon.sh" start nodemanager
done
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.