Skip to content

Instantly share code, notes, and snippets.

@mdaniel
Last active March 26, 2017 23:57
Show Gist options
  • Save mdaniel/7220157 to your computer and use it in GitHub Desktop.
Save mdaniel/7220157 to your computer and use it in GitHub Desktop.
Vagrantfile.sh script to provision an EMR compatible (hadoop 1.0.3, JavaSE 1.6) Vagrant instance
#! /bin/sh
## Provisioning script: aborts on the first failing command (-e) and traces
## every command it runs (-x).
set -e
set -x
## this script is run as `sudo /tmp/vagrant-shell` by ~vagrant
#echo '[USER]'
#id -a
#echo '[ENV]'
#env
## this presumes you have a vboxnet through which 'vagrant ssh' will connect to your VM
## Extract the 192.168.x.x address from the "addr:" field of ifconfig output.
## The dots are escaped so they only match literal dots, and `head -n 1`
## keeps MY_IP to a single line even when several interfaces match (a
## multi-line value would break the sed expression that later edits /etc/hosts).
MY_IP=$(/sbin/ifconfig -a \
  | sed -ne '/addr:192/s/.*addr:\(192\.168\.[0-9.]*\) .*/\1/p' \
  | head -n 1)
if test -z "$MY_IP"
then
  echo "Unable to sniff our your vboxnet interface" >&2
  exit 1
fi
## Fixed locations used throughout the rest of the script.
## HH is just shorthand for the hadoop user's home, since it is used a lot.
HH="/home/hadoop"
## Ownership of the directories created under these paths is not a concern
## here; it is cleaned up at the end of the script.
HADOOP_HOME="${HH}/.versions/1.0.3"
# java-6-sun-1.6.0.26
# Java(TM) SE Runtime Environment (build 1.6.0_26-b03)
# Java HotSpot(TM) 64-Bit Server VM (build 20.1-b02, mixed mode)
JAVA_HOME="/usr/lib/jvm/java-6-sun-1.6.0.26"
## where the hadoop tarball and the JDK installer are expected
BINARY_DIR="/binaries"
## this should be a pointer to ~/.vagrant.d
VAGRANT_D="/vagrant_home"
## $HOSTNAME is, strangely enough, not set for us, so ask hostname(1)
HOSTNAME="$(hostname)"
## TODO only check these if in "provision" mode
## Stop at the first installer input that is not present as a regular file.
for required in \
    "${BINARY_DIR}/hadoop-1.0.3.tar.gz" \
    "${BINARY_DIR}/jdk-6u26-linux-x64.bin" \
    "${VAGRANT_D}/insecure_private_key"
do
  [ -f "$required" ] && continue
  echo "Required file is missing: $required" >&2
  exit 1
done
## set my hostname to the actual IP address, to prevent Hadoop from listening on localhost
if ! grep -q "${MY_IP}.${HOSTNAME}" /etc/hosts
then
  ## rewrite every line that mentions our hostname; the previous file is
  ## kept as /etc/hosts.bak
  sed -i.bak -e "/${HOSTNAME}/s/.*/${MY_IP} ${HOSTNAME}/" /etc/hosts
  ## when no line mentioned our hostname the sed above changed nothing,
  ## so add the mapping explicitly instead of silently leaving it out
  if ! grep -q "${MY_IP}.${HOSTNAME}" /etc/hosts
  then
    printf '%s %s\n' "${MY_IP}" "${HOSTNAME}" >> /etc/hosts
  fi
fi
## Create the hadoop account unless /etc/passwd already has it.  The trailing
## ':' anchors the match to the complete login name, so an unrelated account
## whose name merely starts with "hadoop" does not suppress the creation.
if ! grep -q '^hadoop:' /etc/passwd
then
  ## doesn't matter what the password is because vagrant can sudo into it
  useradd -s /bin/bash -m hadoop
fi
## Unpack the Hadoop tarball next to $HADOOP_HOME and rename the extracted
## hadoop-1.0.3 directory to 1.0.3; nothing happens if $HADOOP_HOME exists.
if [ ! -d "$HADOOP_HOME" ]; then
  versions_dir=$(dirname "$HADOOP_HOME")
  mkdir -p "$versions_dir"
  tar xzf "${BINARY_DIR}/hadoop-1.0.3.tar.gz" -C "$versions_dir"
  mv "${versions_dir}/hadoop-1.0.3" "${versions_dir}/1.0.3"
  unset versions_dir
fi
## Install the JDK from the self-extracting .bin into $JAVA_HOME and register
## each of its bin/ programs with update-alternatives.  Skipped entirely when
## $JAVA_HOME already exists.
if test ! -d "$JAVA_HOME"
then
  DN=$(dirname "$JAVA_HOME")
  BN=$(basename "$JAVA_HOME")
  mkdir -p "$DN"
  ## We run as root, so stage the installer in a private mktemp directory
  ## instead of a fixed, guessable path directly under /tmp.  The installer
  ## keeps its original file name inside that directory.
  TMPD=$(mktemp -d)
  TMPF="${TMPD}/jdk-6u26-linux-x64.bin"
  cp "${BINARY_DIR}/jdk-6u26-linux-x64.bin" "$TMPF"
  chmod 755 "$TMPF"
  # the .bin expands to PWD so change there in a subshell;
  # each step only runs if the previous one succeeded
  (cd "$DN" && "$TMPF" -x && mv jdk1.6.0* "$BN")
  unset DN
  unset BN
  rm -rf "$TMPD"
  unset TMPF
  unset TMPD
  chown -R 0:0 "$JAVA_HOME"
  ## openjdk priority is 1xyzRRRR where (x, y, z, %04r)
  ## so our priority needs to be higher than theirs
  pri=91600026
  for i in "$JAVA_HOME"/bin/*
  do
    bn=$(basename "$i")
    update-alternatives --install "/usr/bin/${bn}" "${bn}" "$i" "$pri"
  done
  unset pri
fi
## Create the /mnt/var directories (as used by EMR), owned by hadoop:
##   /mnt/var/hadoop - home of hadoop.tmp.dir, so it won't get blown away on reboots
##   /mnt/var/log    - exists on EMR even though we don't currently use it
for emr_dir in /mnt/var/hadoop /mnt/var/log
do
  if [ ! -d "$emr_dir" ]; then
    mkdir -p "$emr_dir"
    chown -R hadoop "$emr_dir"
  fi
done
unset emr_dir
## Generate ~hadoop/.bashrc.hadoop once.  $JAVA_HOME and $HADOOP_HOME are
## expanded now, while the backslash-escaped references are written out
## literally and expand only when the generated file is sourced.
if [ ! -f "${HH}/.bashrc.hadoop" ]; then
  ## the ownership is fixed below
  cat <<EOD > "${HH}/.bashrc.hadoop"
JAVA_HOME=$JAVA_HOME
export JAVA_HOME
## not strictly needed because of the update-alternatives above
PATH=\$JAVA_HOME/bin:\$PATH
HADOOP_HOME=$HADOOP_HOME
HADOOP_HOME_WARN_SUPPRESS=1
export HADOOP_HOME
export HADOOP_HOME_WARN_SUPPRESS
## amazon puts hadoop-bin at the end of the path for some reason
PATH=\$PATH:\$HADOOP_HOME/bin
EOD
fi
## Install a wrapper ~hadoop/.bashrc that sources .bashrc.hadoop and then the
## stock file, which is preserved as .bashrc.ubuntu.
## The marker we look for must be text the wrapper really contains
## (".bashrc.hadoop", matched as a fixed string).  Otherwise every
## re-provision would move the wrapper itself over .bashrc.ubuntu, and the
## wrapper would end up sourcing itself.
## stderr is silenced on purpose: a missing .bashrc just means "not wrapped yet".
if ! grep -q -F '.bashrc.hadoop' "${HH}/.bashrc" 2>/dev/null
then
  ## only preserve a stock .bashrc if there is one
  if test -f "${HH}/.bashrc"
  then
    mv "${HH}/.bashrc" "${HH}/.bashrc.ubuntu"
  fi
  cat > "${HH}/.bashrc" <<'EOD'
# test and source ours first because the ubuntu one exits
# in non-interactive shells, which is exactly when we want to set JAVA_HOME
test -f ~/.bashrc.hadoop && source ~/.bashrc.hadoop
test -f ~/.bashrc.ubuntu && source ~/.bashrc.ubuntu
EOD
fi
## (Re)write conf/core-site.xml unless it already mentions hadoop.tmp.dir:
## hadoop.tmp.dir goes to /mnt/var/hadoop and fs.default.name points at
## this host on port 9000.
grep -q hadoop.tmp.dir "${HADOOP_HOME}/conf/core-site.xml" ||
cat > "${HADOOP_HOME}/conf/core-site.xml" <<EOD
<configuration>
<property>
<name>hadoop.tmp.dir</name>
<value>/mnt/var/hadoop</value>
</property>
<property>
<name>fs.default.name</name>
<value>hdfs://${HOSTNAME}:9000</value>
</property>
</configuration>
EOD
## (Re)write conf/hdfs-site.xml unless it already mentions dfs.replication;
## the replication factor is set to 1.
grep -q dfs.replication "${HADOOP_HOME}/conf/hdfs-site.xml" ||
cat > "${HADOOP_HOME}/conf/hdfs-site.xml" <<EOD
<configuration>
<property>
<name>dfs.replication</name>
<value>1</value>
</property>
</configuration>
EOD
## (Re)write conf/mapred-site.xml unless it already mentions job.tracker;
## mapred.job.tracker points at this host on port 9001.
grep -q job.tracker "${HADOOP_HOME}/conf/mapred-site.xml" ||
cat > "${HADOOP_HOME}/conf/mapred-site.xml" <<EOD
<configuration>
<property>
<name>mapred.job.tracker</name>
<value>${HOSTNAME}:9001</value>
</property>
</configuration>
EOD
## Pre-populate ~hadoop/.ssh/known_hosts with the host keys of 127.0.0.1,
## localhost and this machine's hostname.
if test ! -f "${HH}/.ssh/known_hosts"
then
  ## -p tolerates an already existing directory, but unlike "|| true" it
  ## still lets a real failure abort the script (set -e is in effect)
  mkdir -p "${HH}/.ssh"
  chmod 700 "${HH}/.ssh"
  ## exactly one host per line, with no trailing empty entry
  printf '%s\n' 127.0.0.1 localhost "${HOSTNAME}" | \
    ssh-keyscan -f /dev/stdin > "${HH}/.ssh/known_hosts"
fi
## Use the Vagrant insecure key as the hadoop user's SSH identity and
## authorize its public half for logins to this same account.
if test ! -f "${HH}/.ssh/id_rsa"
then
  cp "${VAGRANT_D}/insecure_private_key" "${HH}/.ssh/id_rsa"
  ## tighten the mode before ssh-keygen reads the key: the copy can carry
  ## group/other permission bits from its source, and OpenSSH refuses to
  ## load private keys that are accessible by others
  chmod 600 "${HH}/.ssh/id_rsa"
  ssh-keygen -y -f "${HH}/.ssh/id_rsa" > "${HH}/.ssh/id_rsa.pub"
  cat "${HH}/.ssh/id_rsa.pub" >> "${HH}/.ssh/authorized_keys"
fi
chmod 600 "${HH}"/.ssh/*
## Generate ~hadoop/start-hadoop.sh, which formats the namenode and then
## starts all daemons.  ${HADOOP_HOME} is expanded now, so the generated
## script contains absolute paths.  The interpreter line lets the file be
## exec'ed directly rather than relying on the calling shell's fallback
## for scripts without one.
cat > "${HH}/start-hadoop.sh" <<EOD
#!/bin/sh
${HADOOP_HOME}/bin/hadoop namenode -format
${HADOOP_HOME}/bin/start-all.sh
EOD
chmod 755 "${HH}/start-hadoop.sh"
## everything above ran as root; hand the whole home directory to hadoop
chown -R hadoop "${HH}"
## Reference only: the here-document below is fed to the no-op ':' command,
## so it is never executed or printed.
: <<'COMMENT'
## this is the output of running "env" on an EMR instance
TERM=xterm
SHELL=/bin/bash
HADOOP_HOME=/home/hadoop
USER=hadoop
LD_LIBRARY_PATH=/usr/local/cuda/lib64:/usr/local/cuda/lib:
MAIL=/var/mail/hadoop
PATH=/usr/local/cuda/bin:/usr/local/bin:/usr/bin:/bin:/usr/local/games:/usr/games:/home/hadoop/bin
PWD=/home/hadoop
JAVA_HOME=/usr/lib/jvm/java-6-sun
LANG=en_US.UTF-8
SHLVL=1
HOME=/home/hadoop
LOGNAME=hadoop
HADOOP_HOME_WARN_SUPPRESS=true
_=/usr/bin/env
COMMENT
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment