Skip to content

Instantly share code, notes, and snippets.

@dlecocq
Created July 13, 2012 21:45
Show Gist options
  • Save dlecocq/3107736 to your computer and use it in GitHub Desktop.
Save dlecocq/3107736 to your computer and use it in GitHub Desktop.
Accumulo Bootstrap
# Be sure to export the following before running this:
# HOSTNAME = 'name of the host'
# NAMENODE = 'IP address of the name node'
# JOBTRACK = 'IP address of job tracker'
# ZOOKEEPR = 'IP address of zoo keeper'
# MASTER = 'Internal domain name of master'
# TABLETS = 'newline-separated list of tablet servers'
# First, we should set the hostname of the machine
# (quoted so an empty/odd value fails loudly rather than silently word-splitting)
sudo hostname "$HOSTNAME"
# Map the hostname onto loopback so local daemons can resolve it
echo "127.0.0.1 $HOSTNAME" | sudo tee -a /etc/hosts
# The first step from http://www.accumulodata.com/ec2.html
# -cs prints just the release codename (e.g. "lucid"), so no awk parsing
# of the "Codename:<tab>..." line is needed; $(...) replaces backticks
RELEASE=$(lsb_release -cs)
curl -s http://archive.cloudera.com/debian/archive.key | sudo apt-key add -
sudo apt-get install python-software-properties -y
sudo add-apt-repository "deb http://archive.canonical.com/ $RELEASE partner"
sudo add-apt-repository "deb http://archive.ubuntu.com/ubuntu/ $RELEASE multiverse"
sudo add-apt-repository "deb http://archive.cloudera.com/debian $RELEASE-cdh3u2 contrib"
sudo apt-get update
# Pre-accept the Sun DLJ license so the JDK install is non-interactive
cat << EOD | sudo debconf-set-selections
sun-java6-jdk shared/accepted-sun-dlj-v1-1 select true
sun-java6-jre shared/accepted-sun-dlj-v1-1 select true
EOD
sudo dpkg --set-selections <<EOS
sun-java6-jdk install
EOS
# Now the actual packages we were after
sudo apt-get install -y gcc g++ hadoop-0.20 hadoop-0.20-datanode hadoop-0.20-tasktracker \
hadoop-zookeeper hadoop-zookeeper-server hadoop-0.20-namenode hadoop-0.20-jobtracker
export ACCUMULO_VERSION=1.4.1
# Download the distribution of accumulo
cd && curl -OL "http://mirror.cc.columbia.edu/pub/software/apache/accumulo/$ACCUMULO_VERSION/accumulo-$ACCUMULO_VERSION-dist.tar.gz"
# Unpack and link (-sfn so a rerun replaces a stale symlink instead of
# failing, or creating a nested link inside the old target directory)
tar -xzf "accumulo-$ACCUMULO_VERSION-dist.tar.gz"
ln -sfn "accumulo-$ACCUMULO_VERSION" accumulo
# And copy the jars Hadoop needs onto its classpath
sudo cp "accumulo/lib/accumulo-core-$ACCUMULO_VERSION.jar" /usr/lib/hadoop/lib/
sudo cp accumulo/lib/log4j-1.2.16.jar /usr/lib/hadoop/lib/
sudo cp accumulo/lib/libthrift-0.6.1.jar /usr/lib/hadoop/lib/
sudo cp "accumulo/lib/cloudtrace-$ACCUMULO_VERSION.jar" /usr/lib/hadoop/lib/
sudo cp /usr/lib/zookeeper/zookeeper.jar /usr/lib/hadoop/lib/
# Swappiness: keep the kernel from swapping out JVM heap pages
sudo sysctl -w vm.swappiness=0
# And raise open-file-descriptor limits for the ubuntu user
# (printf replaces the non-portable `echo -e`; output is byte-identical)
printf 'ubuntu\t\tsoft\tnofile\t65536\n' | sudo tee --append /etc/security/limits.conf
printf 'ubuntu\t\thard\tnofile\t65536\n' | sudo tee --append /etc/security/limits.conf
sudo apt-get install xfsprogs -y
# Let's set up the first drive...
# NOTE(review): umount fails harmlessly if the device isn't mounted yet;
# mkfs -f then reformats unconditionally — data on /dev/sdb is destroyed
sudo umount /mnt
sudo /sbin/mkfs.xfs -f /dev/sdb
sudo mount -o noatime /dev/sdb /mnt
# And now for the second
sudo umount /mnt2
sudo /sbin/mkfs.xfs -f /dev/sdc
sudo mount -o noatime /dev/sdc /mnt2
# Make sure we own them
sudo chown -R ubuntu /mnt
sudo chown -R ubuntu /mnt2
# Write-ahead log directory for the accumulo logger
# (-p so a rerun doesn't fail when the directory already exists)
mkdir -p /mnt/walogs
# HDFS, obviously
mkdir -p /mnt/hdfs && sudo chown -R hdfs /mnt/hdfs
mkdir -p /mnt2/hdfs && sudo chown -R hdfs /mnt2/hdfs
mkdir -p /mnt/namenode && sudo chown -R hdfs /mnt/namenode
# And map-reduce
mkdir -p /mnt/mapred && sudo chown -R mapred /mnt/mapred
mkdir -p /mnt2/mapred && sudo chown -R mapred /mnt2/mapred
########
# Hadoop
# Point hadoop at the right JVM
echo 'export JAVA_HOME=/usr/lib/jvm/java-6-openjdk/' | sudo tee -a /etc/hadoop/conf/hadoop-env.sh
# Strip the closing </configuration> tag from core-site.xml, then append
# our property followed by a fresh closing tag (heredoc expands $NAMENODE)
sudo sed -i 's/<\/configuration>//' /etc/hadoop/conf/core-site.xml
sudo tee -a /etc/hadoop/conf/core-site.xml <<EOF

<property>
<name>fs.default.name</name>
<value>hdfs://$NAMENODE:9000</value>
</property>
</configuration>
EOF
######
# HDFS
# Same closing-tag dance for hdfs-site.xml: delete </configuration>,
# append name/data dirs and replication, then close the document again
sudo sed -i 's/<\/configuration>//' /etc/hadoop/conf/hdfs-site.xml
sudo tee -a /etc/hadoop/conf/hdfs-site.xml <<EOF

<property>
<name>dfs.name.dir</name>
<value>/mnt/namenode</value>
</property>
<property>
<name>dfs.data.dir</name>
<value>/mnt/hdfs,/mnt2/hdfs</value>
</property>
<property>
<name>dfs.replication</name>
<value>3</value>
</property>
</configuration>
EOF
###########
# MapReduce
# Same closing-tag dance for mapred-site.xml: drop </configuration>,
# append the job tracker, local dirs and child JVM heap, then re-close
# (heredoc expands $JOBTRACK)
sudo sed -i 's/<\/configuration>//' /etc/hadoop/conf/mapred-site.xml
sudo tee -a /etc/hadoop/conf/mapred-site.xml <<EOF

<property>
<name>mapred.job.tracker</name>
<value>$JOBTRACK:9001</value>
</property>
<property>
<name>mapred.local.dir</name>
<value>/mnt/mapred,/mnt2/mapred</value>
</property>
<property>
<name>mapred.child.java.opts</name>
<value>-Xmx1024m</value>
</property>
</configuration>
EOF
###########
# ZooKeeper
# Allow many concurrent connections; accumulo opens a lot of them
echo "maxClientCnxns=250" | sudo tee -a /etc/zookeeper/zoo.cfg
# (a stray bare `maxClientCnxns=250` line followed here — it only assigned
# an unused shell variable, so it has been removed)
# Add each of these to accumulo/conf/accumulo-env.sh
# (the quoted 'EOF' delimiter keeps the $VARs literal, so they are
# evaluated when accumulo-env.sh is sourced — not now)
sudo tee -a accumulo/conf/accumulo-env.sh <<'EOF'
test -z "$JAVA_HOME" && export JAVA_HOME="/usr/lib/jvm/java-6-openjdk/jre/"
test -z "$HADOOP_HOME" && export HADOOP_HOME="/usr/lib/hadoop/"
test -z "$ZOOKEEPER_HOME" && export ZOOKEEPER_HOME="/usr/lib/zookeeper/"
EOF
# Update accumulo/conf/accumulo-site.xml (heredoc expands $ZOOKEEPR)
# NOTE(review): unlike the hadoop configs above, no closing
# </configuration> tag is stripped/re-added here — confirm the dist's
# accumulo-site.xml tolerates a plain append
sudo tee -a accumulo/conf/accumulo-site.xml <<EOF
<property>
<name>instance.zookeeper.host</name>
<value>$ZOOKEEPR:2181</value>
</property>
<property>
<name>logger.dir.walog</name>
<value>/mnt/walogs</value>
</property>
<property>
<name>instance.secret</name>
<value>DEFAULT</value>
</property>
<property>
<name>tserver.memory.maps.max</name>
<value>1G</value>
</property>
EOF
# Internal DNS name of master in accumulo/conf/masters
echo "$MASTER" | sudo tee -a accumulo/conf/masters
# Tablet server DNS names, one per line: quoting $TABLETS preserves its
# embedded newlines — unquoted, echo would word-split the list and write
# every tablet server onto a single space-separated line
echo "$TABLETS" | sudo tee -a accumulo/conf/slaves
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment