Skip to content

Instantly share code, notes, and snippets.

@dlecocq
Created July 13, 2012 21:45
Show Gist options
  • Save dlecocq/3107736 to your computer and use it in GitHub Desktop.
Save dlecocq/3107736 to your computer and use it in GitHub Desktop.
Accumulo Bootstrap
# Be sure to export the following before running this:
# HOSTNAME = 'name of the host'
# NAMENODE = 'IP address of the name node'
# JOBTRACK = 'IP address of job tracker'
# ZOOKEEPR = 'IP address of zoo keeper'
# MASTER = 'Internal domain name of master'
# TABLETS = 'newline-separated list of tablet servers'
# First, we should set the hostname of the machine
# (quoted so an empty/odd value fails loudly rather than silently word-splitting)
sudo hostname "$HOSTNAME"
# Map the hostname onto loopback so local daemons can resolve it
echo "127.0.0.1 $HOSTNAME" | sudo tee -a /etc/hosts
# The first step from http://www.accumulodata.com/ec2.html
# -cs prints just the release codename (e.g. "lucid"), so no awk parsing
# of the "Codename:<tab>..." line is needed; $(...) replaces backticks
RELEASE=$(lsb_release -cs)
curl -s http://archive.cloudera.com/debian/archive.key | sudo apt-key add -
sudo apt-get install python-software-properties -y
sudo add-apt-repository "deb http://archive.canonical.com/ $RELEASE partner"
sudo add-apt-repository "deb http://archive.ubuntu.com/ubuntu/ $RELEASE multiverse"
sudo add-apt-repository "deb http://archive.cloudera.com/debian $RELEASE-cdh3u2 contrib"
sudo apt-get update
# Pre-accept the Sun DLJ license so the JDK install is non-interactive
cat << EOD | sudo debconf-set-selections
sun-java6-jdk shared/accepted-sun-dlj-v1-1 select true
sun-java6-jre shared/accepted-sun-dlj-v1-1 select true
EOD
sudo dpkg --set-selections <<EOS
sun-java6-jdk install
EOS
# Now the actual packages we were after
sudo apt-get install -y gcc g++ hadoop-0.20 hadoop-0.20-datanode hadoop-0.20-tasktracker \
hadoop-zookeeper hadoop-zookeeper-server hadoop-0.20-namenode hadoop-0.20-jobtracker
export ACCUMULO_VERSION=1.4.1
# Download the distribution of accumulo
cd && curl -OL "http://mirror.cc.columbia.edu/pub/software/apache/accumulo/$ACCUMULO_VERSION/accumulo-$ACCUMULO_VERSION-dist.tar.gz"
# Unpack and link (-sfn so a rerun replaces a stale symlink instead of
# failing, or creating a nested link inside the old target directory)
tar -xzf "accumulo-$ACCUMULO_VERSION-dist.tar.gz"
ln -sfn "accumulo-$ACCUMULO_VERSION" accumulo
# And copy the jars Hadoop needs onto its classpath
sudo cp "accumulo/lib/accumulo-core-$ACCUMULO_VERSION.jar" /usr/lib/hadoop/lib/
sudo cp accumulo/lib/log4j-1.2.16.jar /usr/lib/hadoop/lib/
sudo cp accumulo/lib/libthrift-0.6.1.jar /usr/lib/hadoop/lib/
sudo cp "accumulo/lib/cloudtrace-$ACCUMULO_VERSION.jar" /usr/lib/hadoop/lib/
sudo cp /usr/lib/zookeeper/zookeeper.jar /usr/lib/hadoop/lib/
# Swappiness: keep the kernel from swapping out JVM heap pages
sudo sysctl -w vm.swappiness=0
# And raise open-file-descriptor limits for the ubuntu user
# (printf replaces the non-portable `echo -e`; output is byte-identical)
printf 'ubuntu\t\tsoft\tnofile\t65536\n' | sudo tee --append /etc/security/limits.conf
printf 'ubuntu\t\thard\tnofile\t65536\n' | sudo tee --append /etc/security/limits.conf
sudo apt-get install xfsprogs -y
# Let's set up the first drive...
# NOTE(review): umount fails harmlessly if the device isn't mounted yet;
# mkfs -f then reformats unconditionally — data on /dev/sdb is destroyed
sudo umount /mnt
sudo /sbin/mkfs.xfs -f /dev/sdb
sudo mount -o noatime /dev/sdb /mnt
# And now for the second
sudo umount /mnt2
sudo /sbin/mkfs.xfs -f /dev/sdc
sudo mount -o noatime /dev/sdc /mnt2
# Make sure we own them
sudo chown -R ubuntu /mnt
sudo chown -R ubuntu /mnt2
# Write-ahead log directory for the accumulo logger
# (-p so a rerun doesn't fail when the directory already exists)
mkdir -p /mnt/walogs
# HDFS, obviously
mkdir -p /mnt/hdfs && sudo chown -R hdfs /mnt/hdfs
mkdir -p /mnt2/hdfs && sudo chown -R hdfs /mnt2/hdfs
mkdir -p /mnt/namenode && sudo chown -R hdfs /mnt/namenode
# And map-reduce
mkdir -p /mnt/mapred && sudo chown -R mapred /mnt/mapred
mkdir -p /mnt2/mapred && sudo chown -R mapred /mnt2/mapred
########
# Hadoop
# Point hadoop at the right JVM
echo 'export JAVA_HOME=/usr/lib/jvm/java-6-openjdk/' | sudo tee -a /etc/hadoop/conf/hadoop-env.sh
# Strip the closing </configuration> tag from core-site.xml, then append
# our property followed by a fresh closing tag (heredoc expands $NAMENODE)
sudo sed -i 's/<\/configuration>//' /etc/hadoop/conf/core-site.xml
sudo tee -a /etc/hadoop/conf/core-site.xml <<EOF

<property>
<name>fs.default.name</name>
<value>hdfs://$NAMENODE:9000</value>
</property>
</configuration>
EOF
######
# HDFS
# Same closing-tag dance for hdfs-site.xml: delete </configuration>,
# append name/data dirs and replication, then close the document again
sudo sed -i 's/<\/configuration>//' /etc/hadoop/conf/hdfs-site.xml
sudo tee -a /etc/hadoop/conf/hdfs-site.xml <<EOF

<property>
<name>dfs.name.dir</name>
<value>/mnt/namenode</value>
</property>
<property>
<name>dfs.data.dir</name>
<value>/mnt/hdfs,/mnt2/hdfs</value>
</property>
<property>
<name>dfs.replication</name>
<value>3</value>
</property>
</configuration>
EOF
###########
# MapReduce
# Same closing-tag dance for mapred-site.xml: drop </configuration>,
# append the job tracker, local dirs and child JVM heap, then re-close
# (heredoc expands $JOBTRACK)
sudo sed -i 's/<\/configuration>//' /etc/hadoop/conf/mapred-site.xml
sudo tee -a /etc/hadoop/conf/mapred-site.xml <<EOF

<property>
<name>mapred.job.tracker</name>
<value>$JOBTRACK:9001</value>
</property>
<property>
<name>mapred.local.dir</name>
<value>/mnt/mapred,/mnt2/mapred</value>
</property>
<property>
<name>mapred.child.java.opts</name>
<value>-Xmx1024m</value>
</property>
</configuration>
EOF
###########
# ZooKeeper
# Allow many concurrent connections; accumulo opens a lot of them
echo "maxClientCnxns=250" | sudo tee -a /etc/zookeeper/zoo.cfg
# (a stray bare `maxClientCnxns=250` line followed here — it only assigned
# an unused shell variable, so it has been removed)
# Add each of these to accumulo/conf/accumulo-env.sh
# (the quoted 'EOF' delimiter keeps the $VARs literal, so they are
# evaluated when accumulo-env.sh is sourced — not now)
sudo tee -a accumulo/conf/accumulo-env.sh <<'EOF'
test -z "$JAVA_HOME" && export JAVA_HOME="/usr/lib/jvm/java-6-openjdk/jre/"
test -z "$HADOOP_HOME" && export HADOOP_HOME="/usr/lib/hadoop/"
test -z "$ZOOKEEPER_HOME" && export ZOOKEEPER_HOME="/usr/lib/zookeeper/"
EOF
# Update accumulo/conf/accumulo-site.xml (heredoc expands $ZOOKEEPR)
# NOTE(review): unlike the hadoop configs above, no closing
# </configuration> tag is stripped/re-added here — confirm the dist's
# accumulo-site.xml tolerates a plain append
sudo tee -a accumulo/conf/accumulo-site.xml <<EOF
<property>
<name>instance.zookeeper.host</name>
<value>$ZOOKEEPR:2181</value>
</property>
<property>
<name>logger.dir.walog</name>
<value>/mnt/walogs</value>
</property>
<property>
<name>instance.secret</name>
<value>DEFAULT</value>
</property>
<property>
<name>tserver.memory.maps.max</name>
<value>1G</value>
</property>
EOF
# Internal DNS name of master in accumulo/conf/masters
echo "$MASTER" | sudo tee -a accumulo/conf/masters
# Tablet server DNS names, one per line: quoting $TABLETS preserves its
# embedded newlines — unquoted, echo would word-split the list and write
# every tablet server onto a single space-separated line
echo "$TABLETS" | sudo tee -a accumulo/conf/slaves
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment