Installing Hadoop on Ubuntu
# Installing Hadoop (CDH4) on Ubuntu
# http://www.cloudera.com/content/cloudera-content/cloudera-docs/CDH4/latest/CDH4-Quick-Start/cdh4qs_topic_3_2.html
# While we're at it, let's install the JDK ...
echo "y" | sudo apt-get install openjdk-6-jdk
# ... and Yelp's mrjob
pip install mrjob
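# Optional, hedged smoke test for mrjob (not part of the original setup): write a tiny
# word-count job to a temp file and run it with mrjob's default local (inline) runner.
# The file names and sample input below are placeholders chosen for illustration.
cat > /tmp/mr_word_count.py <<'EOF'
from mrjob.job import MRJob

class MRWordCount(MRJob):
    # emit (word, 1) for every word in each input line
    def mapper(self, _, line):
        for word in line.split():
            yield word, 1

    # sum the counts for each word
    def reducer(self, word, counts):
        yield word, sum(counts)

if __name__ == '__main__':
    MRWordCount.run()
EOF
echo "hello hadoop hello mrjob" > /tmp/mr_input.txt
python /tmp/mr_word_count.py /tmp/mr_input.txt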
# Install maven
echo "y" | sudo apt-get install maven
# download and install the CDH4 one-click-install Debian package from Cloudera
wget http://archive.cloudera.com/cdh4/one-click-install/lucid/amd64/cdh4-repository_1.0_all.deb
sudo dpkg -i cdh4-repository_1.0_all.deb
# add the Cloudera public GPG key to apt's trusted keys
curl -s http://archive.cloudera.com/cdh4/ubuntu/lucid/amd64/cdh/archive.key | sudo apt-key add -
# install hadoop
echo "y" | sudo apt-get update
echo "y" | sudo apt-get install hadoop-0.20-conf-pseudo
# then list the installed files to check that the package installed correctly:
dpkg -L hadoop-0.20-conf-pseudo
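# (Optional, hedged sanity check: the pseudo-distributed config package pulls in the hadoop
# client, so the version banner should print without errors.)
hadoop version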
# install scalding
echo "y" | sudo apt-get install scala
echo "y" | sudo apt-get install git
git clone https://github.com/twitter/scalding.git
cd scalding
./sbt update && ./sbt test && ./sbt assembly
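# Hedged example, not part of the original walkthrough: the scalding checkout of this era
# ships a tutorial and a ruby driver script, so with ruby installed you can run the first
# tutorial job locally to confirm the assembly built correctly:
# scripts/scald.rb --local tutorial/Tutorial0.scala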
# Format the NameNode
sudo -u hdfs hdfs namenode -format
# Start HDFS
for x in `cd /etc/init.d ; ls hadoop-hdfs-*` ; do sudo service $x start ; done
# Create the /tmp Directory
sudo -u hdfs hadoop fs -mkdir /tmp
sudo -u hdfs hadoop fs -chmod -R 1777 /tmp
# Create MapReduce system directories:
sudo -u hdfs hadoop fs -mkdir -p /var/lib/hadoop-hdfs/cache/mapred/mapred/staging
sudo -u hdfs hadoop fs -chmod 1777 /var/lib/hadoop-hdfs/cache/mapred/mapred/staging
sudo -u hdfs hadoop fs -chown -R mapred /var/lib/hadoop-hdfs/cache/mapred
# Verify HDFS File Structure
sudo -u hdfs hadoop fs -ls -R /
# Start MapReduce
for x in `cd /etc/init.d ; ls hadoop-0.20-mapreduce-*` ; do sudo service $x start ; done
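# (Hedged sanity check, mirroring the loops above: confirm the HDFS and MapReduce daemons
# report as running.)
for x in `cd /etc/init.d ; ls hadoop-hdfs-* hadoop-0.20-mapreduce-*` ; do sudo service $x status ; done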
# Create a Hadoop user directory for your account
export HDUSER=$USER
sudo -u hdfs hadoop fs -mkdir /user/$HDUSER
sudo -u hdfs hadoop fs -chown $HDUSER /user/$HDUSER
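# (Hedged check: the new home directory should now exist and be owned by your user.)
hadoop fs -ls /user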
# Test Hadoop
hadoop fs -mkdir input
hadoop fs -put /etc/hadoop/conf/*.xml input
hadoop fs -ls input
/usr/bin/hadoop jar /usr/lib/hadoop-0.20-mapreduce/hadoop-examples.jar grep input output 'dfs[a-z.]+'
hadoop fs -ls
hadoop fs -ls output
hadoop fs -cat output/part-00000 | head
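# A couple of hedged follow-ups, not part of the original walkthrough:
# the example job will not overwrite an existing output directory, so remove it before re-running
# hadoop fs -rm -r output
# the mrjob word-count sketch from above can also target the pseudo-distributed cluster via
# mrjob's hadoop runner (the input path here is just an illustration)
# python /tmp/mr_word_count.py -r hadoop hdfs:///user/$USER/input/core-site.xml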
# Since the server is behind a firewall and its web UI ports are cut off, run these commands from your local machine.
# They forward the DigitalOcean host's ports for the JobTracker, TaskTracker, and NameNode web UIs to your local machine.
# Then visit localhost:50030 in a browser to see the results ...
# ssh -f -N -L 50030:localhost:50030 user@digitalocean
# ssh -f -N -L 50060:localhost:50060 user@digitalocean
# ssh -f -N -L 50070:localhost:50070 user@digitalocean
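# (Hedged: to confirm the tunnels are up on your local machine, look for the backgrounded ssh processes.)
# ps aux | grep "[s]sh -f -N -L"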