notes on getting hadoop (both single node and a cluster) going on Ubuntu LTS; these are just notes to myself and have some gaps in them. Not meant to be a tutorial or a walkthrough.
sudo apt-get update
sudo apt-get install vim
# generate keys and copy them over
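# (gap in the notes) if no keypair exists yet, generate one first -- passphrase-less here for convenience:
ssh-keygen -t rsa -N "" -f ~/.ssh/id_rsa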
ssh-copy-id -i id_rsa.pub vagrant@hnclient # install the master's public key on the client node
# update /etc/hosts on each node: slaves need to know at least the master; the master needs to know the slave nodes
# make sure master can ssh into localhost
cat .ssh/id_rsa.pub >> .ssh/authorized_keys
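# quick check -- both of these should run without a password prompt:
ssh localhost true
ssh vagrant@hnclient true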
# re-read: http://stackoverflow.com/questions/8872807/hadoop-datanodes-cannot-find-namenode
# on both master & slave make sure that /etc/hosts has the right entries
# and that on master, `netstat -tuplen` shows the namenode bound to the "external" IP and not loopback / 127.x
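# roughly what /etc/hosts should hold (IPs made up -- use the real ones):
#   192.168.33.10  hnname
#   192.168.33.11  hnclient
# on Ubuntu, watch out for the default `127.0.1.1 <hostname>` line; it's what makes the namenode bind to loopback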
sudo apt-get install -y openjdk-7-jdk
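# sanity check the install
java -version # should report 1.7.x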
wget http://mirror.nexcess.net/apache/hadoop/common/hadoop-1.2.1/hadoop-1.2.1-bin.tar.gz
tar zxvf hadoop-1.2.1-bin.tar.gz
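# (gap in the notes) everything below assumes the tree lives at /usr/local/hadoop, so something like:
sudo mv hadoop-1.2.1 /usr/local/hadoop
sudo chown -R vagrant:vagrant /usr/local/hadoop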
# add to ~/.bashrc:
export HADOOP_PREFIX=/usr/local/hadoop
export PATH=$PATH:$HADOOP_PREFIX/bin
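source ~/.bashrc # pick up the new PATH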
sudo vim /usr/local/hadoop/conf/hadoop-env.sh # put in the following...
export JAVA_HOME=/usr/lib/jvm/java-7-openjdk-amd64/
export HADOOP_OPTS=-Djava.net.preferIPv4Stack=true # disable IPv6 because it doesn't play nice w/ Hadoop
vim /usr/local/hadoop/conf/core-site.xml # put in the following
<<-HERE
<configuration>
  <property>
    <name>fs.default.name</name>
    <value>hdfs://hnname:10001</value>
  </property>
  <property>
    <name>hadoop.tmp.dir</name>
    <value>/usr/local/hadoop/tmp</value>
  </property>
</configuration>
HERE
# watch the xml. don't accidentally write `key` where `name` should be
# make sure the host and port aren't misspelled, e.g. hname instead of hnname
sudo vim /usr/local/hadoop/conf/mapred-site.xml
<<-HERE
<configuration>
  <property>
    <name>mapred.job.tracker</name>
    <value>hnname:10002</value>
  </property>
</configuration>
HERE
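# another gap: conf/hdfs-site.xml isn't covered here; for a cluster you'd likely want at least dfs.replication, e.g.
sudo vim /usr/local/hadoop/conf/hdfs-site.xml
<<-HERE
<configuration>
  <property>
    <name>dfs.replication</name>
    <value>2</value> <!-- 1 for a single node; no more than the number of datanodes -->
  </property>
</configuration>
HERE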
sudo rm -rf /usr/local/hadoop/tmp
sudo rm -rf /usr/local/hadoop/logs
sudo mkdir /usr/local/hadoop/tmp
sudo mkdir /usr/local/hadoop/logs
sudo chown vagrant:vagrant /usr/local/hadoop/tmp
sudo chown vagrant:vagrant /usr/local/hadoop/logs
hadoop namenode -format
start-all.sh
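# sanity check: `jps` on the master should list NameNode, SecondaryNameNode & JobTracker
# (plus DataNode & TaskTracker on a single-node setup; slaves show just those two)
jps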
# if you get errors because it can't open logs...
sudo mkdir /usr/local/hadoop/logs/
sudo chown vagrant /usr/local/hadoop/logs/
hadoop namenode -format # refresh w/ this
start-all.sh
# visit the namenode gui at masterhost:50070 (jobtracker gui is on 50030)
# update conf/masters and conf/slaves
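# e.g., with the hostnames from above (masters is really "where the secondary namenode runs"):
#   conf/masters -> hnname
#   conf/slaves  -> hnclient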
# test w/ hadoop jar /usr/local/hadoop/hadoop-examples-1.2.1.jar wordcount /data/words.txt /data/results
# hadoop dfs -cat /data/results/part-r-00000
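# n.b. the input has to be in HDFS before the wordcount test runs, e.g.:
# hadoop dfs -mkdir /data
# hadoop dfs -put words.txt /data/words.txt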