Set up a Hadoop 3 cluster
OS: Ubuntu 16.04
//To run on each node:
//====================
//Prerequisites
//--------------
sudo apt install default-jdk
//verify Java is installed
java -version
//create a dedicated user for Hadoop
sudo addgroup hadoop
sudo adduser --ingroup hadoop hduser
sudo usermod -aG sudo hduser
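//optionally verify the new user and its group membership (output is illustrative):
id hduser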
//configure SSH, which Hadoop uses for communication between cluster nodes
//generate an SSH key for the hduser user:
su - hduser
ssh-keygen -t rsa -P ""
//enable SSH access using the generated key
cat $HOME/.ssh/id_rsa.pub >> $HOME/.ssh/authorized_keys
//test the connection
ssh localhost
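//if prompted, accept the host key fingerprint; then leave the test session:
exit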
//Install Hadoop
//--------------
//Download a Hadoop binary distribution from here: http://www.apache.org/dyn/closer.cgi/hadoop/core
//Or copy your own built distribution
//In either case, place the extracted folder under /usr/local as /usr/local/hadoop
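//for example, assuming you downloaded a 3.x binary tarball (the version and mirror below are illustrative; adjust to what you actually picked):
wget https://archive.apache.org/dist/hadoop/common/hadoop-3.1.1/hadoop-3.1.1.tar.gz
tar -xzf hadoop-3.1.1.tar.gz
sudo mv hadoop-3.1.1 /usr/local/hadoop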
sudo chown -R hduser:hadoop /usr/local/hadoop
//edit /usr/local/hadoop/etc/hadoop/hadoop-env.sh and set JAVA_HOME to point to /usr
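//i.e., hadoop-env.sh should contain a line like:
export JAVA_HOME=/usr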
//HDFS
//-----
sudo mkdir -p /app/hadoop/tmp
sudo chown hduser:hadoop /app/hadoop/tmp
cd /usr/local/hadoop
//edit etc/hadoop/core-site.xml and add inside the <configuration></configuration> section:
<property>
<name>hadoop.tmp.dir</name>
<value>/app/hadoop/tmp</value>
<description>A base for other temporary directories.</description>
</property>
<property>
<name>fs.default.name</name>
<value>hdfs://localhost:54310</value>
<description>The name of the default file system. A URI whose
scheme and authority determine the FileSystem implementation. The
uri's scheme determines the config property (fs.SCHEME.impl) naming
the FileSystem implementation class. The uri's authority is used to
determine the host, port, etc. for a filesystem.</description>
</property>
//Add the following to the same place in etc/hadoop/mapred-site.xml:
<property>
<name>mapred.job.tracker</name>
<value>localhost:54311</value>
<description>The host and port that the MapReduce job tracker runs
at. If "local", then jobs are run in-process as a single map
and reduce task.
</description>
</property>
//in etc/hadoop/hdfs-site.xml:
<property>
<name>dfs.replication</name>
<value>1</value>
<description>Default block replication.
The actual number of replications can be specified when the file is created.
The default is used if replication is not specified in create time.
</description>
</property>
//format the HDFS NameNode (run once, before the first start):
/usr/local/hadoop/bin/hadoop namenode -format
//start HDFS:
sbin/start-dfs.sh
//start yarn:
sbin/start-yarn.sh
//run the jps command to check whether the Hadoop Java processes are up and running.
//you should see the following processes on each node (sample output below):
NameNode
SecondaryNameNode
NodeManager
DataNode
ResourceManager
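//sample jps output on a single node (PIDs are illustrative and will differ):
12035 NameNode
12187 DataNode
12399 SecondaryNameNode
12655 ResourceManager
12801 NodeManager
13012 Jps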
//Run a MapReduce job to test this node
//--------------------------------------
//download a book to use as input:
sudo mkdir /tmp/gutenberg
cd /tmp/gutenberg
sudo wget http://www.gutenberg.org/files/5000/5000-8.txt
//copy text file to HDFS:
cd /usr/local/hadoop
bin/hadoop fs -mkdir -p /user/hduser/gutenberg
bin/hadoop fs -put /tmp/gutenberg /user/hduser/
//run WordCount example job. replace version number if needed:
bin/hadoop jar share/hadoop/mapreduce/hadoop-mapreduce-examples-3.3.0-SNAPSHOT.jar wordcount /user/hduser/gutenberg /user/hduser/gutenberg-output
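//tip: list the jar actually shipped with your build to get the exact version:
ls share/hadoop/mapreduce/hadoop-mapreduce-examples-*.jar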
//view the result file in HDFS:
bin/hadoop fs -cat /user/hduser/gutenberg-output/part-r-00000
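//you can also list the whole output directory first:
bin/hadoop fs -ls /user/hduser/gutenberg-output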
//Join the 2 nodes to function as a cluster
//------------------------------------------
//stop all hadoop processes on all nodes
sbin/stop-all.sh
//check with jps that all hadoop processes are down
//decide which machine will be the master and which the slave
//in my case
//master is 192.168.122.187
//slave is 192.168.122.178
//allow the master to SSH to the slave without a password.
//To do so run the following command on master (replace with your slave IP):
ssh-copy-id -i $HOME/.ssh/id_rsa.pub hduser@192.168.122.178
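//verify the master can now log in to the slave without a password prompt:
ssh hduser@192.168.122.178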
//on all nodes in the cluster, edit the following files:
etc/hadoop/core-site.xml
//change the value of fs.default.name to point to the master's IP (example below)
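//e.g. with the master above, keeping the port from the single-node setup, the value becomes:
<value>hdfs://192.168.122.187:54310</value>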
etc/hadoop/yarn-site.xml
//make sure the configuration block includes this section (replace with your master IP):
<configuration>
<!-- Site specific YARN configuration properties -->
<property>
<name>yarn.nodemanager.aux-services</name>
<value>mapreduce_shuffle</value>
</property>
<property>
<name>yarn.resourcemanager.hostname</name>
<value>192.168.122.187</value>
</property>
</configuration>
etc/hadoop/mapred-site.xml
//make sure the configuration block looks as follows (replace with your master IP):
<configuration>
<property>
<name>mapred.job.tracker</name>
<value>192.168.122.187:54311</value>
<description>The host and port that the MapReduce job tracker runs
at. If "local", then jobs are run in-process as a single map
and reduce task.
</description>
</property>
<property>
<name>mapreduce.framework.name</name>
<value>yarn</value>
</property>
</configuration>
//edit the etc/hadoop/workers file and list all workers, one per line (the master is a worker as well, since it also runs worker daemons)
//in my case:
192.168.122.178
192.168.122.187
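//e.g., written from the hadoop directory on each node (adjust the IPs to your cluster):
printf '192.168.122.178\n192.168.122.187\n' > etc/hadoop/workers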
//now format the HDFS file system:
//run on master only:
bin/hadoop namenode -format
//start all hadoop processes
//launch on master:
sbin/start-dfs.sh
sbin/start-yarn.sh
//verify using jps that all processes are launched across the cluster as expected (see the expected layout below).
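//roughly what to expect in this two-node layout (the master is also a worker):
//  master (192.168.122.187): NameNode, SecondaryNameNode, ResourceManager, DataNode, NodeManager
//  slave  (192.168.122.178): DataNode, NodeManager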
//Resources:
//----------
Old guide on setting up a single-node cluster: http://www.michael-noll.com/tutorials/running-hadoop-on-ubuntu-linux-single-node-cluster/
Old guide on transforming several single-node clusters into a multi-node cluster: http://www.michael-noll.com/tutorials/running-hadoop-on-ubuntu-linux-multi-node-cluster/
Newer article on single-node cluster setup: https://www.digitalocean.com/community/tutorials/how-to-install-hadoop-in-stand-alone-mode-on-ubuntu-16-04
Official Apache guide on single-node setup: https://hadoop.apache.org/docs/stable/hadoop-project-dist/hadoop-common/SingleCluster.html
Official Apache guide on multi-node setup: https://hadoop.apache.org/docs/stable/hadoop-project-dist/hadoop-common/ClusterSetup.html