Instructions for installing Hadoop on Ubuntu 16.04 or later, used in the DEPA Big Data Training Class
#!/bin/bash
#Installation steps for Hadoop (HDFS, YARN, MapReduce) on Ubuntu 17.04 64-bit
#WARNING!!!! DO NOT RUN THIS FILE directly: it mixes shell commands with config
#snippets and manual edits, and is meant to be followed step by step
#update package
sudo apt-get update
#setup SSH
sudo apt-get install -y openssh-server
ssh-keygen -t dsa -P ''
cat ~/.ssh/id_dsa.pub >> ~/.ssh/authorized_keys
ssh localhost
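# Optional sanity check (a sketch): BatchMode makes ssh fail instead of
# prompting, so this confirms key-based login works (assumes the host key
# was accepted by the ssh command above)
ssh -o BatchMode=yes localhost 'echo "passwordless SSH OK"'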
#setup JAVA https://www.digitalocean.com/community/tutorials/how-to-install-java-with-apt-get-on-ubuntu-16-04
sudo add-apt-repository ppa:webupd8team/java
sudo apt-get update
sudo apt-get install oracle-java8-installer
#managing JAVA
sudo update-alternatives --config java
#Add the following environment variables to .bashrc (a scripted sketch follows)
export JAVA_HOME=/usr/lib/jvm/java-8-oracle
export PATH=$PATH:$JAVA_HOME/bin
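# One way to script the .bashrc edit above (a sketch; it appends blindly, so
# re-running it adds duplicate lines):
cat >> ~/.bashrc <<'EOF'
export JAVA_HOME=/usr/lib/jvm/java-8-oracle
export PATH=$PATH:$JAVA_HOME/bin
EOF
source ~/.bashrc
java -version   #verify the JDK is on the PATH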
#setup hadoop
#1.download hadoop
wget http://mirror.cc.columbia.edu/pub/software/apache/hadoop/common/hadoop-2.7.2/hadoop-2.7.2.tar.gz
#2.create Folder for Hadoop
sudo mkdir -p /usr/local/hadoop
sudo chown -R arit:arit /usr/local/hadoop
sudo mkdir /var/log/hadoop
sudo chown -R arit:arit /var/log/hadoop
sudo mkdir -p /var/hadoop_data
sudo mkdir -p /var/hadoop_data/namenode
sudo mkdir -p /var/hadoop_data/datanode
sudo chown -R arit:arit /var/hadoop_data
#3.install hadoop
tar -xvf hadoop-2.7.2.tar.gz
sudo mv ./hadoop-2.7.2/* /usr/local/hadoop
sudo chown -R arit:arit /usr/local/hadoop
#4.setup hadoop variable environment
cd ~/
nano .bashrc
#TODO these exports have to be added to the file manually, though this can also be scripted; see the sketch below
# export HADOOP_HOME=/usr/local/hadoop
# export PATH=$PATH:$HADOOP_HOME/bin
# export PATH=$PATH:$HADOOP_HOME/sbin
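# The same heredoc trick scripts this step too (a sketch; no duplicate-line guard):
cat >> ~/.bashrc <<'EOF'
export HADOOP_HOME=/usr/local/hadoop
export PATH=$PATH:$HADOOP_HOME/bin
export PATH=$PATH:$HADOOP_HOME/sbin
EOF
source ~/.bashrc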
#5. edit hadoop-env.sh script to set java env
cd /usr/local/hadoop/etc/hadoop
nano hadoop-env.sh
#TODO these exports have to be added to the file manually, though this can also be scripted; see the sketch below
# export JAVA_HOME=/usr/lib/jvm/java-8-oracle
# export HADOOP_LOG_DIR=/var/log/hadoop
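# Scripted version (a sketch; assumes the stock hadoop-env.sh still has its
# original "export JAVA_HOME=..." line for sed to replace):
sed -i 's|^export JAVA_HOME=.*|export JAVA_HOME=/usr/lib/jvm/java-8-oracle|' hadoop-env.sh
echo 'export HADOOP_LOG_DIR=/var/log/hadoop' >> hadoop-env.sh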
#6. edit yarn-env.sh script to set yarn log dir
nano yarn-env.sh
#TODO this export has to be added to the file manually, though this can also be scripted; see the sketch below
# export YARN_LOG_DIR=/var/log/hadoop
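# Scripted version (a sketch; appends blindly, with no duplicate-line guard):
echo 'export YARN_LOG_DIR=/var/log/hadoop' >> yarn-env.sh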
#7. format hdfs file system
hdfs namenode -format
#8. start hadoop (at /usr/local/hadoop/etc/hadoop)
#8.1 start HDFS
start-dfs.sh
#check that two processes are running: NameNode and DataNode
jps
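# A scriptable version of the same check (a sketch): jps prints one JVM per
# line, so grep for the two daemon names
jps | grep -E 'NameNode|DataNode'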
#8.2 start YARN
start-yarn.sh
#check that two processes are running: ResourceManager and NodeManager
jps
#9. to stop hadoop
stop-yarn.sh
stop-dfs.sh
#verify that no Hadoop processes are left running
jps
#10. run a MapReduce example job (pi estimation) and prepare wordcount input
cd /usr/local/hadoop/share/hadoop/mapreduce
hadoop jar hadoop-mapreduce-examples-2.7.2.jar pi 10 1000
sudo mkdir /usr/lab
sudo nano /usr/lab/wc-data.txt   #type some sample text and save; this is the wordcount input
sudo mkdir /usr/lab/output
#11. run the wordcount example and observe the terminal output
hadoop jar hadoop-mapreduce-examples-2.7.2.jar wordcount /usr/lab/wc-data.txt /usr/lab/output/o2
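# Inspect the result (a sketch): fs.defaultFS is not configured until step 12,
# so this job runs in local mode and its output lands on the local filesystem
cat /usr/lab/output/o2/part-r-00000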
#12. after HDFS, YARN, and MapReduce have been verified, configure the following files in this directory
# Main Package Directory : /usr/local/hadoop
# Data Directory : /var/hadoop_data/namenode
# : /var/hadoop_data/datanode
# Log file Directory : /var/log/hadoop
cd /usr/local/hadoop/etc/hadoop
#12.1 core-site.xml
nano core-site.xml
#TODO add this
<configuration>
  <property>
    <name>fs.defaultFS</name>
    <value>hdfs://localhost:9000</value>
  </property>
  <property>
    <name>hadoop.proxyuser.root.hosts</name>
    <value>*</value>
  </property>
  <property>
    <name>hadoop.proxyuser.root.groups</name>
    <value>*</value>
  </property>
</configuration>
#12.2 hdfs-site.xml
nano hdfs-site.xml
#TODO add this
<configuration>
  <property>
    <name>dfs.replication</name>
    <value>1</value>
  </property>
  <property>
    <name>dfs.namenode.name.dir</name>
    <value>file:/var/hadoop_data/namenode</value>
  </property>
  <property>
    <name>dfs.datanode.data.dir</name>
    <value>file:/var/hadoop_data/datanode</value>
  </property>
  <property>
    <name>dfs.namenode.acls.enabled</name>
    <value>true</value>
  </property>
</configuration>
#12.3 yarn-site.xml
nano yarn-site.xml
#TODO add this
<configuration>
  <!-- Site specific YARN configuration properties -->
  <property>
    <name>yarn.resourcemanager.hostname</name>
    <value>localhost</value>
  </property>
  <property>
    <name>yarn.resourcemanager.scheduler.address</name>
    <value>localhost:8030</value>
  </property>
  <property>
    <name>yarn.resourcemanager.resource-tracker.address</name>
    <value>localhost:8031</value>
  </property>
  <property>
    <name>yarn.resourcemanager.address</name>
    <value>localhost:8032</value>
  </property>
  <property>
    <name>yarn.resourcemanager.admin.address</name>
    <value>localhost:8033</value>
  </property>
  <property>
    <name>yarn.resourcemanager.webapp.address</name>
    <value>localhost:8088</value>
  </property>
  <property>
    <name>yarn.nodemanager.aux-services</name>
    <value>mapreduce_shuffle</value>
  </property>
  <property>
    <name>yarn.nodemanager.aux-services.mapreduce_shuffle.class</name>
    <value>org.apache.hadoop.mapred.ShuffleHandler</value>
  </property>
</configuration>
#12.4 mapred-site.xml
cp mapred-site.xml.template mapred-site.xml
nano mapred-site.xml
#TODO add this
<configuration>
  <property>
    <name>mapreduce.framework.name</name>
    <value>yarn</value>
  </property>
</configuration>
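# Quick config sanity check (a sketch): hdfs getconf prints the value Hadoop
# actually resolves from the files edited above
hdfs getconf -confKey fs.defaultFS   #expect hdfs://localhost:9000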
#13. initialize HDFS: format the NameNode again, now with the new config
#(hdfs namenode -format replaces the deprecated hadoop namenode -format)
hdfs namenode -format
#14. start hadoop (at /usr/local/hadoop/etc/hadoop)
#14.1 start HDFS
start-dfs.sh
#check that two processes are running: NameNode and DataNode
jps
#14.2 start YARN
start-yarn.sh
#check that two processes are running: ResourceManager and NodeManager
jps
#15. to stop hadoop
stop-yarn.sh
stop-dfs.sh
#verify that no Hadoop processes are left running
jps
#16. post-installation test: open the web UIs in a browser
# http://<IP>:50070/   (NameNode UI)
# http://<IP>:8088     (ResourceManager UI)
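# Scriptable alternative (a sketch, run on the node itself): the UIs should
# answer with HTTP 200 once the daemons are up
curl -s -o /dev/null -w '%{http_code}\n' http://localhost:50070/   #NameNode UI
curl -s -o /dev/null -w '%{http_code}\n' http://localhost:8088/    #ResourceManager UI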