# Assumes you already have the Sun JDK installed and JAVA_HOME pointing at it for the root user
# This is all basically a summary of various parts of https://ccp.cloudera.com/display/CDH4DOC/CDH4+Documentation
# Add Cloudera RPM-GPG-KEY and repo
rpm --import http://archive.cloudera.com/cdh4/redhat/6/x86_64/cdh/RPM-GPG-KEY-cloudera
rpm -ivh http://archive.cloudera.com/cdh4/one-click-install/redhat/6/x86_64/cloudera-cdh-4-0.x86_64.rpm
# Install CDH4 Base
yum install hadoop-0.20-conf-pseudo
# set JAVA_HOME in /etc/hadoop/conf/hadoop-env.sh if you can't use the system-wide Java runtime
# Update fs.default.name value to actual <hostname> in /etc/hadoop/conf/core-site.xml
sed -i 's/localhost/<hostname>/' /etc/hadoop/conf/core-site.xml
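# After the edit, core-site.xml should contain something along these lines
# (8020 is the usual NameNode port in the packaged pseudo config; yours may differ):
# <property>
#   <name>fs.default.name</name>
#   <value>hdfs://<hostname>:8020</value>
# </property>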
# Update mapred.job.tracker value to actual <hostname> in /etc/hadoop/conf/mapred-site.xml
sed -i 's/localhost/<hostname>/' /etc/hadoop/conf/mapred-site.xml
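# Likewise, mapred-site.xml should end up with something like this
# (8021 is the usual MRv1 JobTracker port in the packaged config; check yours):
# <property>
#   <name>mapred.job.tracker</name>
#   <value><hostname>:8021</value>
# </property>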
# Format the NameNode
sudo -u hdfs hdfs namenode -format
# Start HDFS
for service in /etc/init.d/hadoop-hdfs-* ; do $service start; done
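# Optional sanity check once the daemons are up: the report should list one live DataNode
sudo -u hdfs hdfs dfsadmin -report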
# do all of the following as the HDFS user
su - hdfs
# Create the HDFS /tmp directory before someone else does it and gets the perms wrong
hadoop fs -mkdir /tmp
hadoop fs -chmod -R 1777 /tmp
# Create and permission the MapReduce system directories
hadoop fs -mkdir /var
hadoop fs -mkdir /var/lib
hadoop fs -mkdir /var/lib/hadoop-hdfs
hadoop fs -mkdir /var/lib/hadoop-hdfs/cache
hadoop fs -mkdir /var/lib/hadoop-hdfs/cache/mapred
hadoop fs -mkdir /var/lib/hadoop-hdfs/cache/mapred/mapred
hadoop fs -mkdir /var/lib/hadoop-hdfs/cache/mapred/mapred/staging
hadoop fs -chmod 1777 /var/lib/hadoop-hdfs/cache/mapred/mapred/staging
hadoop fs -chown -R mapred /var/lib/hadoop-hdfs/cache/mapred
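# Shortcut for the above, if you'd rather let the fs shell create the parents
# (CDH4's Hadoop 2 fs shell accepts -mkdir -p; the step-by-step version above works everywhere):
# hadoop fs -mkdir -p /var/lib/hadoop-hdfs/cache/mapred/mapred/staging
# hadoop fs -chmod 1777 /var/lib/hadoop-hdfs/cache/mapred/mapred/staging
# hadoop fs -chown -R mapred /var/lib/hadoop-hdfs/cache/mapred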
# Verify the HDFS File Structure
hadoop fs -ls -R /
# Should look as follows:
# drwxrwxrwt - hdfs supergroup 0 2012-04-19 15:14 /tmp
# drwxr-xr-x - hdfs supergroup 0 2012-04-19 15:16 /var
# drwxr-xr-x - hdfs supergroup 0 2012-04-19 15:16 /var/lib
# drwxr-xr-x - hdfs supergroup 0 2012-04-19 15:16 /var/lib/hadoop-hdfs
# drwxr-xr-x - hdfs supergroup 0 2012-04-19 15:16 /var/lib/hadoop-hdfs/cache
# drwxr-xr-x - mapred supergroup 0 2012-04-19 15:19 /var/lib/hadoop-hdfs/cache/mapred
# drwxr-xr-x - mapred supergroup 0 2012-04-19 15:29 /var/lib/hadoop-hdfs/cache/mapred/mapred
# drwxrwxrwt - mapred supergroup 0 2012-04-19 15:33 /var/lib/hadoop-hdfs/cache/mapred/mapred/staging
# change back to root
exit
# Start MapReduce
for service in /etc/init.d/hadoop-0.20-mapreduce-*; do $service start; done
# Check everything worked: run '$JAVA_HOME/bin/jps' and look for the following processes
# DataNode
# NameNode
# SecondaryNameNode
# JobTracker
# TaskTracker
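# Optional smoke test: run the bundled pi example (jar path per the CDH4 MRv1
# package; adjust if your layout differs)
sudo -u hdfs hadoop jar /usr/lib/hadoop-0.20-mapreduce/hadoop-examples.jar pi 2 100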
#
# Zookeeper
#
# Install zookeeper server package
# Base package was already installed by hadoop-0.20-conf-pseudo
yum install zookeeper-server
# set JAVA_HOME in /usr/lib/zookeeper/bin/zkEnv.sh if you can't use the system-wide Java runtime
# Initialise Zookeeper
service zookeeper-server init --myid=1
# Start zookeeper
service zookeeper-server start
# Check everything worked: run '$JAVA_HOME/bin/jps' and look for the following process
# QuorumPeerMain
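# Another quick check (assumes nc is installed and ZooKeeper is on the default
# client port 2181): this should answer "imok"
echo ruok | nc localhost 2181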
#
# HBase
#
# set dfs.datanode.max.xcievers in /etc/hadoop/conf/hdfs-site.xml (yes, it's actually misspelled)
# Insert the following XML property between the <configuration> and </configuration> tags
<property>
  <name>dfs.datanode.max.xcievers</name>
  <value>4096</value>
</property>
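# One-liner alternative for the edit above (a rough sketch that assumes a single
# </configuration> closing tag; editing the file by hand is just as good):
sed -i 's|</configuration>|  <property>\n    <name>dfs.datanode.max.xcievers</name>\n    <value>4096</value>\n  </property>\n</configuration>|' /etc/hadoop/conf/hdfs-site.xml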
# restart hdfs
for service in /etc/init.d/hadoop-hdfs-* ; do $service restart; done
# Install HBase
yum install hbase-master hbase-regionserver
# Modify /etc/hbase/conf/hbase-site.xml
# Be sure to change <hostname> to your actual hostname
# Insert the following XML properties between the <configuration> and </configuration> tags
<property>
  <name>hbase.cluster.distributed</name>
  <value>true</value>
</property>
<property>
  <name>hbase.rootdir</name>
  <value>hdfs://<hostname>:8020/hbase</value>
</property>
# Create the /hbase directory in HDFS
sudo -u hdfs hadoop fs -mkdir /hbase
sudo -u hdfs hadoop fs -chown hbase /hbase
# set JAVA_HOME in /etc/hbase/conf/hbase-env.sh if you can't use the system-wide Java runtime
# Start HBase master
service hbase-master start
# start region server
service hbase-regionserver start
# Check everything worked: run '$JAVA_HOME/bin/jps' and look for the following processes
# HRegionServer
# HMaster
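# Quick functional check (the hbase package puts the shell on the PATH):
# this should report the master plus one region server
echo "status 'simple'" | hbase shell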
#
# We're done: running '$JAVA_HOME/bin/jps | sort' should show all of the following processes (order doesn't matter)
#
# JobTracker
# TaskTracker
# QuorumPeerMain
# DataNode
# NameNode
# SecondaryNameNode
# HMaster
# HRegionServer
#
# Additional notes
#
#
# - User setup
# Create Linux users and corresponding HDFS home directories as needed
useradd -m -U <user>
sudo -u hdfs hadoop fs -mkdir /user/<user>
sudo -u hdfs hadoop fs -chown <user> /user/<user>
# - Shutting down / Starting up
# Order matters! To shutdown, do the following:
for service in /etc/init.d/hbase-*; do $service stop; done
service zookeeper-server stop
for service in /etc/init.d/hadoop-0.20-mapreduce-*; do $service stop; done
for service in /etc/init.d/hadoop-hdfs-* ; do $service stop; done
#
# Then to start back up:
#
for service in /etc/init.d/hadoop-hdfs-* ; do $service start; done
for service in /etc/init.d/hadoop-0.20-mapreduce-*; do $service start; done
service zookeeper-server start
for service in /etc/init.d/hbase-*; do $service start; done
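# If you bounce the stack often, a small wrapper saves typing (just the same
# commands as above wrapped in a shell function; adjust the globs if your
# package names differ):
cdh_pseudo() {
  case "$1" in
    stop)
      for service in /etc/init.d/hbase-*; do $service stop; done
      service zookeeper-server stop
      for service in /etc/init.d/hadoop-0.20-mapreduce-*; do $service stop; done
      for service in /etc/init.d/hadoop-hdfs-* ; do $service stop; done
      ;;
    start)
      for service in /etc/init.d/hadoop-hdfs-* ; do $service start; done
      for service in /etc/init.d/hadoop-0.20-mapreduce-*; do $service start; done
      service zookeeper-server start
      for service in /etc/init.d/hbase-*; do $service start; done
      ;;
    *)
      echo "usage: cdh_pseudo {start|stop}" ;;
  esac
}
# e.g. cdh_pseudo stop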
# - Disk Space
# Depending on your usage and your Linux environment, you may run out of disk space quickly.
# To fix this, moving /var/lib/hadoop-hdfs onto a dedicated file system is a good start.
# Doing the same for /var/log/hadoop-hdfs isn't a bad idea either.
# Make sure you retain (or duplicate) the exact same file system permissions in the new location if you do this.
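# A rough sketch of the move (assumes the new file system is already mounted at
# /data, a hypothetical mount point, and that every service has been stopped in
# the order shown above; cp -a works if rsync isn't installed):
rsync -a /var/lib/hadoop-hdfs/ /data/hadoop-hdfs/
mv /var/lib/hadoop-hdfs /var/lib/hadoop-hdfs.orig
ln -s /data/hadoop-hdfs /var/lib/hadoop-hdfs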
# - Hadoop service accounts
# If you want specific UIDs/GIDs for the Hadoop service accounts, create them as follows before installing the Cloudera RPMs... otherwise just let the RPMs create them for you.
# Common Hadoop group
groupadd --gid 6666 hadoop
# Map Reduce
groupadd --gid 6667 mapred
useradd --no-create-home --home-dir /usr/lib/hadoop-0.20-mapreduce --shell /bin/bash --uid 6667 --gid mapred --groups hadoop --comment "Hadoop MapReduce" mapred
# HDFS
groupadd --gid 6668 hdfs
useradd --no-create-home --home-dir /usr/lib/hadoop-hdfs --shell /bin/bash --uid 6668 --gid hdfs --groups hadoop --comment "Hadoop HDFS" hdfs
# Zookeeper
groupadd --gid 6669 zookeeper
useradd --no-create-home --home-dir /var/run/zookeeper --shell /sbin/nologin --uid 6669 --gid zookeeper --comment "Zookeeper" zookeeper
# HBase
groupadd --gid 6670 hbase
useradd --no-create-home --home-dir /var/run/hbase --shell /sbin/nologin --uid 6670 --gid hbase --comment "HBase" hbase