ravyg/Hadoop Commands

## Hadoop Commands
# Command pipeline
# Non hadoop way using only linux commands.
cat data | map | sort | reduce
# some text to mapper.
echo "foo foo quux labs foo bar quux" | <dirto/>mapper.py
# Some text to the mapper, then sort, and then the reducer.
echo "foo foo quux labs foo bar quux" | <dirto/>mapper.py | sort -k1,1 | <dirto/>reducer.py
# Doing the same with a file.
cat myfilename.txt | <dirto/>mapper.py | sort -k1,1 | <dirto/>reducer.py

# hadoop specific commands
# use shell script to start/stop hadoop.
# Copy the file to Hadoop
# HDFS:
>> hadoop dfs -copyFromLocal /my/tmp/location /user/hduser/gutenberg

>> hduser@ubuntu:/usr/local/hadoop$ bin/hadoop jar contrib/streaming/hadoop-*streaming*.jar \
-file /home/hduser/mapper.py    -mapper /home/hduser/mapper.py \
-file /home/hduser/reducer.py   -reducer /home/hduser/reducer.py \
-input /user/hduser/gutenberg/* -output /user/hduser/gutenberg-output

## Install gutenberg on OS X
For macOS
# Install homebrew
Install berkeley-db4 through brew
# Don't install the latest "berkeley-db" version; it requires a license.
brew install berkeley-db4
# Set the BERKELEYDB_DIR environment variable to point to the brew installation before running pip install bsddb3
export BERKELEYDB_DIR=/usr/local/Cellar/berkeley-db4/4.8.30

pip install bsddb3
pip install gutenberg

# you should be good to go.
	# Command pipeline
	# Non hadoop way using only linux commands.
	cat data | map | sort | reduce
	# some text to mapper.
	echo "foo foo quux labs foo bar quux" | <dirto/>mapper.py
	# Some text to the mapper, then sort, and then the reducer.
	echo "foo foo quux labs foo bar quux" | <dirto/>mapper.py | sort -k1,1 | <dirto/>reducer.py
	# Doing the same with a file.
	cat myfilename.txt | <dirto/>mapper.py | sort -k1,1 | <dirto/>reducer.py

	# hadoop specific commands
	# use shell script to start/stop hadoop.
	# Copy the file to Hadoop
	# HDFS:
	>> hadoop dfs -copyFromLocal /my/tmp/location /user/hduser/gutenberg

	>> hduser@ubuntu:/usr/local/hadoop$ bin/hadoop jar contrib/streaming/hadoop-*streaming*.jar \
	-file /home/hduser/mapper.py -mapper /home/hduser/mapper.py \
	-file /home/hduser/reducer.py -reducer /home/hduser/reducer.py \
	-input /user/hduser/gutenberg/* -output /user/hduser/gutenberg-output
	For macOS
	# Install homebrew
	Install berkeley-db4 through brew
	# Don't install the latest "berkeley-db" version; it requires a license.
	brew install berkeley-db4
	# Set the BERKELEYDB_DIR environment variable to point to the brew installation before running pip install bsddb3
	export BERKELEYDB_DIR=/usr/local/Cellar/berkeley-db4/4.8.30

	pip install bsddb3
	pip install gutenberg

	# you should be good to go.