randerzander/distget

## distget
# Build input.txt: the list of URLs to fetch, one URL per line
printf '%s\n' \
  ftp://ftp.ncdc.noaa.gov/pub/data/ghcn/daily/by_year/2009.csv.gz \
  ftp://ftp.ncdc.noaa.gov/pub/data/ghcn/daily/by_year/2010.csv.gz \
  > input.txt

# Trick to force MapReduce to treat each line in input.txt as a single map task:
# give every line its own small file inside a local 'input' directory.
mkdir -p input
# I used 1 line per file - raise -l to batch several URLs into one map task.
# The pieces are written as input/xaa, input/xab, ... ('x' is split's default
# prefix) without cd'ing into the directory, so the relative 'input' path in
# the upload below still resolves from the current directory.
split -l 1 input.txt input/x
hadoop fs -put input .

# The 'hadoop fs -put' command in distget.sh runs as 'yarn', not the current Linux user.
# Create the output dir first if it is missing (chmod fails on a nonexistent path),
# then make it accessible to yarn.
# NOTE: 777 is world-writable - tighten the mode if the cluster is shared.
hadoop fs -mkdir -p /user/randy/data
hadoop fs -chmod 777 /user/randy/data

# distget.sh
# Reads one URL per line from stdin, downloads it with wget into the current
# directory, then copies the downloaded file to /user/randy/data/ in HDFS.
while read -r line
do
  # Skip blank lines so wget/hadoop are never called with an empty argument
  [ -n "$line" ] || continue
  # Keep only the text after the last '/' (the URL's file name).
  # Note the pattern order: '##*/' strips through the last slash, whereas
  # '##/*' only matches values that begin with '/' and leaves URLs untouched.
  file=${line##*/}
  if wget "$line"
  then
    hadoop fs -put "$file" /user/randy/data/
  else
    # Nothing was downloaded, so there is nothing to upload; log and move on
    echo "distget: download failed: $line" >&2
  fi
done

# Run the job as a map-only streaming job (zero reducers) with distget.sh as the mapper.
# -D and -files are generic Hadoop options and must come before the streaming
# options; they replace the deprecated 'mapred.reduce.tasks' property and '-file' flag.
# 'dummy' is the job's output directory.
hadoop jar /usr/hdp/current/hadoop-mapreduce-client/hadoop-streaming.jar \
  -D mapreduce.job.reduces=0 -files distget.sh \
  -mapper distget.sh -input input -output dummy
	# Build input.txt: the list of URLs to fetch, one URL per line
	printf '%s\n' \
	  ftp://ftp.ncdc.noaa.gov/pub/data/ghcn/daily/by_year/2009.csv.gz \
	  ftp://ftp.ncdc.noaa.gov/pub/data/ghcn/daily/by_year/2010.csv.gz \
	  > input.txt

	# Trick to force MapReduce to treat each line in input.txt as a single map task:
	# give every line its own small file inside a local 'input' directory.
	mkdir -p input
	# I used 1 line per file - raise -l to batch several URLs into one map task.
	# The pieces are written as input/xaa, input/xab, ... ('x' is split's default
	# prefix) without cd'ing into the directory, so the relative 'input' path in
	# the upload below still resolves from the current directory.
	split -l 1 input.txt input/x
	hadoop fs -put input .

	# The 'hadoop fs -put' command in distget.sh runs as 'yarn', not the current Linux user.
	# Create the output dir first if it is missing (chmod fails on a nonexistent path),
	# then make it accessible to yarn.
	# NOTE: 777 is world-writable - tighten the mode if the cluster is shared.
	hadoop fs -mkdir -p /user/randy/data
	hadoop fs -chmod 777 /user/randy/data

	# distget.sh
	# Reads one URL per line from stdin, downloads it with wget into the current
	# directory, then copies the downloaded file to /user/randy/data/ in HDFS.
	while read -r line
	do
	  # Skip blank lines so wget/hadoop are never called with an empty argument
	  [ -n "$line" ] || continue
	  # Keep only the text after the last '/' (the URL's file name).
	  # Note the pattern order: '##*/' strips through the last slash, whereas
	  # '##/*' only matches values that begin with '/' and leaves URLs untouched.
	  file=${line##*/}
	  if wget "$line"
	  then
	    hadoop fs -put "$file" /user/randy/data/
	  else
	    # Nothing was downloaded, so there is nothing to upload; log and move on
	    echo "distget: download failed: $line" >&2
	  fi
	done

	# Run the job as a map-only streaming job (zero reducers) with distget.sh as the mapper.
	# -D and -files are generic Hadoop options and must come before the streaming
	# options; they replace the deprecated 'mapred.reduce.tasks' property and '-file' flag.
	# 'dummy' is the job's output directory.
	hadoop jar /usr/hdp/current/hadoop-mapreduce-client/hadoop-streaming.jar \
	  -D mapreduce.job.reduces=0 -files distget.sh \
	  -mapper distget.sh -input input -output dummy