Launch an EMR cluster and run a word count against a small portion of one Common Crawl segment.
# Launch a cluster and run the word count against a small portion of
# one Common Crawl segment.
#
# The input data is read directly from S3.
#
# Here I've omitted the --alive argument, so the cluster will self-terminate
# once the job is complete.
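#
# The segment's textData files are stored as Hadoop SequenceFiles, which is
# why the extra --arg options below pass SequenceFileAsTextInputFormat to
# the streaming job.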
elastic-mapreduce \
--create \
--name "Common Crawl word count" \
--enable-debugging \
--stream \
--ami-version latest \
--instance-type m1.large \
--input s3://aws-publicdatasets/common-crawl/parse-output/segment/1346823845675/textData-03894 \
--output s3n://emr-examples.dius.com.au/output \
--mapper 's3://emr-examples.dius.com.au/ruby/common_crawl_mapper.rb hello,world' \
--reducer s3://emr-examples.dius.com.au/ruby/common_crawl_reducer.rb \
--arg -inputformat --arg org.apache.hadoop.mapred.SequenceFileAsTextInputFormat
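The mapper and reducer scripts referenced above live in S3 and their source isn't shown in this gist. What follows is a minimal sketch of what a matching pair of Hadoop Streaming scripts could look like, assuming the mapper's "hello,world" argument is a comma-separated list of words to count and that each input line arrives as key<TAB>text (the SequenceFileAsTextInputFormat behaviour). Treat these as illustrations, not the actual common_crawl_mapper.rb / common_crawl_reducer.rb.

#!/usr/bin/env ruby
# Sketch of a streaming mapper: counts occurrences of the words passed as a
# comma-separated argument (e.g. "hello,world") in each record's text.
# With SequenceFileAsTextInputFormat, each STDIN line is "key<TAB>value";
# only the value (the record text) matters here.

target_words = ARGV[0].to_s.downcase.split(',')

STDIN.each_line do |line|
  text = line.split("\t", 2).last.to_s.downcase
  target_words.each do |word|
    count = text.scan(/\b#{Regexp.escape(word)}\b/).size
    # Emit one "word<TAB>count" pair per record in which the word appears.
    puts "#{word}\t#{count}" if count > 0
  end
end

The reducer then only has to sum the counts. Hadoop delivers the mapper output sorted by key, so all counts for a given word arrive as one contiguous run:

#!/usr/bin/env ruby
# Sketch of a streaming reducer: sums the per-word counts emitted by the
# mapper, relying on the sorted "word<TAB>count" input that Hadoop provides.

current_word = nil
total = 0

STDIN.each_line do |line|
  word, count = line.chomp.split("\t")
  if word != current_word
    # Key changed: flush the previous word's total before starting a new one.
    puts "#{current_word}\t#{total}" unless current_word.nil?
    current_word = word
    total = 0
  end
  total += count.to_i
end

# Flush the final word.
puts "#{current_word}\t#{total}" unless current_word.nil?

A pipeline like this can be tested locally before submitting the job, e.g. cat sample.txt | ./common_crawl_mapper.rb hello,world | sort | ./common_crawl_reducer.rb (where sample.txt is any tab-separated key/text file you have to hand).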