Skip to content

Instantly share code, notes, and snippets.

@jbenninghoff
Last active April 12, 2023 03:30
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jbenninghoff/c595c304ae8677a1b55ddfc4fe74960b to your computer and use it in GitHub Desktop.
Save jbenninghoff/c595c304ae8677a1b55ddfc4fe74960b to your computer and use it in GitHub Desktop.
Launch XML extract job in S3
#!/bin/bash
#
# Shell options for capturing command output with a trailing `read`:
# with job control switched off and lastpipe enabled, bash runs the final
# command of a pipeline in the current shell, so `cmd | read var` leaves
# var set for the rest of the script.
set +o monitor
shopt -s lastpipe
# Copy the job JAR, its config XML and the commons-lang JAR from S3 into the
# current directory; they are needed as local file arguments to the job launch.
# Abort on the first failed copy instead of launching with files missing.
for artifact in \
  java_extraction_byteswritable.jar \
  configint.xml \
  commons-lang-2.6.jar; do
  if ! aws s3 cp "s3://jobennin-emr-data/hp-mapr/${artifact}" .; then
    printf 'error: could not download %s from S3\n' "$artifact" >&2
    exit 1
  fi
done
# Launch the qari.XmlExtraction MapReduce job from the local job JAR.
# Positional arguments after the options: S3 input prefix, S3 output prefix,
# and the local config XML.
job_opts=(
  # Extra JAR passed to the job. Quoted so that a working directory whose
  # path contains whitespace still arrives as a single argument.
  -libjars "${PWD}/commons-lang-2.6.jar"
  -D mapreduce.job.reduces=1
  # Map tasks: memory setting (MB) and JVM heap.
  -D mapreduce.map.memory.mb=4000 -D mapreduce.map.java.opts=-Xmx3600m
  # Reduce task: memory setting (MB), JVM heap, vcores and shuffle copies.
  -D mapreduce.reduce.memory.mb=36864 -D mapreduce.reduce.java.opts=-Xmx33648m
  -D mapreduce.reduce.cpu.vcores=8
  -D mapreduce.reduce.shuffle.parallelcopies=8
  # Sort buffer and merge settings.
  -D mapreduce.task.io.sort.mb=800
  -D mapreduce.task.io.sort.factor=80
  -D mapreduce.map.sort.spill.percent=1.0
  -D mapreduce.map.speculative=false
  -D mapreduce.map.output.compress=true
  # Minimum input split size in bytes (41943040 = 40 * 1024 * 1024).
  -D mapreduce.input.fileinputformat.split.minsize=41943040
)
# Only stdout is saved to the log file; stderr is not redirected.
# A failed job is reported on stderr but does not stop the script, so any
# later steps still run.
hadoop jar java_extraction_byteswritable.jar qari.XmlExtraction \
  "${job_opts[@]}" \
  s3://jobennin-emr-data/hp-mapr/input/ s3://jobennin-emr-data/hp-mapr/output/ configint.xml \
  > /tmp/qari.XmlExtraction.stdout.log \
  || printf 'warning: qari.XmlExtraction job exited with status %d\n' "$?" >&2
# Log the job history stats to S3
# This part collects the two identifiers used for that: cid and jobnum.

# Print the highest job ID (job_<number>_<number>) found on stdin, or nothing
# when stdin contains no job ID. The two numeric parts are compared as
# numbers: a plain `sort -n` sees the non-numeric "job_" prefix, falls back
# to byte-order comparison and would rank job_1_10000 before job_1_9999.
latest_job_id() {
  grep -Eo 'job_[0-9]+_[0-9]+' \
    | sort -t_ -k2,2n -k3,3n \
    | tail -n 1
}

# Alternative, kept for reference: take the job ID from a saved stdout log.
#jobnum=$(egrep -o 'job_[0-9]+_[0-9]+' ~/qari.XmlExtraction.stdout.log |sort -u)

# cid is the jobFlowId field of the job-flow.json info file.
cid=$(jq -r '.jobFlowId' /mnt/var/lib/info/job-flow.json)

# jobnum is the highest job ID in the job listing; stderr is scanned as well
# as stdout. Command substitution keeps the value in this shell without
# depending on the lastpipe option. Empty when no job ID is listed.
jobnum=$(mapred job -list all 2>&1 | latest_job_id)
# Stream the job history report into S3 (`-` makes `aws s3 cp` read stdin).
# With no job ID there is nothing meaningful to ask `mapred job -history`
# for, so that upload is skipped with a warning instead of run without one.
if [[ -n "${jobnum:-}" ]]; then
  mapred job -history "$jobnum" \
    | aws s3 cp - "s3://jb-workday-logs/hp-mapr/${cid}-XmlExtract.hist"
else
  printf 'warning: no job ID found; skipping job history upload\n' >&2
fi
# Upload the saved job stdout log under the same ${cid} prefix.
aws s3 cp /tmp/qari.XmlExtraction.stdout.log \
  "s3://jb-workday-logs/hp-mapr/${cid}-XmlExtract.stdout.log"
# Reference for the mapreduce.* properties: https://hadoop.apache.org/docs/r2.10.0/hadoop-mapreduce-client/hadoop-mapreduce-client-core/mapred-default.xml
# -D mapreduce.fileoutputcommitter.algorithm.version=2 \
# -D mapreduce.map.memory.mb=5280 -D mapreduce.map.java.opts="-Xmx4800m" \
# -D mapreduce.job.reduce.slowstart.completedmaps=0.01 \
# -D mapreduce.input.fileinputformat.split.minsize=20971520
# -D mapreduce.shuffle.connection-keep-alive.enable=true
# -D mapreduce.task.io.sort.factor=48
# -D mapreduce.reduce.memory.mb=20480 -D mapreduce.reduce.java.opts="-Xmx18432m" \
#time hadoop jar java_extraction_byteswritable.jar qari.XmlExtraction -D mapreduce.job.reduces=1 /user/hadoop/input/ /user/hadoop/output/ configint.xml
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment