@jbenninghoff
Last active April 13, 2023 23:03
Launch EMR, MR job, then terminate
#!/usr/bin/env bash
# jbenninghoff@ 2023-Mar-24
# Script to run XML extraction job from cron
# Alternatively, use Step Functions instead of cron:
# https://docs.aws.amazon.com/en_us/step-functions/latest/dg/sample-emr-job.html
# Or use AWS Data Pipeline:
# https://docs.aws.amazon.com/emr/latest/ManagementGuide/emr-manage-recurring.html
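# Usage: the optional first argument sets the TASK instance group size
# (defaults to 5 in the create-cluster call below).
# Example crontab entry (script name, path, and schedule are illustrative only):
#   0 2 * * 1 /home/hadoop/launch-emr-xml.sh 5 >> /var/log/emr-xml-launch.log 2>&1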
#set -o nounset; set -o errexit; set -o pipefail
set -o errexit; set -o pipefail
export AWS_PROFILE=jobennin+workday
# AWS_DEFAULT_REGION must also be set; it is used in the script-runner jar path below
: "${AWS_DEFAULT_REGION:?Set AWS_DEFAULT_REGION (used for the script-runner jar path)}"
# Required variables to run this script
subnetid=subnet-057d3621ea39f57e9 # Private subnet with NAT
#subnetid=subnet-089b35dcfb410ccf1 # Must exist in your account's VPC
#subnetid=subnet-07d569c2a149af766 # Must exist in your account's VPC
key_name=jobennin+workday+usw2
conf_bucket=jb-workday-artifacts
log_bucket=jb-workday-logs
data_bucket=jobennin-emr-data
secconf=usw2-tls #Existing EMR Security Configuration
#Push run-job-big.sh script to S3
aws s3 cp run-job-big.sh s3://$conf_bucket/run-job-big.sh
# Check for input in S3
if ! aws s3 ls "s3://$data_bucket/hp-mapr/input/" > /dev/null; then
  echo "Input data set not in S3"
  exit 1
fi
# Check for existing output in S3
if aws s3 ls "s3://$data_bucket/hp-mapr/output/" > /dev/null; then
  echo "Remove the output folder in S3 before re-running:"
  echo "  aws s3 rm s3://$data_bucket/hp-mapr/output/ --recursive"
  exit 1
fi
# Check for required instance profile and associated roles
if ! aws iam list-instance-profiles --output json |& grep -q 'instance-profile/EMR_EC2_DefaultRole'; then
  aws emr create-default-roles >& jb-EMR-XMLx-default-roles.json
  sleep 5 # Instance profile creation sometimes takes time to propagate
  aws iam list-instance-profiles | grep instance-profile/EMR_EC2_DefaultRole
fi
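# Note: if the installed AWS CLI supports IAM waiters, the fixed sleep above could be
# replaced with an explicit wait (illustrative alternative):
#   aws iam wait instance-profile-exists --instance-profile-name EMR_EC2_DefaultRole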
igroupConf() { # Instance Group config for EMR cluster
cat << EOF1
[
{"InstanceCount":${1:-1},
"InstanceGroupType":"TASK","InstanceType":"m6g.8xlarge","Name":"Worker nodes"
},
{"InstanceCount":3,
"InstanceGroupType":"CORE","InstanceType":"m6g.8xlarge","Name":"Core nodes"
},
{"InstanceCount":1,
"InstanceGroupType":"MASTER","InstanceType":"m6g.4xlarge","Name":"Master nodes"
}
]
EOF1
}
emrConfig() { # Add EMR configuration file settings
# Graceful decommission timeout, set to 3 hours, required for managed scaling
# A lower value saves more money but needs testing to be sure running tasks finish safely
cat << EOF1
[
{
"Classification":"emrfs-site",
"Properties":{
"fs.s3.maxConnections": "10000"
}
},
{
"Classification":"yarn-site",
"Properties":{
"yarn.resourcemanager.nodemanager-graceful-decommission-timeout-secs": "10800"
}
}
]
EOF1
}
managedScaling() { # Use with: --managed-scaling-policy "$(managedScaling)"
# Allows the cluster to scale down as map tasks finish, and sets an upper bound on size
cat << EOF
{
"ComputeLimits": {
"MinimumCapacityUnits": 5,
"MaximumCapacityUnits": 84,
"MaximumCoreCapacityUnits": 3,
"MaximumOnDemandCapacityUnits": 84,
"UnitType": "Instances"
}
}
EOF
}
xmlExtract() { # Run XML Extraction script uploaded to S3
cat << EOF3
[
{
"Name": "XML_EXTRACT",
"Args": ["s3://$conf_bucket/run-job-big.sh"],
"Jar": "s3://$AWS_DEFAULT_REGION.elasticmapreduce/libs/script-runner/script-runner.jar",
"ActionOnFailure": "CONTINUE",
"Type": "CUSTOM_JAR"
}
]
EOF3
# "Args": ["s3://$conf_bucket/tls_emr_svcs.sh", "${1:-trino}"],
}
aws emr create-cluster --name "jb-HP-XML-big" \
--release-label emr-6.10.0 \
--applications Name=Hadoop Name=Ganglia \
--ec2-attributes "KeyName=$key_name,SubnetId=$subnetid" \
--instance-groups "$(igroupConf ${1:-5})" \
--managed-scaling-policy "$(managedScaling)" \
--ebs-root-volume-size 100 \
--configurations "$(emrConfig)" \
--security-configuration $secconf \
--steps "$(xmlExtract)" \
--auto-terminate \
--enable-debugging \
--log-uri "s3://$log_bucket/" \
--use-default-roles
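# Illustrative follow-up commands; the cluster id is printed by create-cluster above,
# and j-XXXXXXXXXXXXX below is a placeholder:
#   aws emr describe-cluster --cluster-id j-XXXXXXXXXXXXX --query 'Cluster.Status.State'
#   aws emr list-steps --cluster-id j-XXXXXXXXXXXXX --query 'Steps[].Status.State'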
# Optional
: << '--BLOCK-COMMENT--'
--auto-terminate \
--configurations "$(emrConfig)" \
--managed-scaling-policy "$(managedScaling)"
--enable-debugging \
--log-uri "s3://$log_bucket/" \
--BLOCK-COMMENT--