Skip to content

Instantly share code, notes, and snippets.

@immanuelpotter
Last active December 18, 2018 16:43
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save immanuelpotter/17af75c6e3efef8f2b44e5a52880e435 to your computer and use it in GitHub Desktop.
#!/bin/bash
set -ex
#####################################################################################################################
# Copy data files from S3 onto an EMR cluster's HDFS and start the Spark thriftserver.
# 1. Change the aws_cmd function to match the naming of your aws credentials
# 2. Check the DATA_PATH var is where your files are ON S3 (or wherever else)
# 3. Make sure you're using the right SSH key
#####################################################################################################################
DNS_NAME="$1"                              # public DNS of the EMR master node (required)
DATA_FILES=""                              # filled in by data_ops()
DATA_PATH="s3://mannys-hadoop-data/data/"  # trailing slash matters: filenames are appended directly
# Use ${HOME}, not "~": tilde is NOT expanded inside double quotes (and not on
# later variable expansion either), so the old value was the literal path
# ~/keys/EMR-CLUSTER.pem.
SSH_KEY_PATH="${HOME}/keys/EMR-CLUSTER.pem"
# Print a green [SUCCESS] banner to stdout.
# Uses "$*" (not "$@"): printf '%b\n' prints each argument on its own line,
# so "$@" split a multi-word message across lines.
success() {
printf '%b\n' ""
printf '%b\n' "\033[1;32m[SUCCESS] $*\033[0m"
printf '%b\n' ""
}
# Print a yellow [WARN] banner. Diagnostics go to stderr so they do not
# pollute command substitutions that capture this script's stdout.
# Uses "$*" (not "$@") so a multi-word message stays on one line.
warn(){
printf '%b\n' ""
printf '%b\n' "\033[1;33m[WARN] $*\033[0m"
printf '%b\n' ""
} >&2
# Print a red [ERROR] banner (whole function is redirected to stderr) and
# abort the script with status 1.
# Uses "$*" (not "$@") so a multi-word message stays on one line.
err() {
printf '%b\n' ""
printf '%b\n' "\033[1;31m[ERROR] $*\033[0m"
printf '%b\n' ""
exit 1
} >&2
# Wrapper: every aws CLI invocation is pinned to this profile and region.
aws_cmd(){
  local base=(aws --profile immanuel --region eu-west-2)
  "${base[@]}" "$@"
}
# SSH to a host as the hadoop user with the cluster key.
# "$SSH_KEY_PATH" is quoted so a key path containing spaces survives
# word-splitting (SC2086).
ssh_cmd(){
ssh -i "$SSH_KEY_PATH" -l hadoop "$@"
}
# Start the Spark thriftserver on the cluster, then for every file under
# DATA_PATH on S3: download it locally, scp it to the master node, and load
# it into HDFS. Sets the global DATA_FILES to the newline-separated list.
data_ops(){
ssh_cmd "${DNS_NAME}" "sudo /usr/lib/spark/sbin/start-thriftserver.sh" || warn "Could not start thriftserver."
DATA_FILES="$(aws_cmd s3 ls "${DATA_PATH}" | awk '{print $4}')"
[ -n "$DATA_FILES" ] || err "No data files found at ${DATA_PATH}."
# Copy one file at a time: the original concatenated the whole (possibly
# multi-line) list into a single `s3 cp` source URL, which only worked when
# exactly one file was present. Word-splitting of $DATA_FILES on the `for`
# line is intentional (filenames come from `aws s3 ls`, one per line).
local f
for f in $DATA_FILES ; do
  aws_cmd s3 cp "${DATA_PATH}${f}" . || err "S3 download failed for ${f}."
  scp -i "$SSH_KEY_PATH" "$f" "hadoop@${DNS_NAME}:/tmp/" || err "SCP failed."
  ssh_cmd "${DNS_NAME}" "hdfs dfs -put /tmp/${f}" || err "HDFS put failed."
done
}
# Entry point: validate the required argument, run the pipeline, report success.
main(){
if [[ -z "$DNS_NAME" ]] ; then
err "Please provide the DNS name of your EMR cluster."
fi
# Capture data_ops output and check its status explicitly: the original
# `success $(data_ops)` word-split the captured output and ignored a
# data_ops failure (a command substitution in an argument list does not
# trip `set -e`).
local output
output="$(data_ops)" || err "Data operations failed."
success "$output"
}
main
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment