@tamizhgeek
Created January 10, 2020 14:40
GoCD agent image with the EMR goodies needed to connect to an EMR cluster and execute Spark jobs
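
Example invocation of the build script below (the key path and cluster IP are placeholders):

  ./build_gocd_agent_with_emr_support.sh --ssh_key ~/.ssh/emr-master.pem --cluster_ip 10.0.12.34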
build_gocd_agent_with_emr_support.sh

#!/usr/bin/env bash
set -e

usage() {
  echo """
usage:
build_gocd_agent_with_emr_support.sh --ssh_key <SSH key used to log into EMR master> --cluster_ip <IP of the EMR master>
"""
}
if [ $# -eq 0 ]; then
  usage
  exit 1
fi

ssh_key=""
cluster_ip=""
git_revision=$(git rev-parse HEAD)

# parse the arguments
while : ; do
  case "$1" in
    --ssh_key)
      [ -n "${ssh_key}" ] && { usage; exit 1; }
      ssh_key="$2"
      shift 2 ;;
    --cluster_ip)
      [ -n "${cluster_ip}" ] && { usage; exit 1; }
      cluster_ip="$2"
      shift 2 ;;
    *)
      break ;;
  esac
done

# both arguments are required
if [ -z "${ssh_key}" ] || [ -z "${cluster_ip}" ]; then
  usage
  exit 1
fi

cleanup() {
  find . -name "hadoop-binaries-config*" | xargs rm -f
}
trap cleanup EXIT
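# Copy the Hadoop/Hive/Spark/Sqoop binaries and configs off the EMR master and bundle them into a tarball under /tmp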
ssh -i "$ssh_key" "ec2-user@${cluster_ip}" <<EOF
rm -rf /tmp/hadoop-binaries-configs
echo "creating dirs..."
mkdir -p /tmp/hadoop-binaries-configs/configs/hadoop
mkdir -p /tmp/hadoop-binaries-configs/configs/hive
mkdir -p /tmp/hadoop-binaries-configs/configs/spark
mkdir -p /tmp/hadoop-binaries-configs/configs/sqoop
echo "copying configs now"
cp -rL /etc/hadoop/conf /tmp/hadoop-binaries-configs/configs/hadoop
cp -rL /etc/hive/conf /tmp/hadoop-binaries-configs/configs/hive
cp -rL /etc/spark/conf /tmp/hadoop-binaries-configs/configs/spark
cp -rL /etc/sqoop/conf /tmp/hadoop-binaries-configs/configs/sqoop
cp -r /usr/lib/hadoop /tmp/hadoop-binaries-configs
cp -r /usr/lib/hadoop-mapreduce /tmp/hadoop-binaries-configs
cp -r /usr/lib/hadoop-lzo /tmp/hadoop-binaries-configs
cp -r /usr/lib/spark /tmp/hadoop-binaries-configs
cp -r /usr/lib/sqoop /tmp/hadoop-binaries-configs
cp -r /usr/lib/hadoop-hdfs /tmp/hadoop-binaries-configs
cp -r /usr/lib/hadoop-yarn /tmp/hadoop-binaries-configs
cp -r /usr/share/aws /tmp/hadoop-binaries-configs
cp -r /usr/share/java /tmp/hadoop-binaries-configs
cd /tmp
rm -rf hadoop-binaries-configs.tar.gz
echo "creating archive..."
tar -zcf hadoop-binaries-configs.tar.gz hadoop-binaries-configs
EOF
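# Pull the archive down from the master, then build, tag, and push the agent image to ECR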
repo_prefix="<REPLACE_WITH_YOUR_DOCKER_REPOSITORY_PREFIX>"
scp -i "$ssh_key" "ec2-user@${cluster_ip}:/tmp/hadoop-binaries-configs.tar.gz" .
docker build -t "${repo_prefix}-${cluster_ip}:latest" .
docker tag "${repo_prefix}-${cluster_ip}:latest" "${repo_prefix}-${cluster_ip}:${git_revision}"
$(aws ecr get-login --no-include-email --region eu-west-1)
docker push "${repo_prefix}-${cluster_ip}:${git_revision}"
Dockerfile

ARG GOCD_VERSION=19.5.0
FROM gocd/gocd-agent-centos-7:v$GOCD_VERSION
# install java and friends
RUN yum -y update && \
    yum install -y https://dl.fedoraproject.org/pub/epel/epel-release-latest-7.noarch.rpm && \
    yum install -y java-1.8.0-openjdk-devel xmlstarlet which
# setup EMR shenanigans
RUN mkdir /tmp/emr-hadoop-downloads
COPY hadoop-binaries-configs.tar.gz /tmp/emr-hadoop-downloads
### Copy hadoop, hive, and spark configurations
### Copy emr jars to the right locations
RUN tar xzf /tmp/emr-hadoop-downloads/hadoop-binaries-configs.tar.gz -C /tmp/emr-hadoop-downloads/ && \
    cp -r /tmp/emr-hadoop-downloads/hadoop-binaries-configs/configs/hadoop /etc/hadoop && \
    cp -r /tmp/emr-hadoop-downloads/hadoop-binaries-configs/configs/hive /etc/hive && \
    cp -r /tmp/emr-hadoop-downloads/hadoop-binaries-configs/configs/spark /etc/spark && \
    cp -r /tmp/emr-hadoop-downloads/hadoop-binaries-configs/configs/sqoop /etc/sqoop && \
    mv /tmp/emr-hadoop-downloads/hadoop-binaries-configs/aws /usr/share/aws && \
    mv /tmp/emr-hadoop-downloads/hadoop-binaries-configs/hadoop /usr/lib/hadoop && \
    mv /tmp/emr-hadoop-downloads/hadoop-binaries-configs/hadoop-hdfs /usr/lib/hadoop-hdfs && \
    mv /tmp/emr-hadoop-downloads/hadoop-binaries-configs/hadoop-mapreduce /usr/lib/hadoop-mapreduce && \
    mv /tmp/emr-hadoop-downloads/hadoop-binaries-configs/hadoop-yarn /usr/lib/hadoop-yarn && \
    mv /tmp/emr-hadoop-downloads/hadoop-binaries-configs/hadoop-lzo /usr/lib/hadoop-lzo && \
    mv /tmp/emr-hadoop-downloads/hadoop-binaries-configs/spark /usr/lib/spark && \
    mv /tmp/emr-hadoop-downloads/hadoop-binaries-configs/sqoop /usr/lib/sqoop && \
    cp -R /tmp/emr-hadoop-downloads/hadoop-binaries-configs/java/* /usr/share/java/ && \
    rm -rf /tmp/emr-hadoop-downloads
### Update SPARK and HADOOP environment variables.
ENV JAVA_HOME=/usr/lib/jvm/java-1.8.0/
ENV HADOOP_HOME=/usr/lib/hadoop
ENV HADOOP_HDFS_HOME=/usr/lib/hadoop-hdfs
ENV HADOOP_MAPREDUCE_HOME=/usr/lib/hadoop-mapreduce
ENV HADOOP_YARN_HOME=/usr/lib/hadoop-yarn
ENV HADOOP_LIBEXEC_DIR=/usr/lib/hadoop/libexec
ENV SPARK_HOME=/usr/lib/spark
ENV SQOOP_HOME=/usr/lib/sqoop
ENV PATH=${PATH:-}:${SPARK_HOME:-}/bin:${HADOOP_HOME:-}/bin:${SQOOP_HOME:-}/bin:/usr/lib/hadoop-hdfs/bin
# ENV PYTHONPATH=${PYTHONPATH:-}:${SPARK_HOME:-}/python/
# ENV PYTHONPATH=${PYTHONPATH:-}:${SPARK_HOME:-}/python/lib/py4j-0.10.7-src.zip
ENV HADOOP_CONF_DIR=/etc/hadoop/conf
ENV YARN_CONF_DIR=/etc/hadoop/conf
ENV SPARK_CONF_DIR=/etc/spark/conf
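# Talk to the cluster as the hdfs user (assumes the cluster runs without Kerberos, so HADOOP_USER_NAME picks the effective user)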
ENV HADOOP_USER_NAME=hdfs
#### Temp directories for hadoop
RUN mkdir /mnt1 /mnt2 /mnt3 && \
    chown go:root -R /mnt /mnt1 /mnt2 /mnt3 && \
    mkdir /var/log/spark && \
    chown go:root -R /var/log/spark
### Update mapred-site.xml with the DynamoDB endpoint and S3 credential placeholders
RUN xmlstarlet edit -L --omit-decl \
    -s '//configuration' -t elem -n "property" \
    -s "//configuration/property[last()]" -t elem -n "name" -v "fs.s3.consistent.dynamodb.endpoint" \
    -s "//configuration/property[last()]" -t elem -n "value" -v "dynamodb.eu-west-1.amazonaws.com" \
    -s '//configuration' -t elem -n "property" \
    -s "//configuration/property[last()]" -t elem -n "name" -v "fs.s3.awsAccessKeyId" \
    -s "//configuration/property[last()]" -t elem -n "value" -v "_ACCESS_KEY_ID_" \
    -s '//configuration' -t elem -n "property" \
    -s "//configuration/property[last()]" -t elem -n "name" -v "fs.s3.awsSecretAccessKey" \
    -s "//configuration/property[last()]" -t elem -n "value" -v "_SECRET_KEY_" \
    $HADOOP_CONF_DIR/mapred-site.xml
### set some default spark config variables
RUN echo "spark.yarn.am.nodeLabelExpression " >> /etc/spark/conf/spark-defaults.conf && \
echo "spark.dynamicAllocation.enabled true" >> /etc/spark/conf/spark-defaults.conf
# override entrypoint to supply credentials at runtime
COPY pimped_entrypoint.sh /
ENTRYPOINT ["/pimped_entrypoint.sh"]
pimped_entrypoint.sh

#!/bin/bash
set -e

# Inject the secrets into the Hadoop config at container start
sed -i "s/_ACCESS_KEY_ID_/${ACCESS_KEY_ID}/g" "$HADOOP_CONF_DIR/mapred-site.xml"
sed -i "s~_SECRET_KEY_~${SECRET_KEY}~g" "$HADOOP_CONF_DIR/mapred-site.xml"

# Hand off to the stock GoCD agent entrypoint
exec /docker-entrypoint.sh "$@"
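
A minimal run sketch for the resulting agent image, with placeholder values; GO_SERVER_URL is the variable the gocd-agent base images read, while ACCESS_KEY_ID and SECRET_KEY are the two secrets the entrypoint substitutes into mapred-site.xml:

  docker run -d \
    -e GO_SERVER_URL="https://<your-gocd-server>:8154/go" \
    -e ACCESS_KEY_ID="<aws-access-key-id>" \
    -e SECRET_KEY="<aws-secret-access-key>" \
    "<REPLACE_WITH_YOUR_DOCKER_REPOSITORY_PREFIX>-<cluster_ip>:latest"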