@pracucci
Created May 23, 2018 16:29
Compile and run Spark JobServer for Amazon EMR
FROM ubuntu:16.04
# Config
ARG SBT_VERSION=0.13.12
ARG SJS_VERSION=0.8.0
ARG EMR_VERSION=5.13.0
ARG SPARK_VERSION=2.3.0
# Install JDK 8 and some dependencies
RUN apt-get update -qq && apt-get install -y -qq openjdk-8-jdk wget python python-pip
RUN pip install py4j pyhocon pep8 pyspark
# Ensure the default Java max heap size is greater than ~600MB, otherwise
# the Python SubprocessSpec.scala tests won't run
RUN if [ $(java -XX:+PrintFlagsFinal -version 2> /dev/null | grep MaxHeapSize | awk '{ print $4 }') -lt 629145600 ]; then \
        echo "Please run Docker with more memory (i.e. on OSX you can increase it from Docker Preferences)"; \
        exit 1; \
    fi
# Install sbt (dependency)
RUN wget https://dl.bintray.com/sbt/debian/sbt-${SBT_VERSION}.deb && \
    dpkg -i sbt-${SBT_VERSION}.deb && \
    rm -f sbt-${SBT_VERSION}.deb
# Define the SPARK_VERSION to compile against (read by project/Versions.scala)
ENV SPARK_VERSION="${SPARK_VERSION}"
# Download Spark JobServer sources
RUN wget https://github.com/spark-jobserver/spark-jobserver/archive/v${SJS_VERSION}.tar.gz
RUN tar -zxf v${SJS_VERSION}.tar.gz
# Cherry-pick PR #1008 to fix the context info route when "context-per-jvm = true"
RUN cd spark-jobserver-${SJS_VERSION} && \
    wget https://github.com/spark-jobserver/spark-jobserver/pull/1008.diff && \
    patch -p1 < 1008.diff
# Compile Spark JobServer from sources. We run assembly in a dedicated
# step because it's likely to fail, yet we don't want to lose the benefits
# of Docker layer caching - given "sbt package" is very slow
RUN cd spark-jobserver-${SJS_VERSION} && sbt clean update package
RUN cd spark-jobserver-${SJS_VERSION} && sbt assembly
# Install config files
ADD config/emr-${EMR_VERSION}.conf spark-jobserver-${SJS_VERSION}/config/emr-${EMR_VERSION}.conf
ADD config/emr-${EMR_VERSION}.sh spark-jobserver-${SJS_VERSION}/config/emr-${EMR_VERSION}.sh
ADD config/shiro.ini spark-jobserver-${SJS_VERSION}/config/shiro.ini
# Package
RUN cd spark-jobserver-${SJS_VERSION} && ./bin/server_package.sh emr-${EMR_VERSION}
# Rename the package, adding the EMR version
RUN mv /tmp/job-server/job-server.tar.gz /tmp/job-server/spark-job-server-${SJS_VERSION}-emr-${EMR_VERSION}.tar.gz
# Copy the tests jar into the job-server package output dir so that it's easier to pick up
RUN cp spark-jobserver-${SJS_VERSION}/job-server-tests/target/scala-2.11/job-server-tests_2.11-${SJS_VERSION}.jar /tmp/job-server/spark-job-server-tests-${SJS_VERSION}.jar
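One way to build the image and pull the packaged artifacts out of it is sketched below (assuming this Dockerfile sits next to the config/ directory described further down, and using the default SJS_VERSION / EMR_VERSION build args; the image and container names are arbitrary):

    docker build -t spark-jobserver-emr .
    docker create --name sjs-build spark-jobserver-emr
    docker cp sjs-build:/tmp/job-server/spark-job-server-0.8.0-emr-5.13.0.tar.gz .
    docker cp sjs-build:/tmp/job-server/spark-job-server-tests-0.8.0.jar .
    docker rm sjs-build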
spark {
  # spark.master will be passed to each job's JobContext
  master = "yarn-client"
  # This needs to match SPARK_HOME for cluster SparkContexts to be created successfully
  home = "/usr/lib/spark"

  jobserver {
    port = 8090
    # Required when master = "yarn-client"
    context-per-jvm = true

    # Note: JobFileDAO is deprecated from v0.7.0 because of issues in
    # production and will be removed in future, now defaults to H2 file.
    jobdao = spark.jobserver.io.JobSqlDAO

    filedao {
      rootdir = /mnt/tmp/spark-jobserver/filedao/data
    }

    sqldao {
      # Slick database driver, full classpath
      slick-driver = slick.driver.H2Driver

      # JDBC driver, full classpath
      jdbc-driver = org.h2.Driver

      # Directory where default H2 driver stores its data. Only needed for H2.
      rootdir = /mnt/tmp/spark-jobserver/sqldao/data

      # Full JDBC URL / init string, along with username and password. Sorry, needs to match above.
      # Substitutions may be used to launch job-server, but leave it out here in the default or tests won't pass
      jdbc {
        url = "jdbc:h2:file:/mnt/tmp/spark-jobserver/sqldao/data/h2-db"
        user = ""
        password = ""
      }

      # DB connection pool settings
      dbcp {
        enabled = false
      }
    }
  }

  # Predefined Spark contexts
  contexts {}

  # Default context settings (if not overridden by the specific context)
  context-settings {
    num-cpu-cores = 1
    memory-per-node = 1G
    spark.executor.instances = 2

    # If you wish to pass any settings directly to the sparkConf as-is, add them here in passthrough,
    # such as hadoop connection settings that don't use the "spark." prefix
    passthrough {
      #es.nodes = "192.1.1.1"
    }
  }
}
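Once the packaged job server is installed and started on the EMR master with the configuration above, it can be smoke-tested with the bundled tests jar. A sketch, assuming the default port 8090 from this config and the spark-job-server-tests jar produced by the Dockerfile (the app name and input string are placeholders taken from the upstream Spark JobServer README example):

    curl --data-binary @spark-job-server-tests-0.8.0.jar localhost:8090/jars/test
    curl -d "input.string = a b c a b see" "localhost:8090/jobs?appName=test&classPath=spark.jobserver.WordCountExample&sync=true"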
#
# Default values
#
appdir=${appdir:-}
conffile=${conffile:-}
if [ -z "$appdir" ]; then
# Set to the directory where this script is located
appdir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
fi
if [ -z "$conffile" ]; then
# Set the config file to be located in the same directored of this
# script, with the same name, but ending with .conf instead of .sh
conffile="$appdir/emr-5.13.0.conf"
fi
#
# Customizations
#
APP_USER=hadoop
APP_GROUP=hadoop
INSTALL_DIR=/mnt/lib/spark-jobserver
LOG_DIR=/mnt/var/log/spark-jobserver
PIDFILE=spark-jobserver.pid
JOBSERVER_MEMORY=1G
SPARK_VERSION=2.3.0
SPARK_HOME=/usr/lib/spark
SPARK_CONF_DIR=/etc/spark/conf
HADOOP_CONF_DIR=/etc/hadoop/conf
YARN_CONF_DIR=/etc/hadoop/conf
SCALA_VERSION=2.11.8
MANAGER_JAR_FILE="$appdir/spark-job-server.jar"
MANAGER_CONF_FILE="$(basename $conffile)"
MANAGER_EXTRA_JAVA_OPTIONS=
MANAGER_EXTRA_SPARK_CONFS="spark.yarn.submit.waitAppCompletion=false|spark.files=$appdir/log4jcluster.properties,$conffile"
MANAGER_LOGGING_OPTS="-Dlog4j.configuration=log4j-cluster.properties"
# Shiro is used for authentication. Authentication is disabled in our
# cluster, so we don't need it, but the file needs to exist in order
# for server_package.sh to run successfully.
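# For reference only: if authentication were ever turned on in the job
# server configuration, a minimal Shiro INI with a single static account
# could look like the commented stanza below (placeholder username,
# password and role, kept commented so this file stays effectively empty):
#
#   [users]
#   admin = changeme, admin
#
#   [roles]
#   admin = *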