@pracucci
Created May 23, 2018 16:29
Compile and run Spark JobServer for Amazon EMR
FROM ubuntu:16.04
# Config
ARG SBT_VERSION=0.13.12
ARG SJS_VERSION=0.8.0
ARG EMR_VERSION=5.13.0
ARG SPARK_VERSION=2.3.0
# Install JDK 8 and some dependencies
RUN apt-get update -qq && apt-get install -y -qq openjdk-8-jdk wget python python-pip
RUN pip install py4j pyhocon pep8 pyspark
# Ensure the default Java max heap size is greater than ~600MB, otherwise
# the Python SubprocessSpec.scala tests won't run
RUN if [ $(java -XX:+PrintFlagsFinal -version 2> /dev/null | grep MaxHeapSize | awk '{ print $4 }') -lt 629145600 ]; then \
        echo "Please run Docker with more memory (i.e. on OSX you can increase it from Docker Preferences)"; \
        exit 1; \
    fi
# Install sbt (dependency)
RUN wget https://dl.bintray.com/sbt/debian/sbt-${SBT_VERSION}.deb && \
    dpkg -i sbt-${SBT_VERSION}.deb && \
    rm -f sbt-${SBT_VERSION}.deb
# Define the SPARK_VERSION to compile against (read by project/Versions.scala)
ENV SPARK_VERSION="${SPARK_VERSION}"
# Download Spark JobServer sources
RUN wget https://github.com/spark-jobserver/spark-jobserver/archive/v${SJS_VERSION}.tar.gz
RUN tar -zxf v${SJS_VERSION}.tar.gz
# Cherry-pick PR #1008 to fix the context info route when "context-per-jvm = true"
RUN cd spark-jobserver-${SJS_VERSION} && \
    wget https://github.com/spark-jobserver/spark-jobserver/pull/1008.diff && \
    patch -p1 < 1008.diff
# Compile Spark JobServer from sources. We run assembly in a dedicated
# step because it's likely to fail, yet we don't want to lose the benefits
# of Docker layer caching - given "sbt package" is very slow
RUN cd spark-jobserver-${SJS_VERSION} && sbt clean update package
RUN cd spark-jobserver-${SJS_VERSION} && sbt assembly
# Install config files
ADD config/emr-${EMR_VERSION}.conf spark-jobserver-${SJS_VERSION}/config/emr-${EMR_VERSION}.conf
ADD config/emr-${EMR_VERSION}.sh spark-jobserver-${SJS_VERSION}/config/emr-${EMR_VERSION}.sh
ADD config/shiro.ini spark-jobserver-${SJS_VERSION}/config/shiro.ini
# Package
RUN cd spark-jobserver-${SJS_VERSION} && ./bin/server_package.sh emr-${EMR_VERSION}
# Rename the package, adding the EMR version
RUN mv /tmp/job-server/job-server.tar.gz /tmp/job-server/spark-job-server-${SJS_VERSION}-emr-${EMR_VERSION}.tar.gz
# Copy the tests jar into the job-server package output dir so that it's easier to pick up
RUN cp spark-jobserver-${SJS_VERSION}/job-server-tests/target/scala-2.11/job-server-tests_2.11-${SJS_VERSION}.jar /tmp/job-server/spark-job-server-tests-${SJS_VERSION}.jar
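One way to build the image and pull the packaged artifacts out of it is sketched below (assuming this Dockerfile sits next to the config/ directory described further down, and using the default SJS_VERSION / EMR_VERSION build args; the image and container names are arbitrary):

    docker build -t spark-jobserver-emr .
    docker create --name sjs-build spark-jobserver-emr
    docker cp sjs-build:/tmp/job-server/spark-job-server-0.8.0-emr-5.13.0.tar.gz .
    docker cp sjs-build:/tmp/job-server/spark-job-server-tests-0.8.0.jar .
    docker rm sjs-build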
spark {
  # spark.master will be passed to each job's JobContext
  master = "yarn-client"
  # This needs to match SPARK_HOME for cluster SparkContexts to be created successfully
  home = "/usr/lib/spark"

  jobserver {
    port = 8090
    # Required when master = "yarn-client"
    context-per-jvm = true

    # Note: JobFileDAO is deprecated from v0.7.0 because of issues in
    # production and will be removed in future, now defaults to H2 file.
    jobdao = spark.jobserver.io.JobSqlDAO

    filedao {
      rootdir = /mnt/tmp/spark-jobserver/filedao/data
    }

    sqldao {
      # Slick database driver, full classpath
      slick-driver = slick.driver.H2Driver

      # JDBC driver, full classpath
      jdbc-driver = org.h2.Driver

      # Directory where default H2 driver stores its data. Only needed for H2.
      rootdir = /mnt/tmp/spark-jobserver/sqldao/data

      # Full JDBC URL / init string, along with username and password. Sorry, needs to match above.
      # Substitutions may be used to launch job-server, but leave it out here in the default or tests won't pass
      jdbc {
        url = "jdbc:h2:file:/mnt/tmp/spark-jobserver/sqldao/data/h2-db"
        user = ""
        password = ""
      }

      # DB connection pool settings
      dbcp {
        enabled = false
      }
    }
  }

  # Predefined Spark contexts
  contexts {}

  # Default context settings (if not overridden by the specific context)
  context-settings {
    num-cpu-cores = 1
    memory-per-node = 1G
    spark.executor.instances = 2

    # If you wish to pass any settings directly to the sparkConf as-is, add them here in passthrough,
    # such as hadoop connection settings that don't use the "spark." prefix
    passthrough {
      #es.nodes = "192.1.1.1"
    }
  }
}
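Once the packaged job server is installed and started on the EMR master with the configuration above, it can be smoke-tested with the bundled tests jar. A sketch, assuming the default port 8090 from this config and the spark-job-server-tests jar produced by the Dockerfile (the app name and input string are placeholders taken from the upstream Spark JobServer README example):

    curl --data-binary @spark-job-server-tests-0.8.0.jar localhost:8090/jars/test
    curl -d "input.string = a b c a b see" "localhost:8090/jobs?appName=test&classPath=spark.jobserver.WordCountExample&sync=true"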
#
# Default values
#
appdir=${appdir:-}
conffile=${conffile:-}
if [ -z "$appdir" ]; then
# Set to the directory where this script is located
appdir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
fi
if [ -z "$conffile" ]; then
# Set the config file to be located in the same directored of this
# script, with the same name, but ending with .conf instead of .sh
conffile="$appdir/emr-5.13.0.conf"
fi
#
# Customizations
#
APP_USER=hadoop
APP_GROUP=hadoop
INSTALL_DIR=/mnt/lib/spark-jobserver
LOG_DIR=/mnt/var/log/spark-jobserver
PIDFILE=spark-jobserver.pid
JOBSERVER_MEMORY=1G
SPARK_VERSION=2.3.0
SPARK_HOME=/usr/lib/spark
SPARK_CONF_DIR=/etc/spark/conf
HADOOP_CONF_DIR=/etc/hadoop/conf
YARN_CONF_DIR=/etc/hadoop/conf
SCALA_VERSION=2.11.8
MANAGER_JAR_FILE="$appdir/spark-job-server.jar"
MANAGER_CONF_FILE="$(basename $conffile)"
MANAGER_EXTRA_JAVA_OPTIONS=
MANAGER_EXTRA_SPARK_CONFS="spark.yarn.submit.waitAppCompletion=false|spark.files=$appdir/log4jcluster.properties,$conffile"
MANAGER_LOGGING_OPTS="-Dlog4j.configuration=log4j-cluster.properties"
# Shiro is used for authentication. Authentication is disabled in our
# cluster, so we don't need it, but the file needs to exist in order
# for server_package.sh to run successfully.
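# For reference only: if authentication were ever turned on in the job
# server configuration, a minimal Shiro INI with a single static account
# could look like the commented stanza below (placeholder username,
# password and role, kept commented so this file stays effectively empty):
#
#   [users]
#   admin = changeme, admin
#
#   [roles]
#   admin = *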