@simonvanderveldt
Created February 20, 2020 16:19
PySpark Docker image
# This Dockerfile is a combination of kubernetes/dockerfiles/spark/Dockerfile and
# kubernetes/dockerfiles/spark/bindings/python/Dockerfile
FROM debian:buster-slim as builder
WORKDIR /tmp
RUN apt-get update && apt-get install -y wget
RUN wget http://ftp.nluug.nl/internet/apache/spark/spark-3.0.0-preview2/spark-3.0.0-preview2-bin-hadoop2.7.tgz && tar -zxvf spark-3.0.0-preview2-bin-hadoop2.7.tgz
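# NOTE (assumption): Apache mirrors such as ftp.nluug.nl typically only carry current
# releases, so the preview2 tarball may no longer be available there. The Apache
# archive (https://archive.apache.org/dist/spark/) keeps old releases and can be
# used as an alternative download location.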
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
FROM openjdk:8-jdk-slim
ARG spark_uid=185
# Before building the docker image, first build and make a Spark distribution following
# the instructions in http://spark.apache.org/docs/latest/building-spark.html.
# If this docker file is being used in the context of building your images from a Spark
# distribution, the docker build command should be invoked from the top level directory
# of the Spark distribution. E.g.:
# docker build -t spark:latest -f kubernetes/dockerfiles/spark/Dockerfile .
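# Note that in this combined Dockerfile the Spark distribution is downloaded and
# unpacked in the builder stage above, so the image can be built from a directory
# containing only this Dockerfile, e.g.: docker build -t pyspark:3.0.0-preview2 .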
RUN set -ex && \
    apt-get update && \
    ln -s /lib /lib64 && \
    # tini is used as the init process so that signals are forwarded and zombie
    # processes are reaped
    apt-get install -y bash tini libc6 libpam-modules krb5-user libnss3 && \
    mkdir -p /opt/spark && \
    mkdir -p /opt/spark/examples && \
    mkdir -p /opt/spark/work-dir && \
    touch /opt/spark/RELEASE && \
    rm /bin/sh && \
    ln -sv /bin/bash /bin/sh && \
    # Restrict su to users in the wheel group
    echo "auth required pam_wheel.so use_uid" >> /etc/pam.d/su && \
    # Make /etc/passwd group-writable so the entrypoint script can add an entry
    # when the container runs as an arbitrary (anonymous) UID
    chgrp root /etc/passwd && chmod ug+rw /etc/passwd && \
    rm -rf /var/cache/apt/*
COPY --from=builder /tmp/spark-3.0.0-preview2-bin-hadoop2.7/jars /opt/spark/jars
COPY --from=builder /tmp/spark-3.0.0-preview2-bin-hadoop2.7/bin /opt/spark/bin
COPY --from=builder /tmp/spark-3.0.0-preview2-bin-hadoop2.7/sbin /opt/spark/sbin
COPY --from=builder /tmp/spark-3.0.0-preview2-bin-hadoop2.7/kubernetes/dockerfiles/spark/entrypoint.sh /opt/
COPY --from=builder /tmp/spark-3.0.0-preview2-bin-hadoop2.7/examples /opt/spark/examples
COPY --from=builder /tmp/spark-3.0.0-preview2-bin-hadoop2.7/kubernetes/tests /opt/spark/tests
COPY --from=builder /tmp/spark-3.0.0-preview2-bin-hadoop2.7/data /opt/spark/data
ENV SPARK_HOME /opt/spark
WORKDIR /opt/spark/work-dir
RUN chmod g+w /opt/spark/work-dir
# PySpark
RUN mkdir ${SPARK_HOME}/python
# TODO: Investigate running both pip and pip3 via virtualenvs
RUN apt-get update && \
    apt-get install -y python python-pip && \
    apt-get install -y python3 python3-pip && \
    # Remove ensurepip since it adds no functionality (pip is already installed
    # on the image) and it just takes up ~1.6MB
    rm -r /usr/lib/python*/ensurepip && \
    pip install --upgrade pip setuptools && \
    # Python 3 packages can be installed with pip3
    # Remove the pip and apt caches to save space
    rm -r /root/.cache && rm -rf /var/cache/apt/*
COPY --from=builder /tmp/spark-3.0.0-preview2-bin-hadoop2.7/python/pyspark ${SPARK_HOME}/python/pyspark
COPY --from=builder /tmp/spark-3.0.0-preview2-bin-hadoop2.7/python/lib ${SPARK_HOME}/python/lib
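# entrypoint.sh comes from the Spark distribution (copied above); it launches the
# driver or executor command that Spark on Kubernetes passes in as container arguments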
ENTRYPOINT [ "/opt/entrypoint.sh" ]
# Specify the User that the actual main process will run as
USER ${spark_uid}
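
A minimal sketch of how the image might be built and smoke-tested by submitting the bundled Pi example to a Kubernetes cluster. The registry, image tag, API server address and service account name are placeholders, spark-submit is assumed to come from a local Spark 3.0.0-preview2 installation, and the service account is assumed to have the RBAC permissions described in the Spark on Kubernetes documentation:

docker build -t <your-registry>/pyspark:3.0.0-preview2 .
docker push <your-registry>/pyspark:3.0.0-preview2

spark-submit \
  --master k8s://https://<k8s-apiserver-host>:<k8s-apiserver-port> \
  --deploy-mode cluster \
  --name pyspark-pi \
  --conf spark.executor.instances=2 \
  --conf spark.kubernetes.container.image=<your-registry>/pyspark:3.0.0-preview2 \
  --conf spark.kubernetes.authenticate.driver.serviceAccountName=spark \
  local:///opt/spark/examples/src/main/python/pi.py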