Created
February 20, 2020 16:19
-
-
Save simonvanderveldt/4159a0a6f248935be1698d6f8283c977 to your computer and use it in GitHub Desktop.
Pyspark Docker image
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# This Dockerfile is a combination of kubernetes/dockerfiles/spark/Dockerfile and
# kubernetes/dockerfiles/spark/bindings/python/Dockerfile

# Builder stage: download and unpack the Spark distribution so the runtime
# stage can copy only the pieces it needs (jars, bin, sbin, python, ...).
FROM debian:buster-slim AS builder

WORKDIR /tmp

# ca-certificates is required for the HTTPS download below; clean the apt
# lists in the same layer so they never persist in an image layer.
RUN apt-get update && \
    apt-get install -y --no-install-recommends ca-certificates wget && \
    rm -rf /var/lib/apt/lists/*

# Download from archive.apache.org (permanent archive) rather than a mirror:
# mirrors drop superseded releases such as 3.0.0-preview2, which breaks the
# build; also switched from http:// to https://.
# NOTE(review): the archive is still unpinned by checksum — consider verifying
# against the Apache-published SHA512 before extracting.
RUN wget https://archive.apache.org/dist/spark/spark-3.0.0-preview2/spark-3.0.0-preview2-bin-hadoop2.7.tgz && \
    tar -zxf spark-3.0.0-preview2-bin-hadoop2.7.tgz
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Runtime stage: JDK 8 base for Spark executors/drivers.
FROM openjdk:8-jdk-slim

# UID the Spark processes run as (matches the upstream Apache Spark images).
ARG spark_uid=185

# Before building the docker image, first build and make a Spark distribution following
# the instructions in http://spark.apache.org/docs/latest/building-spark.html.
# If this docker file is being used in the context of building your images from a Spark
# distribution, the docker build command should be invoked from the top level directory
# of the Spark distribution. E.g.:
# docker build -t spark:latest -f kubernetes/dockerfiles/spark/Dockerfile .

# Use apt-get (stable CLI) rather than apt, skip recommended packages, and
# remove the apt package lists (not just /var/cache/apt) in the same layer.
# tini is used by the upstream entrypoint as a minimal init for signal
# handling and zombie reaping; krb5-user/libnss3 support secure HDFS access.
RUN set -ex && \
    apt-get update && \
    ln -s /lib /lib64 && \
    apt-get install -y --no-install-recommends bash krb5-user libc6 libnss3 libpam-modules tini && \
    mkdir -p /opt/spark/examples /opt/spark/work-dir && \
    touch /opt/spark/RELEASE && \
    # Make /bin/sh a bash symlink so entrypoint shell snippets behave uniformly.
    rm /bin/sh && \
    ln -sv /bin/bash /bin/sh && \
    # Restrict `su` to members of the wheel group.
    echo "auth required pam_wheel.so use_uid" >> /etc/pam.d/su && \
    # Group-writable /etc/passwd lets the entrypoint add a passwd entry when
    # the container runs under an arbitrary (e.g. OpenShift-assigned) UID.
    chgrp root /etc/passwd && chmod ug+rw /etc/passwd && \
    rm -rf /var/lib/apt/lists/* /var/cache/apt/*
# Copy only the runtime pieces of the Spark distribution from the builder
# stage; build-time tooling (wget, the tarball) never reaches this image.
COPY --from=builder /tmp/spark-3.0.0-preview2-bin-hadoop2.7/jars /opt/spark/jars
COPY --from=builder /tmp/spark-3.0.0-preview2-bin-hadoop2.7/bin /opt/spark/bin
COPY --from=builder /tmp/spark-3.0.0-preview2-bin-hadoop2.7/sbin /opt/spark/sbin
COPY --from=builder /tmp/spark-3.0.0-preview2-bin-hadoop2.7/kubernetes/dockerfiles/spark/entrypoint.sh /opt/
COPY --from=builder /tmp/spark-3.0.0-preview2-bin-hadoop2.7/examples /opt/spark/examples
COPY --from=builder /tmp/spark-3.0.0-preview2-bin-hadoop2.7/kubernetes/tests /opt/spark/tests
COPY --from=builder /tmp/spark-3.0.0-preview2-bin-hadoop2.7/data /opt/spark/data

# key=value form; the legacy space-separated ENV syntax is deprecated.
ENV SPARK_HOME=/opt/spark

# Scratch directory for Spark jobs; group write allows an arbitrary runtime
# UID (root group) to create files here.
WORKDIR /opt/spark/work-dir
RUN chmod g+w /opt/spark/work-dir
# PySpark
RUN mkdir -p ${SPARK_HOME}/python
# TODO: Investigate running both pip and pip3 via virtualenvs
# apt-get (not apt) for a stable CLI; --no-install-recommends and
# --no-cache-dir keep the layer small; clean apt lists in the same layer.
RUN apt-get update && \
    apt-get install -y --no-install-recommends python python-pip python3 python3-pip && \
    # We remove ensurepip since it adds no functionality since pip is
    # installed on the image and it just takes up 1.6MB on the image
    rm -r /usr/lib/python*/ensurepip && \
    pip install --no-cache-dir --upgrade pip setuptools && \
    # You may install with python3 packages by using pip3.6
    # rm -rf (not rm -r): with --no-cache-dir the cache dir may not exist,
    # and a failing rm would abort the build.
    rm -rf /root/.cache /var/lib/apt/lists/* /var/cache/apt/*

COPY --from=builder /tmp/spark-3.0.0-preview2-bin-hadoop2.7/python/pyspark ${SPARK_HOME}/python/pyspark
COPY --from=builder /tmp/spark-3.0.0-preview2-bin-hadoop2.7/python/lib ${SPARK_HOME}/python/lib
# Exec-form entrypoint: the upstream Spark entrypoint script dispatches
# between driver/executor roles based on its arguments.
ENTRYPOINT [ "/opt/entrypoint.sh" ]

# Specify the User that the actual main process will run as
# (non-root; defaults to 185, the conventional Spark UID — see ARG above).
USER ${spark_uid}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment