Last active
September 1, 2022 08:35
-
-
Save ruslanmv/9518aa1113c48a9002266f7bd3b012a0 to your computer and use it in GitHub Desktop.
AWS Pyspark Docker Container
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Copyright (c) Jupyter Development Team. | |
# Distributed under the terms of the Modified BSD License. | |
# The base image is parameterised so it can be replaced at build time, e.g.
#   docker build --build-arg OWNER=myorg .
# NOTE(review): no tag is given, so the registry's default tag is pulled unless
# BASE_CONTAINER is overridden with a tagged or digest-pinned reference.
ARG OWNER=jupyter
ARG BASE_CONTAINER=${OWNER}/scipy-notebook
FROM ${BASE_CONTAINER}

LABEL maintainer="Jupyter Project <jupyter@googlegroups.com>"

# hadolint DL4006: run every RUN under bash with pipefail, so a failure in any
# stage of a pipeline fails the build step.
SHELL ["/bin/bash", "-o", "pipefail", "-c"]

# The build steps that follow are executed as root.
USER root
# Spark build parameters.
# Each default may be overridden with --build-arg. The ARG names are lower case
# on purpose, to tell them apart from the upper-case ENV variables below.
ARG spark_version="3.1.2"
ARG hadoop_version="3.2"
ARG spark_checksum="2385CB772F21B014CE2ABD6B8F5E815721580D6E8BC42A26D70BBCDDA8D303D886A6F12B36D40F6971B5547B70FAE62B5A96146F0421CB93D4E51491308EF5D5"
ARG openjdk_version="11"
ARG aws_hadoop_version="3.2.0"
ARG aws_sdk_version="1.11.375"

# Persist the selected versions in the image environment.
# JUPYTER_ENABLE_LAB is read by the base image, not by this file
# (presumably it switches the default UI to JupyterLab; confirm against the
# base image documentation).
ENV APACHE_SPARK_VERSION="${spark_version}" \
    HADOOP_VERSION="${hadoop_version}" \
    AWS_HADOOP_VERSION="${aws_hadoop_version}" \
    AWS_SDK_VERSION="${aws_sdk_version}" \
    JUPYTER_ENABLE_LAB=yes
# OS packages: the AWS CLI, the Java CA certificate bundle, curl, and a headless
# JRE whose major version is selected by the openjdk_version build arg.
# The apt caches and package lists are removed in this same layer so they never
# reach the image.
RUN apt-get update --yes \
 && apt-get install --yes --no-install-recommends \
        awscli \
        ca-certificates-java \
        curl \
        "openjdk-${openjdk_version}-jre-headless" \
 && apt-get clean \
 && rm -rf /var/lib/apt/lists/*
# Install the Python requirements.
# The requirements file is staged under /tmp, so the step does not depend on
# whatever WORKDIR the base image left active. It is deleted in the same layer
# that consumes it; a removal in a later layer would not shrink the image.
# --no-cache-dir is passed to both pip invocations so that no download cache is
# written into the layer.
COPY requirements.txt /tmp/requirements.txt
RUN pip3 install --no-cache-dir --upgrade pip && \
    pip3 install --no-cache-dir -r /tmp/requirements.txt && \
    rm /tmp/requirements.txt
# Download, verify and unpack the Spark distribution.
WORKDIR /tmp
RUN spark_archive="spark-${APACHE_SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz" && \
    wget -q "https://archive.apache.org/dist/spark/spark-${APACHE_SPARK_VERSION}/${spark_archive}" && \
    # Abort the build unless the archive's SHA-512 digest equals spark_checksum
    echo "${spark_checksum} *${spark_archive}" | sha512sum -c - && \
    # Unpack below /usr/local with root ownership, then drop the tarball
    tar xzf "${spark_archive}" -C /usr/local --owner root --group root --no-same-owner && \
    rm "${spark_archive}"
WORKDIR /usr/local
# Spark runtime configuration.
ENV SPARK_HOME=/usr/local/spark
# SPARK_HOME is set by its own instruction above so that it can be expanded
# inside PATH here.
ENV PATH="${PATH}:${SPARK_HOME}/bin" \
    SPARK_OPTS="--driver-java-options=-Xms1024M --driver-java-options=-Xmx4096M --driver-java-options=-Dlog4j.logLevel=info"
RUN mkdir -p /usr/local/bin/before-notebook.d && \
    # Version-independent path: ${SPARK_HOME} -> spark-<version>-bin-hadoop<version>
    ln -s "spark-${APACHE_SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}" "${SPARK_HOME}" && \
    # Link Spark's config script into the before-notebook hook directory so that
    # PYTHONPATH is sourced automatically
    ln -s "${SPARK_HOME}/sbin/spark-config.sh" /usr/local/bin/before-notebook.d/spark-config.sh
# Java 11 / Apache Arrow compatibility: create spark-defaults.conf from the
# shipped template and enable Netty's reflective access for both the driver
# and the executors.
# see: https://github.com/apache/spark/pull/27356, https://spark.apache.org/docs/latest/#downloading
RUN spark_defaults="${SPARK_HOME}/conf/spark-defaults.conf" && \
    cp -p "${spark_defaults}.template" "${spark_defaults}" && \
    printf '%s\n' \
        'spark.driver.extraJavaOptions -Dio.netty.tryReflectionSetAccessible=true' \
        'spark.executor.extraJavaOptions -Dio.netty.tryReflectionSetAccessible=true' \
        >> "${spark_defaults}"
# Add the S3 connector jars to Spark's jars directory: hadoop-aws, the AWS SDK
# bundle and the JetS3t library.
# The JetS3t version is a build arg (lower case, like the other version args)
# so it can be overridden with --build-arg; the default keeps the previously
# hard-coded 0.9.4.
# All three jars are fetched by one quiet wget call in a single layer; every URL
# is quoted so the version expansions cannot be word-split or globbed.
# NOTE(review): these downloads are not checksum-verified, unlike the Spark
# archive; consider pinning digests for them as well.
ARG jets3t_version="0.9.4"
RUN maven_repo="https://repo1.maven.org/maven2" && \
    wget -q -P "${SPARK_HOME}/jars/" \
        "${maven_repo}/org/apache/hadoop/hadoop-aws/${AWS_HADOOP_VERSION}/hadoop-aws-${AWS_HADOOP_VERSION}.jar" \
        "${maven_repo}/com/amazonaws/aws-java-sdk-bundle/${AWS_SDK_VERSION}/aws-java-sdk-bundle-${AWS_SDK_VERSION}.jar" \
        "${maven_repo}/net/java/dev/jets3t/jets3t/${jets3t_version}/jets3t-${jets3t_version}.jar"
# Install pyarrow with mamba, purge the package caches in the same layer, then
# run fix-permissions over the conda directory and the notebook user's home.
# fix-permissions, CONDA_DIR and NB_USER are not defined in this file and are
# expected to come from the base image.
RUN mamba install --yes --quiet pyarrow \
 && mamba clean --all -f -y \
 && fix-permissions "${CONDA_DIR}" \
 && fix-permissions "/home/${NB_USER}"

# Leave root: the remaining build steps and the container itself run as NB_UID,
# starting in that user's home directory.
USER ${NB_UID}
WORKDIR "${HOME}"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
boto3 | |
s3fs | |
botocore |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment