spark-s3a-dockerfile
FROM openjdk:8-slim

# Variables that define which software versions to install.
ARG SPARK_VERSION
ARG HADOOP_VERSION=3.2.2
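# Note: SPARK_VERSION has no default and must be supplied with --build-arg
# (see the build sketch at the bottom of this file).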

# Install necessary libraries for running Spark.
# Install curl for the build process; it will be removed later.
RUN apt-get update && apt-get install -y --no-install-recommends \
    build-essential curl procps python python3 python3-pip python3-setuptools \
    && pip3 install wheel \
    && rm -rf /var/lib/apt/lists/*
RUN pip3 install avro-python3 jupyter matplotlib scikit-learn

# Download and install Spark.
ARG PACKAGE=spark-$SPARK_VERSION-bin-hadoop3.2
RUN curl https://archive.apache.org/dist/spark/spark-$SPARK_VERSION/$PACKAGE.tgz \
    | tar -xvz -C /opt/ \
    && ln -s /opt/$PACKAGE /opt/spark

# Download and extract the Hadoop binary package.
RUN curl https://archive.apache.org/dist/hadoop/core/hadoop-$HADOOP_VERSION/hadoop-$HADOOP_VERSION.tar.gz \
    | tar -xvz -C /opt/ \
    && ln -s /opt/hadoop-$HADOOP_VERSION /opt/hadoop \
    && rm -r /opt/hadoop/share/doc

# Add the S3A jars to the classpath using this hack.
# Note: HADOOP_OPTIONAL_TOOLS should have worked, but is not correctly picked up by YARN apps.
RUN ln -s /opt/hadoop/share/hadoop/tools/lib/hadoop-aws* /opt/hadoop/share/hadoop/common/lib/ && \
    ln -s /opt/hadoop/share/hadoop/tools/lib/aws-java-sdk* /opt/hadoop/share/hadoop/common/lib/
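# For reference, a minimal sketch of using the S3A connector once the image is
# running; the endpoint, credentials, and bucket below are placeholder
# assumptions, not part of this gist:
#   spark-submit \
#     --conf spark.hadoop.fs.s3a.endpoint=https://s3.example.com \
#     --conf spark.hadoop.fs.s3a.access.key=EXAMPLE_ACCESS_KEY \
#     --conf spark.hadoop.fs.s3a.secret.key=EXAMPLE_SECRET_KEY \
#     job.py s3a://example-bucket/path/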

# Remove curl.
RUN apt-get --purge remove -y curl && apt-get autoremove -y

# Add 'spark' user so that this cluster is not run as root.
RUN groupadd -g 1080 spark && \
    useradd -r -m -u 1080 -g spark spark && \
    mkdir -p /opt/hadoop/logs && \
    chown -R -L spark:spark /opt/hadoop /opt/spark

USER spark
WORKDIR /home/spark

# Set necessary environment variables.
ENV HADOOP_HOME="/opt/hadoop"
ENV SPARK_HOME="/opt/spark"
ENV PATH="/opt/spark/bin:/opt/hadoop/bin:${PATH}"

# $(hadoop classpath) is expanded at build time, baking the Hadoop jars into Spark's classpath.
RUN echo "export SPARK_DIST_CLASSPATH=$(hadoop classpath)" >> /opt/spark/conf/spark-env.sh

# Keep the container running with an idle entrypoint.
ENTRYPOINT ["tail", "-f", "/dev/null"]
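
# A minimal sketch of building and running this image; the image tag and the
# Spark version below are example assumptions, not part of the original gist:
#   docker build --build-arg SPARK_VERSION=3.1.2 -t spark-s3a .
#   docker run -d --name spark spark-s3a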