@avcaliani · Last active October 16, 2021
🐳 Spark - Mini Cluster
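A minimal standalone Apache Spark cluster for local experiments, made of four files: a docker-compose.yml that wires up one master and one worker built from the same image, a Dockerfile that installs OpenJDK 11 and Spark 3.1.2 on top of python:3.9, an init-spark.sh entry point that starts each container in master or worker mode, and a log4j.properties that keeps console logging quiet.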
# docker-compose.yml
version: '3'
services:

  spark-master:
    user: root
    container_name: spark-master
    build: .
    image: spark
    ports:
      - "9090:8080"   # Master Web UI
      - "7077:7077"   # Master RPC port
    volumes:
      - ./spark/jobs:/jobs
      - ./DATALAKE:/DATALAKE

  spark-worker:
    user: root
    container_name: spark-worker
    build: .
    image: spark
    ports:
      - "9091:8080"   # Worker Web UI
      - "7000:7000"   # Worker port
    depends_on:
      - spark-master
    environment:
      - SPARK_WORKLOAD=worker
      - SPARK_LOCAL_IP=spark-worker
    volumes:
      - ./spark/jobs:/jobs
      - ./DATALAKE:/DATALAKE
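A minimal sketch of bringing the cluster up, assuming the file above is saved as docker-compose.yml next to the Dockerfile below:

docker-compose up -d --build
# Master UI: http://localhost:9090 · Worker UI: http://localhost:9091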
# Dockerfile - Spark 3.1.2 on top of the python:3.9 image
FROM python:3.9
ENV JAVA_HOME="/usr/lib/jvm/java-11-openjdk-amd64"
ENV SPARK_HOME="/opt/spark"
ENV SPARK_VERSION="3.1.2"
ENV HADOOP_VERSION="3.2"
ENV PATH="$SPARK_HOME/bin:$PATH"
ENV PYSPARK_PYTHON=python
ENV PATH="$SPARK_HOME/python:$PATH"
# Spark Config
ENV SPARK_WORKLOAD="master" \
    SPARK_LOCAL_IP="spark-master" \
    SPARK_LOG_DIR="/opt/spark/logs"
# Master Config
ENV SPARK_MASTER_PORT=7077 \
    SPARK_MASTER_WEBUI_PORT=8080 \
    SPARK_MASTER_LOG="/opt/spark/logs/spark-master.out"
# Worker Config
ENV SPARK_MASTER="spark://spark-master:7077" \
    SPARK_WORKER_WEBUI_PORT=8080 \
    SPARK_WORKER_PORT=7000 \
    SPARK_WORKER_LOG="/opt/spark/logs/spark-worker.out" \
    SPARK_WORKER_CORES=1 \
    SPARK_WORKER_MEMORY="1G" \
    SPARK_DRIVER_MEMORY="1G" \
    SPARK_EXECUTOR_MEMORY="1G"
WORKDIR /opt
# Java - OpenJDK 11 is pulled in from Debian sid
RUN echo "deb http://ftp.us.debian.org/debian sid main" >> /etc/apt/sources.list
RUN apt-get update && apt-get -y install gcc-10-base openjdk-11-jdk && apt-get -y autoremove
# Spark
ADD "https://archive.apache.org/dist/spark/spark-$SPARK_VERSION/spark-$SPARK_VERSION-bin-hadoop$HADOOP_VERSION.tgz" .
RUN tar -xzf spark*.tgz && rm -f spark*.tgz && mv spark* spark
# Spark Config
ADD log4j.properties /opt/spark/conf/log4j.properties
ADD init-spark.sh /
# TODO: Check if this is necessary
# Pre-create the log files and link them to the container's stdout,
# so master/worker output also shows up in `docker logs`.
RUN mkdir -p $SPARK_LOG_DIR \
    && touch $SPARK_MASTER_LOG \
    && touch $SPARK_WORKER_LOG \
    && ln -sf /dev/stdout $SPARK_MASTER_LOG \
    && ln -sf /dev/stdout $SPARK_WORKER_LOG
# TODO: Check if this is necessary
#EXPOSE 8080 7077 6066
WORKDIR /opt/spark
CMD ["/bin/bash", "/init-spark.sh"]
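Once the containers are running, a job from the mounted /jobs folder can be submitted through the master container. A hedged sketch; the script name /jobs/word_count.py is hypothetical:

docker exec spark-master \
    spark-submit --master spark://spark-master:7077 /jobs/word_count.py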
#!/bin/bash -xe
# init-spark.sh - shared entry point: starts a master or a worker
# depending on $SPARK_WORKLOAD.
# Activating Spark environment variables
. "/opt/spark/bin/load-spark-env.sh"

if [ "$SPARK_WORKLOAD" == "master" ]; then
    # Master Node
    SPARK_MASTER_HOST=$(hostname)
    export SPARK_MASTER_HOST
    /opt/spark/bin/spark-class "org.apache.spark.deploy.master.Master" \
        --ip "$SPARK_MASTER_HOST" \
        --port "$SPARK_MASTER_PORT" \
        --webui-port "$SPARK_MASTER_WEBUI_PORT" \
        >>"$SPARK_MASTER_LOG"
elif [ "$SPARK_WORKLOAD" == "worker" ]; then
    # Worker Node
    /opt/spark/bin/spark-class "org.apache.spark.deploy.worker.Worker" \
        --webui-port "$SPARK_WORKER_WEBUI_PORT" \
        "$SPARK_MASTER" \
        >>"$SPARK_WORKER_LOG"
else
    echo "Undefined workload type '$SPARK_WORKLOAD', must be 'master' or 'worker'" >&2
    exit 1
fi
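Because the Dockerfile links both log files to /dev/stdout, either node's output can be followed with the standard Docker CLI:

docker logs -f spark-master    # or: docker logs -f spark-worker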
# log4j.properties - quieter console logging for Spark
log4j.rootCategory=INFO, console
# Spark :: Log :: Console Appender
log4j.appender.console=org.apache.log4j.ConsoleAppender
log4j.appender.console.target=System.err
log4j.appender.console.layout=org.apache.log4j.PatternLayout
log4j.appender.console.layout.ConversionPattern=%d{yyyy/MM/dd HH:mm:ss} %p [%t] %c: %m%n
# Spark :: Log :: Spark Stuff
log4j.logger.org.apache.spark=WARN
log4j.logger.org.apache.hadoop=ERROR
log4j.logger.org.sparkproject.jetty=WARN
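For reference, a line emitted through the ConversionPattern above would look roughly like this (illustrative timestamp and message):

2021/10/16 22:19:00 WARN [main] org.apache.spark.SparkContext: illustrative message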