One-node Spark cluster

This is a Docker Compose definition of a Hadoop + Spark + Livy cluster with a single node per service, built on the official apache/hadoop and apache/spark images. You can use it for development or testing purposes.

Example usage:

# build the custom nodemanager and livy images
docker compose build
# start the cluster in the background
docker compose up -d

# open a shell in one of the running containers
docker exec -it <container-name> /bin/bash

# stop and remove the containers
docker compose down
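Once the cluster is up, Spark code can also be submitted over HTTP through Livy, which listens on port 8998 (see the livy service in docker-compose.yaml below). A minimal sketch using curl; the PySpark snippet and the session id 0 are illustrative:

# create an interactive PySpark session
curl -s -X POST -H 'Content-Type: application/json' \
  -d '{"kind": "pyspark"}' \
  http://localhost:8998/sessions

# once the session state is "idle", run a statement in it (session id 0 assumed)
curl -s -X POST -H 'Content-Type: application/json' \
  -d '{"code": "sc.parallelize(range(100)).sum()"}' \
  http://localhost:8998/sessions/0/statements

# poll the statement for its result
curl -s http://localhost:8998/sessions/0/statements/0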
The Hadoop containers all load the env file below. The official apache/hadoop image's startup script expands environment variables of the form CORE-SITE.XML_<property>=<value> (and likewise HDFS-SITE.XML_, YARN-SITE.XML_, and so on) into entries in the matching Hadoop configuration file.

config
HADOOP_HOME=/opt/hadoop
# fs.default.name is the deprecated alias of fs.defaultFS; both point at the NameNode
CORE-SITE.XML_fs.default.name=hdfs://namenode
CORE-SITE.XML_fs.defaultFS=hdfs://namenode
HDFS-SITE.XML_dfs.namenode.rpc-address=namenode:8020
HDFS-SITE.XML_dfs.replication=1
HDFS-SITE.XML_dfs.permissions.enabled=false
MAPRED-SITE.XML_mapreduce.framework.name=yarn
MAPRED-SITE.XML_yarn.app.mapreduce.am.env=HADOOP_MAPRED_HOME=$HADOOP_HOME
MAPRED-SITE.XML_mapreduce.map.env=HADOOP_MAPRED_HOME=$HADOOP_HOME
MAPRED-SITE.XML_mapreduce.reduce.env=HADOOP_MAPRED_HOME=$HADOOP_HOME
YARN-SITE.XML_yarn.resourcemanager.hostname=resourcemanager
YARN-SITE.XML_yarn.nodemanager.pmem-check-enabled=false
YARN-SITE.XML_yarn.nodemanager.delete.debug-delay-sec=600
YARN-SITE.XML_yarn.nodemanager.vmem-check-enabled=false
YARN-SITE.XML_yarn.nodemanager.aux-services=mapreduce_shuffle
CAPACITY-SCHEDULER.XML_yarn.scheduler.capacity.maximum-applications=10000
CAPACITY-SCHEDULER.XML_yarn.scheduler.capacity.maximum-am-resource-percent=0.1
CAPACITY-SCHEDULER.XML_yarn.scheduler.capacity.resource-calculator=org.apache.hadoop.yarn.util.resource.DefaultResourceCalculator
CAPACITY-SCHEDULER.XML_yarn.scheduler.capacity.root.queues=default
CAPACITY-SCHEDULER.XML_yarn.scheduler.capacity.root.default.capacity=100
CAPACITY-SCHEDULER.XML_yarn.scheduler.capacity.root.default.user-limit-factor=1
CAPACITY-SCHEDULER.XML_yarn.scheduler.capacity.root.default.maximum-capacity=100
CAPACITY-SCHEDULER.XML_yarn.scheduler.capacity.root.default.state=RUNNING
CAPACITY-SCHEDULER.XML_yarn.scheduler.capacity.root.default.acl_submit_applications=*
CAPACITY-SCHEDULER.XML_yarn.scheduler.capacity.root.default.acl_administer_queue=*
CAPACITY-SCHEDULER.XML_yarn.scheduler.capacity.node-locality-delay=40
CAPACITY-SCHEDULER.XML_yarn.scheduler.capacity.queue-mappings=
CAPACITY-SCHEDULER.XML_yarn.scheduler.capacity.queue-mappings-override.enable=false
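To confirm that the containers picked up this configuration once the cluster is running, you can query it in place. A quick sketch (service names as defined in docker-compose.yaml below):

# the NameNode should report hdfs://namenode as the default filesystem
docker compose exec namenode hdfs getconf -confKey fs.defaultFS
# the single DataNode should appear in the HDFS report
docker compose exec namenode hdfs dfsadmin -report
# the NodeManager should be registered with the ResourceManager
docker compose exec resourcemanager yarn node -list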
docker-compose.yaml
version: "2"
services:
  namenode:
    image: apache/hadoop:3
    hostname: namenode
    command: ["hdfs", "namenode"]
    ports:
      - 9870:9870   # NameNode web UI
    env_file:
      - ./config
    environment:
      ENSURE_NAMENODE_DIR: "/tmp/hadoop-root/dfs/name"
    volumes:
      - /opt/hadoop
  datanode:
    image: apache/hadoop:3
    command: ["hdfs", "datanode"]
    env_file:
      - ./config
  resourcemanager:
    image: apache/hadoop:3
    hostname: resourcemanager
    command: ["yarn", "resourcemanager"]
    ports:
      - 8088:8088   # YARN ResourceManager web UI
    env_file:
      - ./config
  nodemanager:
    hostname: nodemanager
    build:
      context: .
      dockerfile: nodemanager.Dockerfile
      args:
        HADOOP_IMAGE: apache/hadoop:3
    image: nodemanager:3
    command: ["yarn", "nodemanager"]
    ports:
      - 8042:8042   # NodeManager web UI
    env_file:
      - ./config
  spark:
    image: apache/spark:latest
    entrypoint: ["/bin/bash", "-c"]
    command: "\"/opt/spark/sbin/start-history-server.sh && tail -f /dev/null\""
    ports:
      - 18080:18080   # Spark history server UI
    environment:
      HADOOP_HOME: /opt/hadoop/
      HADOOP_CONF_DIR: /opt/hadoop/etc/hadoop
    volumes:
      - /opt/spark
    volumes_from:
      - namenode   # reuse the Hadoop install and generated config from the namenode
    depends_on:
      - namenode
  livy:
    build:
      context: .
      dockerfile: livy.Dockerfile
      args:
        LIVY_VERSION: 0.7.1-incubating
    image: livy:0.7.1-incubating
    ports:
      - 8998:8998   # Livy REST API
    environment:
      SPARK_HOME: /opt/spark
      HADOOP_CONF_DIR: /opt/hadoop/etc/hadoop
      SPARK_MASTER: yarn
      SPARK_DEPLOY_MODE: cluster
      PYSPARK_PYTHON: /opt/rh/rh-python38/root/usr/bin/python3
      PYSPARK_DRIVER_PYTHON: /opt/rh/rh-python38/root/usr/bin/python3
    volumes_from:
      - spark      # Spark install from the spark container
      - namenode   # Hadoop install and config from the namenode
    depends_on:
      - spark
      - namenode
      - nodemanager
networks:
  cluster-dev_default:
    driver: bridge
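With everything up, a quick way to exercise the Spark-on-YARN path is to submit one of the bundled examples from inside the spark container. A hedged sketch; the examples jar name depends on the Spark version pulled by apache/spark:latest:

docker compose exec spark bash -c \
  '/opt/spark/bin/spark-submit \
     --master yarn \
     --deploy-mode cluster \
     --class org.apache.spark.examples.SparkPi \
     /opt/spark/examples/jars/spark-examples_*.jar 100'

The application should then appear in the YARN ResourceManager UI on port 8088.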
livy.Dockerfile
FROM eclipse-temurin:11-jre-focal
ARG LIVY_VERSION=0.7.1-incubating
ENV LIVY_HOME /opt/livy
ENV LIVY_CONF_DIR "${LIVY_HOME}/conf"
# Download the Livy binary release from the Apache archive
RUN curl --progress-bar -L --retry 3 \
    "https://archive.apache.org/dist/incubator/livy/${LIVY_VERSION}/apache-livy-${LIVY_VERSION}-bin.zip" \
    -o "./apache-livy-${LIVY_VERSION}-bin.zip"
# Unpack it into /opt/livy and create the logs directory Livy expects
RUN apt-get update && apt-get install -y unzip \
    && unzip -qq "./apache-livy-${LIVY_VERSION}-bin.zip" -d /opt \
    && mv "/opt/apache-livy-${LIVY_VERSION}-bin" "${LIVY_HOME}" \
    && rm -rf "./apache-livy-${LIVY_VERSION}-bin.zip" \
    && mkdir "${LIVY_HOME}/logs" \
    && chown -R root:root "${LIVY_HOME}"
COPY livy.entrypoint.sh /entrypoint.sh
RUN chmod +x /entrypoint.sh
ENTRYPOINT ["/entrypoint.sh"]
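The LIVY_VERSION build argument is supplied by docker-compose.yaml, but the image can also be built directly if you want to experiment with it. A sketch using the same version; whatever version you pick must match the archive URL pattern above:

docker build -f livy.Dockerfile \
  --build-arg LIVY_VERSION=0.7.1-incubating \
  -t livy:0.7.1-incubating .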
livy.entrypoint.sh
#!/bin/bash
# Configure Livy from environment variables, then start the server
if [[ -n "${SPARK_MASTER}" ]]; then
echo "livy.spark.master=${SPARK_MASTER}" >> "${LIVY_CONF_DIR}/livy.conf"
fi
if [[ -n "${SPARK_DEPLOY_MODE}" ]]; then
echo "livy.spark.deploy-mode=${SPARK_DEPLOY_MODE}" >> "${LIVY_CONF_DIR}/livy.conf"
fi
if [[ -n "${LOCAL_DIR_WHITELIST}" ]]; then
echo "livy.file.local-dir-whitelist=${LOCAL_DIR_WHITELIST}" >> "${LIVY_CONF_DIR}/livy.conf"
fi
if [[ -n "${ENABLE_HIVE_CONTEXT}" ]]; then
echo "livy.repl.enable-hive-context=${ENABLE_HIVE_CONTEXT}" >> "${LIVY_CONF_DIR}/livy.conf"
fi
if [[ -n "${LIVY_HOST}" ]]; then
echo "livy.server.host=${LIVY_HOST}" >> "${LIVY_CONF_DIR}/livy.conf"
fi
if [[ -n "${LIVY_PORT}" ]]; then
echo "livy.server.port=${LIVY_PORT}" >> "${LIVY_CONF_DIR}/livy.conf"
fi
"$LIVY_HOME/bin/livy-server" $@
nodemanager.Dockerfile
ARG HADOOP_IMAGE=apache/hadoop:3
FROM ${HADOOP_IMAGE}
# The apache/hadoop image is CentOS-based; install Python 3.8 from Software
# Collections so the YARN containers launched by this NodeManager have the
# interpreter that PYSPARK_PYTHON in docker-compose.yaml points at.
RUN sudo yum install -y centos-release-scl-rh
RUN sudo yum install -y rh-python38
ENV PATH /opt/rh/rh-python38/root/usr/local/bin:/opt/rh/rh-python38/root/usr/bin:$PATH
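To check that the interpreter referenced by PYSPARK_PYTHON is actually present on the NodeManager, you can ask the running container for its Python version; a small sketch:

docker compose exec nodemanager /opt/rh/rh-python38/root/usr/bin/python3 --version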