@Gatsby-Lee
Created November 30, 2023 06:50
Sample Dockerfile to create a custom EMR on EKS image
##
# @note: To improve build-cache reuse, adding the JARs comes first.
# If the Python dependency installation (step 3, step 4) ran first, the JAR-adding
# layers would rarely hit the intermediate image cache, because those steps change
# frequently and invalidate every layer after them (see the sketch below).
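# Illustrative layer ordering (a sketch only; the requirements/pip lines are
# hypothetical and not part of this image):
#   ADD some-lib.jar /usr/lib/spark/jars/   <-- stable; stays cached across builds
#   COPY requirements.txt .                 <-- changes often
#   RUN pip3 install -r requirements.txt    <-- a cache miss here leaves the JAR layer above intact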
#
# references
# - https://docs.aws.amazon.com/emr/latest/EMR-on-EKS-DevelopmentGuide/docker-custom-images-steps.html
# - https://docs.aws.amazon.com/emr/latest/EMR-on-EKS-DevelopmentGuide/docker-custom-images-tag.html
# @note: as of emr-6.9.0, the public ECR image can be used: "public.ecr.aws/emr-on-eks/spark/emr-6.9.0:latest"
##
# EMR 6.7.0 - Spark 3.2.1
# EMR 6.11.0 - Spark 3.3.2
# FAILED: tried the latest Spark 3.3.x image to pick up all available patches for Spark 3.3.x
ARG EMR_IMAGE_VERSION=emr-6.7.0:latest
# FROM public.ecr.aws/emr-on-eks/spark/${EMR_IMAGE_VERSION}
FROM 895885662937.dkr.ecr.us-west-2.amazonaws.com/spark/${EMR_IMAGE_VERSION}
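# To build against a different EMR release, override the build arg (sketch):
#   docker build --build-arg EMR_IMAGE_VERSION=emr-6.11.0:latest -t emr-custom .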
# step 0: preparation
USER root
# step 1: install OS-level packages
RUN yum -y install zip
# step 2: Add required JARs
# @note: PERMISSION: writing to JAR_HOME requires root permission.
# @note: The s3:// protocol doesn't work with ADD. Use a virtual-hosted-style HTTPS URL
#        or `RUN aws s3 cp` instead (see the sketch after the ARGs below).
# @note: These ARGs can't be declared before "FROM". An ARG defined before FROM has an
#        empty value after FROM unless it is redeclared.
# - ref: https://docs.docker.com/engine/reference/builder/#understand-how-arg-and-from-interact
ARG JAR_HOME=/usr/lib/spark/jars/
ARG SPARK_VERSION=3.2.1
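## Sketch of the two s3 alternatives (bucket and key names are hypothetical;
## `aws s3 cp` assumes the AWS CLI is available in the base image):
# ADD https://my-bucket.s3.us-west-2.amazonaws.com/jars/my-lib.jar $JAR_HOME
# RUN aws s3 cp s3://my-bucket/jars/my-lib.jar $JAR_HOME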
## REQUIRED to use Kafka as a data source.
# Spark-version dependent: the artifact version must match SPARK_VERSION.
# If missing: "pyspark.errors.exceptions.captured.AnalysisException: Failed to find data source: kafka. Please deploy the application as per the deployment section of Structured Streaming + Kafka Integration Guide."
ADD https://repo1.maven.org/maven2/org/apache/spark/spark-sql-kafka-0-10_2.12/${SPARK_VERSION}/spark-sql-kafka-0-10_2.12-${SPARK_VERSION}.jar $JAR_HOME
ADD https://repo1.maven.org/maven2/org/apache/spark/spark-token-provider-kafka-0-10_2.12/${SPARK_VERSION}/spark-token-provider-kafka-0-10_2.12-${SPARK_VERSION}.jar $JAR_HOME
# NOT tied to the Spark version.
# - If missing: "java.lang.NoClassDefFoundError: org/apache/commons/pool2/impl/GenericKeyedObjectPoolConfig"
ADD https://repo1.maven.org/maven2/org/apache/commons/commons-pool2/2.12.0/commons-pool2-2.12.0.jar $JAR_HOME
# - If missing: "java.lang.NoClassDefFoundError: org/apache/kafka/common/serialization/ByteArraySerializer"
ADD https://repo1.maven.org/maven2/org/apache/kafka/kafka-clients/3.6.0/kafka-clients-3.6.0.jar $JAR_HOME
# Files ADDed from a remote URL are written with 600 permissions; make them readable to all.
RUN chmod -R +r /usr/lib/spark/jars
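# Optional sanity check (a sketch; commented out, not required for the build):
# RUN ls -l ${JAR_HOME} | grep -E 'kafka|pool2'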
# step 3: switch back to the hadoop user
USER hadoop:hadoop
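##
# Build/push sketch (region, account, and repository name are assumptions; adjust to your setup):
#   # log in to the EMR on EKS base-image registry before building
#   aws ecr get-login-password --region us-west-2 \
#     | docker login --username AWS --password-stdin 895885662937.dkr.ecr.us-west-2.amazonaws.com
#   docker build -t <my-account-id>.dkr.ecr.us-west-2.amazonaws.com/emr-custom:latest .
#   docker push <my-account-id>.dkr.ecr.us-west-2.amazonaws.com/emr-custom:latest
##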