@rluta
Last active September 30, 2021 15:00
EMR 6.3 Bootstrap script for Spark RAPIDS plugin

This gist contains four files: bootstrap-rapids.sh (the EMR bootstrap action), emr-rapids-configuration.json (the cluster configuration classifications), getGpusResources.sh (the Spark GPU discovery script), and a launch script that stages the artifacts in S3 and creates the cluster.
bootstrap-rapids.sh:

#!/bin/bash
set -ex
# S3 prefix where the RAPIDS jars and discovery script were staged
bucket_path=$1
export CUDF_VERSION=${CUDF_VERSION:-21.08.2-cuda11}
export RAPIDS_VERSION=${RAPIDS_VERSION:-2.12-21.08.0}
echo "Give YARN authorization to manage devices"
sudo chmod a+rwx -R /sys/fs/cgroup/cpu,cpuacct
sudo chmod a+rwx -R /sys/fs/cgroup/devices
echo "Install the cuda-compat-11-2"
sudo yum-config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel7/x86_64/cuda-rhel7.repo
sudo yum clean all
sudo yum -y install cuda-toolkit-11-2 cuda-compat-11-2 openssl11
echo "Clean-up default EMR jars"
sudo rm -f /usr/lib/spark/jars/rapids-*.jar
sudo rm -f /usr/share/aws/emr/spark-rapids/lib/rapids-*.jar
sudo rm -f /usr/lib/spark/jars/cudf-*.jar
sudo rm -f /usr/share/aws/emr/spark-rapids/lib/cudf-*.jar
sudo mkdir -p /usr/share/aws/emr/spark-rapids/lib/
sudo mkdir -p /usr/lib/spark/jars/
echo "Install cuDF and Spark RAPIDS"
sudo aws s3 cp "${bucket_path}/cudf-${CUDF_VERSION}.jar" "/usr/share/aws/emr/spark-rapids/lib/cudf-${CUDF_VERSION}.jar"
sudo ln -s "/usr/share/aws/emr/spark-rapids/lib/cudf-${CUDF_VERSION}.jar" "/usr/lib/spark/jars/cudf-${CUDF_VERSION}.jar"
sudo aws s3 cp "${bucket_path}/rapids-4-spark_${RAPIDS_VERSION}.jar" "/usr/share/aws/emr/spark-rapids/lib/rapids-4-spark_${RAPIDS_VERSION}.jar"
sudo ln -s "/usr/share/aws/emr/spark-rapids/lib/rapids-4-spark_${RAPIDS_VERSION}.jar" "/usr/lib/spark/jars/rapids-4-spark_${RAPIDS_VERSION}.jar"
echo "Install gpu discovery scripts"
sudo mkdir -p /usr/lib/spark/scripts/gpu/
sudo aws s3 cp "${bucket_path}/getGpusResources.sh" /usr/lib/spark/scripts/gpu/getGpusResources.sh
sudo chmod +x /usr/lib/spark/scripts/gpu/getGpusResources.sh
echo "Done"
emr-rapids-configuration.json:

[
  {
    "Classification": "spark",
    "Properties": {
      "enableSparkRapids": "false"
    }
  },
  {
    "Classification": "yarn-site",
    "Properties": {
      "yarn.nodemanager.resource-plugins": "yarn.io/gpu",
      "yarn.resource-types": "yarn.io/gpu",
      "yarn.nodemanager.resource-plugins.gpu.allowed-gpu-devices": "auto",
      "yarn.nodemanager.resource-plugins.gpu.path-to-discovery-executables": "/usr/bin",
      "yarn.nodemanager.linux-container-executor.cgroups.mount": "true",
      "yarn.nodemanager.linux-container-executor.cgroups.mount-path": "/sys/fs/cgroup",
      "yarn.nodemanager.linux-container-executor.cgroups.hierarchy": "yarn",
      "yarn.nodemanager.container-executor.class": "org.apache.hadoop.yarn.server.nodemanager.LinuxContainerExecutor"
    }
  },
  {
    "Classification": "container-executor",
    "Properties": {},
    "Configurations": [
      {
        "Classification": "gpu",
        "Properties": {
          "module.enabled": "true"
        }
      },
      {
        "Classification": "cgroups",
        "Properties": {
          "root": "/sys/fs/cgroup",
          "yarn-hierarchy": "yarn"
        }
      }
    ]
  },
  {
    "Classification": "capacity-scheduler",
    "Properties": {
      "yarn.scheduler.capacity.resource-calculator": "org.apache.hadoop.yarn.util.resource.DominantResourceCalculator"
    }
  },
  {
    "Classification": "spark-defaults",
    "Properties": {
      "spark.plugins": "com.nvidia.spark.SQLPlugin",
      "spark.rapids.sql.enabled": "true",
      "spark.executor.resource.gpu.discoveryScript": "/usr/lib/spark/scripts/gpu/getGpusResources.sh",
      "spark.executor.extraLibraryPath": "/usr/local/cuda-11.2/targets/x86_64-linux/lib:/usr/local/cuda-11.2/extras/CUPTI/lib64:/usr/local/cuda-11.2/compat/:/usr/local/cuda-11.2/lib:/usr/local/cuda-11.2/lib64:/usr/lib/hadoop/lib/native:/docker/usr/lib/hadoop/lib/native",
      "spark.rapids.shims-provider-override": "com.nvidia.spark.rapids.shims.spark311.SparkShimServiceProvider",
      "spark.kryo.registrator": "com.nvidia.spark.rapids.GpuKryoRegistrator",
      "spark.rapids.sql.concurrentGpuTasks": "2",
      "spark.executor.resource.gpu.amount": "1",
      "spark.executor.cores": "4",
      "spark.executor.memory": "9000M",
      "spark.task.cpus": "1",
      "spark.task.resource.gpu.amount": "0.25",
      "spark.rapids.memory.pinnedPool.size": "0",
      "spark.executor.memoryOverhead": "2G",
      "spark.sql.sources.useV1SourceList": "",
      "spark.sql.sources.ignoreDataLocality.enabled": "true",
      "spark.sql.files.maxPartitionBytes": "402653184",
      "spark.rapids.sql.incompatibleOps.enabled": "true",
      "spark.rapids.sql.decimalType.enabled": "true"
    }
  }
]
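
Note how the task-level settings fit together (a worked reading of the numbers above, not part of the original gist):

  executor task slots = spark.executor.cores / spark.task.cpus = 4 / 1 = 4
  GPU share per task  = spark.executor.resource.gpu.amount / 4  = 1 / 4 = 0.25

All four task slots can therefore share the executor's single GPU, while spark.rapids.sql.concurrentGpuTasks=2 caps how many of those tasks execute on the GPU at once. The spark classification sets enableSparkRapids to false because this setup disables EMR's bundled RAPIDS integration and swaps in the jars staged by bootstrap-rapids.sh instead.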
getGpusResources.sh:

#!/usr/bin/env bash
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# This script is a basic example script to get resource information about NVIDIA GPUs.
# It assumes the drivers are properly installed and the nvidia-smi command is available.
# It is not guaranteed to work on all setups so please test and customize as needed
# for your environment. It can be passed into SPARK via the config
# spark.{driver/executor}.resource.gpu.discoveryScript to allow the driver or executor to discover
# the GPUs it was allocated. It assumes you are running within an isolated container where the
# GPUs are allocated exclusively to that driver or executor.
# It outputs a JSON formatted string that is expected by the
# spark.{driver/executor}.resource.gpu.discoveryScript config.
#
# Example output: {"name": "gpu", "addresses":["0","1","2","3","4","5","6","7"]}
ADDRS=$(nvidia-smi --query-gpu=index --format=csv,noheader | sed -e ':a' -e 'N' -e '$!ba' -e 's/\n/","/g')
echo {\"name\": \"gpu\", \"addresses\":[\"$ADDRS\"]}
Cluster launch script (run from a machine with the AWS CLI configured):

#!/bin/bash
set -e
# Random suffix so the staging bucket name is globally unique
id=$(openssl rand -hex 8)
bucket_uri=s3://bootstrap-emr-${id}
aws s3 mb ${bucket_uri} --region eu-west-1
# Fetch the RAPIDS and cuDF jars matching the versions expected by bootstrap-rapids.sh
wget https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/21.08.0/rapids-4-spark_2.12-21.08.0.jar
wget https://repo1.maven.org/maven2/ai/rapids/cudf/21.08.2/cudf-21.08.2-cuda11.jar
# Stage the jars, bootstrap script and discovery script in S3
aws s3 sync . ${bucket_uri}/
aws emr create-cluster \
--release-label emr-6.3.0 \
--applications Name=Hadoop Name=Spark \
--service-role EMR_DefaultRole \
--ec2-attributes KeyName=my-key-pair,InstanceProfile=EMR_EC2_DefaultRole \
--instance-groups InstanceGroupType=MASTER,InstanceCount=1,InstanceType=m5a.2xlarge \
InstanceGroupType=CORE,InstanceCount=1,InstanceType=g4dn.2xlarge \
InstanceGroupType=TASK,InstanceCount=1,InstanceType=g4dn.xlarge \
--configurations file://emr-rapids-configuration.json \
--bootstrap-actions Name='Rapids 21.08.0 Bootstrap action',Path=${bucket_uri}/bootstrap-rapids.sh
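
Once the cluster is up, one way to confirm the plugin is active (a sketch, not part of the original gist) is to open spark-shell on the master node and inspect a query plan:

spark-shell
scala> spark.range(0, 1000000L).selectExpr("id % 10 AS k").groupBy("k").count().explain()

With the RAPIDS plugin loaded, the physical plan should show GPU-prefixed operators such as GpuHashAggregate and GpuColumnarToRow in place of the CPU HashAggregate.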