@rluta
Last active September 30, 2021 15:00
EMR 6.3 Bootstrap script for Spark RAPIDS plugin

This gist contains four files: bootstrap-rapids.sh (the EMR bootstrap action), emr-rapids-configuration.json (the cluster configuration classifications), getGpusResources.sh (the Spark GPU discovery script), and a launch script that stages the artifacts in S3 and creates the cluster.
bootstrap-rapids.sh:

#!/bin/bash
set -ex
# S3 prefix where the RAPIDS jars and discovery script were staged
bucket_path=$1
export CUDF_VERSION=${CUDF_VERSION:-21.08.2-cuda11}
export RAPIDS_VERSION=${RAPIDS_VERSION:-2.12-21.08.0}
echo "Give YARN authorization to manage devices"
sudo chmod a+rwx -R /sys/fs/cgroup/cpu,cpuacct
sudo chmod a+rwx -R /sys/fs/cgroup/devices
echo "Install the cuda-compat-11-2"
sudo yum-config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel7/x86_64/cuda-rhel7.repo
sudo yum clean all
sudo yum -y install cuda-toolkit-11-2 cuda-compat-11-2 openssl11
echo "Clean-up default EMR jars"
sudo rm -f /usr/lib/spark/jars/rapids-*.jar
sudo rm -f /usr/share/aws/emr/spark-rapids/lib/rapids-*.jar
sudo rm -f /usr/lib/spark/jars/cudf-*.jar
sudo rm -f /usr/share/aws/emr/spark-rapids/lib/cudf-*.jar
sudo mkdir -p /usr/share/aws/emr/spark-rapids/lib/
sudo mkdir -p /usr/lib/spark/jars/
echo "Install cuDF and Spark RAPIDS"
sudo aws s3 cp "${bucket_path}/cudf-${CUDF_VERSION}.jar" "/usr/share/aws/emr/spark-rapids/lib/cudf-${CUDF_VERSION}.jar"
sudo ln -s "/usr/share/aws/emr/spark-rapids/lib/cudf-${CUDF_VERSION}.jar" "/usr/lib/spark/jars/cudf-${CUDF_VERSION}.jar"
sudo aws s3 cp "${bucket_path}/rapids-4-spark_${RAPIDS_VERSION}.jar" "/usr/share/aws/emr/spark-rapids/lib/rapids-4-spark_${RAPIDS_VERSION}.jar"
sudo ln -s "/usr/share/aws/emr/spark-rapids/lib/rapids-4-spark_${RAPIDS_VERSION}.jar" "/usr/lib/spark/jars/rapids-4-spark_${RAPIDS_VERSION}.jar"
echo "Install gpu discovery scripts"
sudo mkdir -p /usr/lib/spark/scripts/gpu/
sudo aws s3 cp "${bucket_path}/getGpusResources.sh" /usr/lib/spark/scripts/gpu/getGpusResources.sh
sudo chmod +x /usr/lib/spark/scripts/gpu/getGpusResources.sh
echo "Done"
emr-rapids-configuration.json:

[
  {
    "Classification": "spark",
    "Properties": {
      "enableSparkRapids": "false"
    }
  },
  {
    "Classification": "yarn-site",
    "Properties": {
      "yarn.nodemanager.resource-plugins": "yarn.io/gpu",
      "yarn.resource-types": "yarn.io/gpu",
      "yarn.nodemanager.resource-plugins.gpu.allowed-gpu-devices": "auto",
      "yarn.nodemanager.resource-plugins.gpu.path-to-discovery-executables": "/usr/bin",
      "yarn.nodemanager.linux-container-executor.cgroups.mount": "true",
      "yarn.nodemanager.linux-container-executor.cgroups.mount-path": "/sys/fs/cgroup",
      "yarn.nodemanager.linux-container-executor.cgroups.hierarchy": "yarn",
      "yarn.nodemanager.container-executor.class": "org.apache.hadoop.yarn.server.nodemanager.LinuxContainerExecutor"
    }
  },
  {
    "Classification": "container-executor",
    "Properties": {},
    "Configurations": [
      {
        "Classification": "gpu",
        "Properties": {
          "module.enabled": "true"
        }
      },
      {
        "Classification": "cgroups",
        "Properties": {
          "root": "/sys/fs/cgroup",
          "yarn-hierarchy": "yarn"
        }
      }
    ]
  },
  {
    "Classification": "capacity-scheduler",
    "Properties": {
      "yarn.scheduler.capacity.resource-calculator": "org.apache.hadoop.yarn.util.resource.DominantResourceCalculator"
    }
  },
  {
    "Classification": "spark-defaults",
    "Properties": {
      "spark.plugins": "com.nvidia.spark.SQLPlugin",
      "spark.rapids.sql.enabled": "true",
      "spark.executor.resource.gpu.discoveryScript": "/usr/lib/spark/scripts/gpu/getGpusResources.sh",
      "spark.executor.extraLibraryPath": "/usr/local/cuda-11.2/targets/x86_64-linux/lib:/usr/local/cuda-11.2/extras/CUPTI/lib64:/usr/local/cuda-11.2/compat/:/usr/local/cuda-11.2/lib:/usr/local/cuda-11.2/lib64:/usr/lib/hadoop/lib/native:/docker/usr/lib/hadoop/lib/native",
      "spark.rapids.shims-provider-override": "com.nvidia.spark.rapids.shims.spark311.SparkShimServiceProvider",
      "spark.kryo.registrator": "com.nvidia.spark.rapids.GpuKryoRegistrator",
      "spark.rapids.sql.concurrentGpuTasks": "2",
      "spark.executor.resource.gpu.amount": "1",
      "spark.executor.cores": "4",
      "spark.executor.memory": "9000M",
      "spark.task.cpus": "1",
      "spark.task.resource.gpu.amount": "0.25",
      "spark.rapids.memory.pinnedPool.size": "0",
      "spark.executor.memoryOverhead": "2G",
      "spark.sql.sources.useV1SourceList": "",
      "spark.sql.sources.ignoreDataLocality.enabled": "true",
      "spark.sql.files.maxPartitionBytes": "402653184",
      "spark.rapids.sql.incompatibleOps.enabled": "true",
      "spark.rapids.sql.decimalType.enabled": "true"
    }
  }
]
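
Note how the task-level settings fit together (a worked reading of the numbers above, not part of the original gist):

  executor task slots = spark.executor.cores / spark.task.cpus = 4 / 1 = 4
  GPU share per task  = spark.executor.resource.gpu.amount / 4  = 1 / 4 = 0.25

All four task slots can therefore share the executor's single GPU, while spark.rapids.sql.concurrentGpuTasks=2 caps how many of those tasks execute on the GPU at once. The spark classification sets enableSparkRapids to false because this setup disables EMR's bundled RAPIDS integration and swaps in the jars staged by bootstrap-rapids.sh instead.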
getGpusResources.sh:

#!/usr/bin/env bash
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# This script is a basic example script to get resource information about NVIDIA GPUs.
# It assumes the drivers are properly installed and the nvidia-smi command is available.
# It is not guaranteed to work on all setups so please test and customize as needed
# for your environment. It can be passed into SPARK via the config
# spark.{driver/executor}.resource.gpu.discoveryScript to allow the driver or executor to discover
# the GPUs it was allocated. It assumes you are running within an isolated container where the
# GPUs are allocated exclusively to that driver or executor.
# It outputs a JSON formatted string that is expected by the
# spark.{driver/executor}.resource.gpu.discoveryScript config.
#
# Example output: {"name": "gpu", "addresses":["0","1","2","3","4","5","6","7"]}
ADDRS=$(nvidia-smi --query-gpu=index --format=csv,noheader | sed -e ':a' -e 'N' -e '$!ba' -e 's/\n/","/g')
echo {\"name\": \"gpu\", \"addresses\":[\"$ADDRS\"]}
Cluster launch script (run from a machine with the AWS CLI configured):

#!/bin/bash
set -e
# Random suffix so the staging bucket name is globally unique
id=$(openssl rand -hex 8)
bucket_uri=s3://bootstrap-emr-${id}
aws s3 mb ${bucket_uri} --region eu-west-1
# Fetch the RAPIDS and cuDF jars matching the versions expected by bootstrap-rapids.sh
wget https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/21.08.0/rapids-4-spark_2.12-21.08.0.jar
wget https://repo1.maven.org/maven2/ai/rapids/cudf/21.08.2/cudf-21.08.2-cuda11.jar
# Stage the jars, bootstrap script and discovery script in S3
aws s3 sync . ${bucket_uri}/
aws emr create-cluster \
--release-label emr-6.3.0 \
--applications Name=Hadoop Name=Spark \
--service-role EMR_DefaultRole \
--ec2-attributes KeyName=my-key-pair,InstanceProfile=EMR_EC2_DefaultRole \
--instance-groups InstanceGroupType=MASTER,InstanceCount=1,InstanceType=m5a.2xlarge \
InstanceGroupType=CORE,InstanceCount=1,InstanceType=g4dn.2xlarge \
InstanceGroupType=TASK,InstanceCount=1,InstanceType=g4dn.xlarge \
--configurations file://emr-rapids-configuration.json \
--bootstrap-actions Name='Rapids 21.08.0 Bootstrap action',Path=${bucket_uri}/bootstrap-rapids.sh
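
Once the cluster is up, one way to confirm the plugin is active (a sketch, not part of the original gist) is to open spark-shell on the master node and inspect a query plan:

spark-shell
scala> spark.range(0, 1000000L).selectExpr("id % 10 AS k").groupBy("k").count().explain()

With the RAPIDS plugin loaded, the physical plan should show GPU-prefixed operators such as GpuHashAggregate and GpuColumnarToRow in place of the CPU HashAggregate.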