@aroraakshit
Created April 7, 2021 00:41
Custom GPU init actions
#!/bin/bash
# Dataproc configurations
readonly HADOOP_CONF_DIR='/etc/hadoop/conf'
readonly HIVE_CONF_DIR='/etc/hive/conf'
readonly SPARK_CONF_DIR='/etc/spark/conf'
# Retry a command up to 10 times, sleeping 5 seconds between attempts.
function execute_with_retries() {
  local -r cmd=$1
  for ((i = 0; i < 10; i++)); do
    if eval "$cmd"; then
      return 0
    fi
    sleep 5
  done
  return 1
}
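
# Illustrative note (not part of the original gist): execute_with_retries is
# not called anywhere in this script, but it could wrap flaky commands, e.g.:
#   execute_with_retries "apt-get -y -q update"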
# Set a property in a Hadoop/YARN XML config file, overwriting any existing value.
function set_hadoop_property() {
  local -r config_file=$1
  local -r property=$2
  local -r value=$3
  bdconfig set_property \
    --configuration_file "${HADOOP_CONF_DIR}/${config_file}" \
    --name "${property}" --value "${value}" \
    --clobber
}
# Register the yarn.io/gpu resource type and switch the capacity scheduler to
# the DominantResourceCalculator so GPU requests are taken into account.
function configure_yarn() {
  if [[ ! -f ${HADOOP_CONF_DIR}/resource-types.xml ]]; then
    printf '<?xml version="1.0" ?>\n<configuration/>' >"${HADOOP_CONF_DIR}/resource-types.xml"
  fi
  set_hadoop_property 'resource-types.xml' 'yarn.resource-types' 'yarn.io/gpu'
  set_hadoop_property 'capacity-scheduler.xml' \
    'yarn.scheduler.capacity.resource-calculator' \
    'org.apache.hadoop.yarn.util.resource.DominantResourceCalculator'
  set_hadoop_property 'yarn-site.xml' 'yarn.resource-types' 'yarn.io/gpu'
}
# Enable the NodeManager GPU resource plugin and run containers through the
# LinuxContainerExecutor with cgroups so GPU access can be isolated per container.
function configure_yarn_nodemanager() {
  set_hadoop_property 'yarn-site.xml' 'yarn.nodemanager.resource-plugins' 'yarn.io/gpu'
  set_hadoop_property 'yarn-site.xml' \
    'yarn.nodemanager.resource-plugins.gpu.allowed-gpu-devices' 'auto'
  set_hadoop_property 'yarn-site.xml' \
    'yarn.nodemanager.resource-plugins.gpu.path-to-discovery-executables' '/usr/bin'
  set_hadoop_property 'yarn-site.xml' \
    'yarn.nodemanager.linux-container-executor.cgroups.mount' 'true'
  set_hadoop_property 'yarn-site.xml' \
    'yarn.nodemanager.linux-container-executor.cgroups.mount-path' '/sys/fs/cgroup'
  set_hadoop_property 'yarn-site.xml' \
    'yarn.nodemanager.linux-container-executor.cgroups.hierarchy' 'yarn'
  set_hadoop_property 'yarn-site.xml' \
    'yarn.nodemanager.container-executor.class' \
    'org.apache.hadoop.yarn.server.nodemanager.LinuxContainerExecutor'
  set_hadoop_property 'yarn-site.xml' 'yarn.nodemanager.linux-container-executor.group' 'yarn'

  # Fix local-dirs access permissions so the yarn user can write to them.
  # readarray keeps the ',' delimiter in each element; strip it before chown.
  local yarn_local_dirs=()
  readarray -d ',' yarn_local_dirs < <(bdconfig get_property_value \
    --configuration_file "${HADOOP_CONF_DIR}/yarn-site.xml" \
    --name "yarn.nodemanager.local-dirs" 2>/dev/null | tr -d '\n')
  chown yarn:yarn -R "${yarn_local_dirs[@]/,/}"
}
# On Spark versions before 3.x, which cannot schedule GPUs themselves, put the
# GPUs into exclusive-process mode so only one process can use each GPU at a time.
function configure_gpu_exclusive_mode() {
  local spark_version
  spark_version=$(spark-submit --version 2>&1 | sed -n 's/.*version[[:blank:]]\+\([0-9]\+\.[0-9]\).*/\1/p' | head -n1)
  if [[ ${spark_version} != 3.* ]]; then
    nvidia-smi -c EXCLUSIVE_PROCESS
  fi
}
# Install Spark's GPU discovery script and turn on GPU isolation in the
# LinuxContainerExecutor via container-executor.cfg and cgroups.
function configure_gpu_isolation() {
  # Download the GPU discovery script used to report GPU addresses to Spark
  local -r spark_gpu_script_dir='/usr/lib/spark/scripts/gpu'
  mkdir -p "${spark_gpu_script_dir}"
  local -r gpu_resources_url=https://raw.githubusercontent.com/apache/spark/master/examples/src/main/scripts/getGpusResources.sh
  curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \
    "${gpu_resources_url}" -o "${spark_gpu_script_dir}/getGpusResources.sh"
  chmod a+rwx -R "${spark_gpu_script_dir}"

  # Enable GPU isolation: point the container executor at the yarn group and
  # declare the GPU module and cgroups hierarchy in container-executor.cfg
  sed -i 's/yarn\.nodemanager\.linux-container-executor\.group=/yarn.nodemanager.linux-container-executor.group=yarn/g' \
    "${HADOOP_CONF_DIR}/container-executor.cfg"
  printf '\n[gpu]\nmodule.enabled=true\n[cgroups]\nroot=/sys/fs/cgroup\nyarn-hierarchy=yarn\n' >>"${HADOOP_CONF_DIR}/container-executor.cfg"
  chmod a+rwx -R /sys/fs/cgroup/cpu,cpuacct
  chmod a+rwx -R /sys/fs/cgroup/devices
}
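
# The sketch below is an addition, not part of the original gist: assuming
# Spark 3 reads ${SPARK_CONF_DIR}/spark-defaults.conf, it shows how the
# discovery script installed above could be wired into Spark's standard GPU
# resource settings. The amounts are illustrative assumptions only, and
# main() deliberately does not call this function.
function configure_spark_gpu_defaults() {
  cat >>"${SPARK_CONF_DIR}/spark-defaults.conf" <<EOF
spark.executor.resource.gpu.amount=1
spark.executor.resource.gpu.discoveryScript=/usr/lib/spark/scripts/gpu/getGpusResources.sh
spark.task.resource.gpu.amount=0.25
EOF
}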
function main() {
  # This configuration should run on all nodes regardless of attached GPUs
  configure_yarn
  configure_yarn_nodemanager
  configure_gpu_isolation
  configure_gpu_exclusive_mode
}

main
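
# Example invocation (an assumption, not part of the original gist): the script
# is written in the style of a Dataproc initialization action, e.g. after
# copying it to a GCS bucket of your choosing:
#
#   gcloud dataproc clusters create my-gpu-cluster \
#     --region=us-central1 \
#     --initialization-actions=gs://my-bucket/custom-gpu-init.sh
#
# Cluster, region, and bucket names are placeholders; GPU accelerator flags and
# driver installation are assumed to be handled elsewhere.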