Created
April 7, 2021 00:41
-
-
Save aroraakshit/57f423836ca8798bdf51518e1800aae6 to your computer and use it in GitHub Desktop.
Custom GPU init actions
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash

# Dataproc configurations: directories that hold the Hadoop, Hive and Spark
# configuration files. HADOOP_CONF_DIR is the base path used by the functions
# below; the Hive and Spark directories are declared here but are not
# referenced in the visible part of this script.
readonly HADOOP_CONF_DIR='/etc/hadoop/conf'
readonly HIVE_CONF_DIR='/etc/hive/conf'
readonly SPARK_CONF_DIR='/etc/spark/conf'
#######################################
# Runs a command string repeatedly until it succeeds.
# Arguments:
#   $1 - command line to run. It is passed to `eval` in the current shell,
#        so it must come from a trusted source.
# Returns:
#   0 as soon as one attempt succeeds, 1 after 10 failed attempts.
#######################################
function execute_with_retries() {
  local -r cmd=$1
  local -r max_attempts=10
  local -r delay_seconds=5
  # Keep the counter local so a caller's own loop variable is not clobbered.
  local attempt
  for ((attempt = 1; attempt <= max_attempts; attempt++)); do
    if eval "${cmd}"; then
      return 0
    fi
    # Waiting after the last failed attempt would only delay the error.
    if ((attempt < max_attempts)); then
      sleep "${delay_seconds}"
    fi
  done
  return 1
}
#######################################
# Sets one property in a Hadoop configuration file through `bdconfig`,
# overwriting any existing value (--clobber).
# Globals:
#   HADOOP_CONF_DIR (read)
# Arguments:
#   $1 - configuration file name, relative to HADOOP_CONF_DIR
#   $2 - property name
#   $3 - property value
# Returns:
#   The exit status of `bdconfig`.
#######################################
function set_hadoop_property() {
  local -r config_file=$1
  local -r property=$2
  local -r value=$3
  bdconfig set_property \
    --configuration_file "${HADOOP_CONF_DIR}/${config_file}" \
    --name "${property}" --value "${value}" \
    --clobber
}
#######################################
# Registers the yarn.io/gpu resource type with YARN and switches the capacity
# scheduler to the DominantResourceCalculator.
# Globals:
#   HADOOP_CONF_DIR (read)
# Arguments:
#   None
#######################################
function configure_yarn() {
  local -r resource_types_file="${HADOOP_CONF_DIR}/resource-types.xml"

  # Create an empty configuration document if the file does not exist yet,
  # so there is a file to set the property in.
  if [[ ! -f "${resource_types_file}" ]]; then
    printf '<?xml version="1.0" ?>\n<configuration/>' >"${resource_types_file}"
  fi
  set_hadoop_property 'resource-types.xml' 'yarn.resource-types' 'yarn.io/gpu'

  set_hadoop_property 'capacity-scheduler.xml' \
    'yarn.scheduler.capacity.resource-calculator' \
    'org.apache.hadoop.yarn.util.resource.DominantResourceCalculator'

  set_hadoop_property 'yarn-site.xml' 'yarn.resource-types' 'yarn.io/gpu'
}
#######################################
# Configures the YARN NodeManager GPU resource plugin, the cgroups settings of
# the LinuxContainerExecutor, and hands ownership of the NodeManager local
# directories to yarn:yarn.
# Globals:
#   HADOOP_CONF_DIR (read)
# Arguments:
#   None
# Returns:
#   0 when no local directory is configured (ownership fix skipped),
#   otherwise the exit status of `chown`.
#######################################
function configure_yarn_nodemanager() {
  set_hadoop_property 'yarn-site.xml' 'yarn.nodemanager.resource-plugins' 'yarn.io/gpu'
  set_hadoop_property 'yarn-site.xml' \
    'yarn.nodemanager.resource-plugins.gpu.allowed-gpu-devices' 'auto'
  set_hadoop_property 'yarn-site.xml' \
    'yarn.nodemanager.resource-plugins.gpu.path-to-discovery-executables' '/usr/bin'
  set_hadoop_property 'yarn-site.xml' \
    'yarn.nodemanager.linux-container-executor.cgroups.mount' 'true'
  set_hadoop_property 'yarn-site.xml' \
    'yarn.nodemanager.linux-container-executor.cgroups.mount-path' '/sys/fs/cgroup'
  set_hadoop_property 'yarn-site.xml' \
    'yarn.nodemanager.linux-container-executor.cgroups.hierarchy' 'yarn'
  set_hadoop_property 'yarn-site.xml' \
    'yarn.nodemanager.container-executor.class' \
    'org.apache.hadoop.yarn.server.nodemanager.LinuxContainerExecutor'
  set_hadoop_property 'yarn-site.xml' 'yarn.nodemanager.linux-container-executor.group' 'yarn'

  # Fix local dirs access permissions.
  # The property value is a comma-separated list; -t drops the delimiter from
  # each element. bdconfig's stderr is silenced, so a missing property simply
  # yields an empty list.
  local raw_dirs=()
  readarray -d ',' -t raw_dirs < <(bdconfig get_property_value \
    --configuration_file "${HADOOP_CONF_DIR}/yarn-site.xml" \
    --name "yarn.nodemanager.local-dirs" 2>/dev/null | tr -d '\n')

  # Drop empty entries (unset property, doubled or trailing commas) so chown
  # is never called with an empty or missing operand.
  local yarn_local_dirs=()
  local dir
  for dir in "${raw_dirs[@]}"; do
    if [[ -n "${dir}" ]]; then
      yarn_local_dirs+=("${dir}")
    fi
  done

  if ((${#yarn_local_dirs[@]} == 0)); then
    echo 'yarn.nodemanager.local-dirs is empty; skipping ownership fix' >&2
    return 0
  fi
  chown -R yarn:yarn "${yarn_local_dirs[@]}"
}
#######################################
# Puts the GPUs into EXCLUSIVE_PROCESS compute mode unless Spark 3 is
# installed.
# The Spark version is the first "<major>.<minor-digit>" that follows the word
# "version" in the output of `spark-submit --version`. If no version can be
# parsed the value is empty, which is treated like "not Spark 3".
# Arguments:
#   None
#######################################
function configure_gpu_exclusive_mode() {
  # check if running spark 3, if not, enable GPU exclusive mode
  local spark_version
  # The command substitution also reads stderr (2>&1) when looking for the
  # version banner.
  spark_version=$(spark-submit --version 2>&1 \
    | sed -n 's/.*version[[:blank:]]\+\([0-9]\+\.[0-9]\).*/\1/p' \
    | head -n1)
  if [[ "${spark_version}" != 3.* ]]; then
    # include exclusive mode on GPU
    nvidia-smi -c EXCLUSIVE_PROCESS
  fi
}
#######################################
# Downloads Spark's GPU discovery script and enables GPU isolation in
# container-executor.cfg.
# Globals:
#   HADOOP_CONF_DIR (read)
# Arguments:
#   None
#######################################
function configure_gpu_isolation() {
  local -r spark_gpu_script_dir='/usr/lib/spark/scripts/gpu'
  local -r gpu_resources_url='https://raw.githubusercontent.com/apache/spark/master/examples/src/main/scripts/getGpusResources.sh'
  local -r executor_cfg="${HADOOP_CONF_DIR}/container-executor.cfg"

  # Download GPU discovery script
  # NOTE(review): the URL tracks the "master" branch, so the downloaded script
  # is not pinned to a Spark release - confirm this is intended.
  mkdir -p "${spark_gpu_script_dir}"
  curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \
    "${gpu_resources_url}" -o "${spark_gpu_script_dir}/getGpusResources.sh"
  chmod a+rwx -R "${spark_gpu_script_dir}"

  # enable GPU isolation
  # Replace the whole value of the group key rather than inserting "yarn" in
  # front of whatever is already there; this keeps the edit correct for a
  # pre-populated value and when the function runs more than once.
  sed -i \
    's/^\([[:space:]]*yarn\.nodemanager\.linux-container-executor\.group=\).*$/\1yarn/' \
    "${executor_cfg}"
  # The [gpu] and [cgroups] sections are appended on every call.
  printf '\n[gpu]\nmodule.enabled=true\n[cgroups]\nroot=/sys/fs/cgroup\nyarn-hierarchy=yarn\n' >>"${executor_cfg}"

  # "cpu,cpuacct" is a single directory name, not a list.
  chmod a+rwx -R /sys/fs/cgroup/cpu,cpuacct
  chmod a+rwx -R /sys/fs/cgroup/devices
}
#######################################
# Entry point: applies the YARN, NodeManager and GPU configuration steps in
# order.
# Arguments:
#   None used; any script arguments are forwarded but ignored.
#######################################
function main() {
  # This configuration should run on all nodes regardless of attached GPUs
  configure_yarn
  configure_yarn_nodemanager
  configure_gpu_isolation
  configure_gpu_exclusive_mode
}

main "$@"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment