Set up pyspark + rapids
#!/bin/bash --login
# To work properly, this script must be run on an instance that meets the
# following prerequisites:
# - an NVIDIA GPU (Turing or more recent)
# - an adequate GPU kernel driver
# - a Miniconda installation
# - a JDK (8 or 11)
set -ex
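# Optional sanity check (a sketch; non-fatal): warn early if the prerequisites
# listed above do not appear to be met.
for tool in nvidia-smi conda java; do
  command -v "$tool" >/dev/null 2>&1 || echo "WARN: '$tool' not found in PATH" >&2
done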
RAPIDS_VERSION=${RAPIDS_VERSION:-22.02.0}
CUDF_VERSION=${CUDF_VERSION:-22.02.0}
function install_conda_env() {
  name=${1:-rapids}
  conda create -n "$name" -y -c rapidsai -c nvidia -c conda-forge -c defaults python=3.8 pip cudatoolkit=11.5 pyspark=3.2.1
  # Resolve the env prefix; match the env name exactly ('*' marks the active env)
  CONDA_PREFIX=$(conda env list | awk -v n="$name" '$1 == n {print $NF}')
  mkdir -p "$CONDA_PREFIX/etc/conda/activate.d/"
  # Expose the env's CUDA libraries to processes started from the activated env
  echo "export LD_LIBRARY_PATH=${CONDA_PREFIX}/lib:\$LD_LIBRARY_PATH" > "${CONDA_PREFIX}/etc/conda/activate.d/cuda.sh"
}
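# Optional verification sketch (assumes the default 'rapids' env name used
# throughout this script): confirm pyspark is importable from the new env.
function check_conda_env() {
  name=${1:-rapids}
  conda run -n "$name" python -c "import pyspark; print('pyspark', pyspark.__version__)"
}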
function install_rapids() {
  target_dir=${1:-$HOME}
  pushd "$target_dir"
  rm -f "rapids-4-spark_2.12-${RAPIDS_VERSION}.jar" "cudf-${CUDF_VERSION}-cuda11.jar"
  wget "https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/${RAPIDS_VERSION}/rapids-4-spark_2.12-${RAPIDS_VERSION}.jar"
  wget "https://repo1.maven.org/maven2/ai/rapids/cudf/${CUDF_VERSION}/cudf-${CUDF_VERSION}-cuda11.jar"
  popd
}
function install_rapids_tools() {
  target_dir=${1:-$HOME}
  conda_env=${2:-rapids}
  CONDA_PREFIX=$(conda env list | awk -v n="$conda_env" '$1 == n {print $NF}')
  pushd "$target_dir"
  rm -f "rapids-4-spark-tools_2.12-${RAPIDS_VERSION}.jar"
  wget "https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark-tools_2.12/${RAPIDS_VERSION}/rapids-4-spark-tools_2.12-${RAPIDS_VERSION}.jar"
  # Define qualification/profiling aliases on env activation
  cat >"${CONDA_PREFIX}/etc/conda/activate.d/rapids.sh" <<EOF
alias rapids_qualify="java -cp ${target_dir}/rapids-4-spark-tools_2.12-${RAPIDS_VERSION}.jar:\$SPARK_HOME/jars/* com.nvidia.spark.rapids.tool.qualification.QualificationMain"
alias rapids_profile="java -cp ${target_dir}/rapids-4-spark-tools_2.12-${RAPIDS_VERSION}.jar:\$SPARK_HOME/jars/* com.nvidia.spark.rapids.tool.profiling.ProfileMain"
EOF
  popd
}
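# Usage sketch for the aliases above (paths are illustrative): after running
# 'conda activate rapids', point the tools at Spark event log locations, e.g.
#   rapids_qualify ~/logs            # estimate which apps would benefit from the GPU
#   rapids_profile ~/logs/app-xyz    # drill into a single application's run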
function install_spark_conf() {
  target_dir=${1:-${HOME}}
  target_conf_dir="${target_dir}/conf"
  target_logs_dir="${target_dir}/logs"
  conda_env=${2:-rapids}
  mkdir -p "$target_conf_dir" "$target_logs_dir"
  cat >"${target_conf_dir}/spark-defaults.conf" <<EOF
# Example config options for Spark Rapids
# Basic enablers
spark.jars ${target_dir}/rapids-4-spark_2.12-${RAPIDS_VERSION}.jar,${target_dir}/cudf-${CUDF_VERSION}-cuda11.jar
spark.plugins com.nvidia.spark.SQLPlugin
spark.rapids.sql.enabled true
spark.rapids.sql.explain ALL
# Spark 3 and Rapids tuning
# 384M: 402653184
# 512M: 536870912
# 1G: 1073741824
# 1.5G: 1610612736
# 1.8G: 1932735284
# 2G: 2147483648
# An empty useV1SourceList routes all file formats through DataSource V2
spark.sql.sources.useV1SourceList
spark.rapids.sql.concurrentGpuTasks 2
spark.rapids.sql.batchSizeBytes 1932735284
spark.sql.files.maxPartitionBytes 402653184
# Needs to be adjusted based on real available memory
spark.rapids.memory.pinnedPool.size 2G
spark.rapids.memory.host.spillStorageSize 2G
# Enable event logs collection for profiling
spark.eventLog.enabled true
spark.eventLog.dir ${target_logs_dir}
spark.history.fs.logDirectory ${target_logs_dir}
# AQE settings
spark.sql.adaptive.enabled true
spark.sql.adaptive.advisoryPartitionSizeInBytes 384M
spark.sql.adaptive.coalescePartitions.initialPartitionNum 512
# Rapids compatibility options
# Percentiles, etc.
spark.rapids.sql.incompatibleOps.enabled true
# Decimal support
spark.rapids.sql.decimalType.enabled true
spark.rapids.sql.decimalOverflowGuarantees true
# Regexp activation
spark.rapids.sql.expression.RLike true
spark.rapids.sql.expression.RegExpReplace true
spark.rapids.sql.expression.RegExpExtract true
# CSV parsing
spark.rapids.sql.csv.read.bool.enabled true
spark.rapids.sql.csv.read.byte.enabled false
spark.rapids.sql.csv.read.date.enabled true
spark.rapids.sql.csv.read.double.enabled true
spark.rapids.sql.csv.read.float.enabled true
spark.rapids.sql.csv.read.integer.enabled true
spark.rapids.sql.csv.read.long.enabled true
spark.rapids.sql.csv.read.short.enabled false
spark.rapids.sql.csvTimestamps.enabled true
# JSON format support
spark.rapids.sql.format.json.enabled false
spark.rapids.sql.format.json.read.enabled false
# Casting support
spark.sql.ansi.enabled true
spark.rapids.sql.castFloatToDecimal.enabled true
spark.rapids.sql.castFloatToIntegralTypes.enabled true
spark.rapids.sql.castFloatToString.enabled true
spark.rapids.sql.castStringToFloat.enabled true
spark.rapids.sql.castStringToTimestamp.enabled true
EOF
  CONDA_PREFIX=$(conda env list | awk -v n="$conda_env" '$1 == n {print $NF}')
  cat >"${CONDA_PREFIX}/etc/conda/activate.d/spark.sh" <<EOF
export SPARK_CONF_DIR=$target_conf_dir
export SPARK_HOME=${CONDA_PREFIX}/lib/python3.8/site-packages/pyspark
# Derive a default driver heap from total system memory
if [[ -z "\$SPARK_DRIVER_MEMORY" ]]; then
  if [[ -r /proc/meminfo ]]; then
    # Linux and Cygwin: MemTotal is reported in kB
    total_mem=\$(grep MemTotal /proc/meminfo | egrep -o '[0-9]+')
  else
    # macOS: sum used and unused physical memory from 'top', normalized to kB
    total_mem=\$(top -l 1 -s 0 | grep ^PhysMem | egrep -o '[0-9]*[GM] *u?n?used' | sed -e 's/G/000000/g' -e 's/M/000/' | egrep -o '[0-9]+' | paste -s -d+ - | bc)
  fi
  total_mem=\$((\$total_mem / 2)) # Use half the system memory for the Spark Java heap
  if [[ \$total_mem -gt 0 ]]; then
    # Exported so the pyspark launch scripts pick it up
    export SPARK_DRIVER_MEMORY=\${total_mem}k
  fi
fi
EOF
}
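# Smoke-test sketch: with the plugin active, physical plans should contain Gpu*
# operators. For example, in a pyspark shell after 'conda activate rapids':
#   >>> spark.range(100).selectExpr("id % 10 AS k").groupBy("k").count().explain()
# should print operators such as GpuHashAggregate instead of HashAggregate.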
install_conda_env
install_rapids
install_rapids_tools
install_spark_conf
echo "Please run 'conda activate rapids' before launching pyspark"