Last active
April 10, 2022 18:35
-
-
Save rluta/043ecf7a9079285de996571e85e3a65c to your computer and use it in GitHub Desktop.
Set-up pyspark + rapids
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash --login
# Set up pyspark + RAPIDS on a single machine.
#
# Pre-requisites on the target instance:
#   - an NVIDIA GPU (Turing or more recent)
#   - an adequate GPU kernel driver
#   - a miniconda installation
#   - a JDK installed (8 or 11)
set -ex

# Component versions; both may be overridden from the environment.
: "${RAPIDS_VERSION:=22.02.0}"
: "${CUDF_VERSION:=22.02.0}"
#######################################
# Create a conda env with python + cudatoolkit + pyspark, and install an
# activation hook that prepends the env's lib dir to LD_LIBRARY_PATH so
# the CUDA runtime libraries are found at pyspark launch time.
# Globals:   CONDA_PREFIX (written)
# Arguments: $1 - environment name (default: rapids)
#######################################
function install_conda_env() {
  name=${1:-rapids}
  conda create -n "$name" -y -c rapidsai -c nvidia -c conda-forge -c defaults python=3.8 pip cudatoolkit=11.5 pyspark=3.2.1
  # Strip the active-env '*' marker, then match the env name as an exact
  # field: a plain grep "^$name" would also match longer names such as
  # "rapids-test".
  CONDA_PREFIX=$(conda env list | tr '*' ' ' | awk -v n="$name" '$1 == n {print $2}')
  mkdir -p "${CONDA_PREFIX}/etc/conda/activate.d/"
  # \$LD_LIBRARY_PATH is escaped on purpose: it must expand when the hook
  # is sourced, not now.
  echo "export LD_LIBRARY_PATH=${CONDA_PREFIX}/lib:\$LD_LIBRARY_PATH" > "${CONDA_PREFIX}/etc/conda/activate.d/cuda.sh"
}
#######################################
# Download the RAPIDS accelerator and cudf jars into a target directory.
# Globals:   RAPIDS_VERSION, CUDF_VERSION (read)
# Arguments: $1 - download directory (default: $HOME)
#######################################
function install_rapids() {
  target_dir=${1:-$HOME}
  pushd "$target_dir"
  rapids_jar="rapids-4-spark_2.12-${RAPIDS_VERSION}.jar"
  cudf_jar="cudf-${CUDF_VERSION}-cuda11.jar"
  # Drop any stale copies before fetching fresh ones.
  rm -f "$rapids_jar" "$cudf_jar"
  wget "https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/${RAPIDS_VERSION}/${rapids_jar}"
  wget "https://repo1.maven.org/maven2/ai/rapids/cudf/${CUDF_VERSION}/${cudf_jar}"
  popd
}
#######################################
# Download the rapids-4-spark qualification/profiling tools jar and add
# shell aliases for it to the conda env's activation hooks.
# Globals:   RAPIDS_VERSION (read), CONDA_PREFIX (written)
# Arguments: $1 - download directory (default: $HOME)
#            $2 - conda env name (default: rapids)
#######################################
function install_rapids_tools() {
  target_dir=${1:-$HOME}
  conda_env=${2:-rapids}
  # Exact-match the env name (a grep "^$conda_env" prefix match would also
  # match longer env names); '*' marks the active env and is stripped first.
  CONDA_PREFIX=$(conda env list | tr '*' ' ' | awk -v n="$conda_env" '$1 == n {print $2}')
  # Create the hook directory in case install_conda_env has not run yet.
  mkdir -p "${CONDA_PREFIX}/etc/conda/activate.d/"
  pushd "$target_dir"
  rm -f "rapids-4-spark-tools_2.12-${RAPIDS_VERSION}.jar"
  wget "https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark-tools_2.12/${RAPIDS_VERSION}/rapids-4-spark-tools_2.12-${RAPIDS_VERSION}.jar"
  # NOTE(review): aliases are only expanded by interactive shells; the
  # escaped \$SPARK_HOME expands when the hook is sourced, not now.
  cat >"${CONDA_PREFIX}/etc/conda/activate.d/rapids.sh" <<EOF
alias rapids_qualify="java -cp ${target_dir}/rapids-4-spark-tools_2.12-${RAPIDS_VERSION}.jar:\$SPARK_HOME/jars/* com.nvidia.spark.rapids.tool.qualification.QualificationMain"
alias rapids_profile="java -cp ${target_dir}/rapids-4-spark-tools_2.12-${RAPIDS_VERSION}.jar:\$SPARK_HOME/jars/* com.nvidia.spark.rapids.tool.profiling.ProfileMain"
EOF
  popd
}
#######################################
# Write a spark-defaults.conf tuned for the RAPIDS plugin, plus an
# activation hook exporting SPARK_CONF_DIR / SPARK_HOME and sizing the
# driver heap to half the machine's memory.
# Globals:   RAPIDS_VERSION, CUDF_VERSION (read), CONDA_PREFIX (written)
# Arguments: $1 - base directory for conf/ and logs/ (default: $HOME)
#            $2 - conda env name (default: rapids)
#######################################
function install_spark_conf() {
  target_dir=${1:-${HOME}}
  target_conf_dir="${target_dir}/conf"
  target_logs_dir="${target_dir}/logs"
  conda_env=${2:-rapids}
  mkdir -p "$target_conf_dir" "$target_logs_dir"
  cat >"${target_conf_dir}/spark-defaults.conf" <<EOF
# Example config options for Spark Rapids
# Basic enablers
spark.jars ${target_dir}/rapids-4-spark_2.12-${RAPIDS_VERSION}.jar,${target_dir}/cudf-${CUDF_VERSION}-cuda11.jar
spark.plugins com.nvidia.spark.SQLPlugin
spark.rapids.sql.enabled true
spark.rapids.sql.explain ALL
# Spark 3 and Rapids tuning
# 384M: 402653184
# 512M: 536870912
# 1G: 1073741824
# 1.5G: 1610612736
# 1.8G: 1932735284
# 2G: 2147483648
spark.sql.sources.useV1SourceList
spark.rapids.sql.concurrentGpuTasks 2
spark.rapids.sql.batchSizeBytes 1932735284
spark.sql.files.maxPartitionBytes 402653184
# Needs to be adjusted based on real available memory
spark.rapids.memory.pinnedPool.size 2G
spark.rapids.memory.host.spillStorageSize 2G
# Enable event logs collection for profiling
spark.eventLog.enabled true
spark.eventLog.dir ${target_logs_dir}
spark.history.fs.logDirectory ${target_logs_dir}
# AQE settings
spark.sql.adaptive.enabled true
spark.sql.adaptive.advisoryPartitionSizeInBytes 384M
spark.sql.adaptive.coalescePartitions.initialPartitionNum 512
# Rapids compatibility options
# Percentiles, etc..
spark.rapids.sql.incompatibleOps.enabled true
# Decimal support
spark.rapids.sql.decimalType.enabled true
spark.rapids.sql.decimalOverflowGuarantees true
# Regexp activation
spark.rapids.sql.expression.RLike true
spark.rapids.sql.expression.RegExpReplace true
spark.rapids.sql.expression.RegExpExtract true
# CSV parsing
spark.rapids.sql.csv.read.bool.enabled true
spark.rapids.sql.csv.read.byte.enabled false
spark.rapids.sql.csv.read.date.enabled true
spark.rapids.sql.csv.read.double.enabled true
spark.rapids.sql.csv.read.float.enabled true
spark.rapids.sql.csv.read.integer.enabled true
spark.rapids.sql.csv.read.long.enabled true
spark.rapids.sql.csv.read.short.enabled false
spark.rapids.sql.csvTimestamps.enabled true
# JSON format support
spark.rapids.sql.format.json.enabled false
spark.rapids.sql.format.json.read.enabled false
# Casting support
spark.sql.ansi.enabled true
spark.rapids.sql.castFloatToDecimal.enabled true
spark.rapids.sql.castFloatToIntegralTypes.enabled true
spark.rapids.sql.castFloatToString.enabled true
spark.rapids.sql.castStringToFloat.enabled true
spark.rapids.sql.castStringToTimestamp.enabled true
EOF
  # Exact-match the env name ('*' marks the active env and is stripped);
  # a plain grep "^$conda_env" would also match longer env names.
  CONDA_PREFIX=$(conda env list | tr '*' ' ' | awk -v n="$conda_env" '$1 == n {print $2}')
  mkdir -p "${CONDA_PREFIX}/etc/conda/activate.d/"
  # Every \$ below is escaped so it expands when the hook is sourced at
  # env activation time, not while this heredoc is written.
  cat >"${CONDA_PREFIX}/etc/conda/activate.d/spark.sh" <<EOF
export SPARK_CONF_DIR=$target_conf_dir
export SPARK_HOME=${CONDA_PREFIX}/lib/python3.8/site-packages/pyspark
# Calculate total mem
if [[ -z "\$SPARK_DRIVER_MEMORY" ]]; then
  if [[ -r /proc/meminfo ]]; then
    # Linux and cygwin
    total_mem=\$(grep MemTotal /proc/meminfo | egrep -o '[0-9]+')
  else
    # MacOS
    total_mem=\$(top -l 1 -s 0 | grep ^PhysMem | egrep -o '[0-9]*[GM] *u?n?used' | sed -e 's/G/000000/g' -e 's/M/0000/' | egrep -o '[0-9]+' | paste -s -d+ - | bc)
  fi
  total_mem=\$((\$total_mem / 2)) # Use half the system memory for Spark Java Heap
  if [[ \$total_mem -gt 0 ]]; then
    # export so spark-submit/pyspark child processes actually see it
    export SPARK_DRIVER_MEMORY=\${total_mem}k
  fi
fi
EOF
}
# Run the full installation, every step with its defaults
# (env "rapids", files under $HOME).
for step in install_conda_env install_rapids install_rapids_tools install_spark_conf; do
  "$step"
done
echo "Please run 'conda activate rapids' before launching pyspark"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment