Slurm Scripting and Profiling
#!/bin/bash
# Helper functions for running and profiling Python scripts under Slurm.
# Expects $gpu_name and $nb_gpus to be set by the calling job script.
export OUTPUT_FOLDER_ARGS=0

function profile_python() {
    if [ $# -lt 1 ]; then
        echo "Usage: profile_python <python_script> [arguments for the script]"
        return 1
    fi
    local script_name
    script_name=$(basename "$1" .py)
    local output_dir="prof_traces/$script_name"
    local report_dir="out_prof/$gpu_name/$nb_gpus/$script_name"
    if [ "$OUTPUT_FOLDER_ARGS" -eq 1 ]; then
        # Build a folder name from the script arguments: spaces become
        # underscores, and '/' and '-' are stripped out.
        local args
        args=$(echo "${@:2}" | tr ' ' '_')
        args=$(echo "$args" | tr -d '/-')
        output_dir="prof_traces/$script_name/$args"
        report_dir="out_prof/$gpu_name/$nb_gpus/$script_name/$args"
    fi
    mkdir -p "$output_dir"
    mkdir -p "$report_dir"
    # Profile every rank with Nsight Systems; one report per Slurm task.
    srun nsys profile -t cuda,nvtx,osrt,mpi -o "$report_dir/report_rank%q{SLURM_PROCID}" python "$@" > "$output_dir/$script_name.out" 2> "$output_dir/$script_name.err" || true
}

function run_python() {
    if [ $# -lt 1 ]; then
        echo "Usage: run_python <python_script> [arguments for the script]"
        return 1
    fi
    local script_name
    script_name=$(basename "$1" .py)
    local output_dir="traces/$script_name"
    if [ "$OUTPUT_FOLDER_ARGS" -eq 1 ]; then
        # Same argument-derived folder naming as in profile_python.
        local args
        args=$(echo "${@:2}" | tr ' ' '_')
        args=$(echo "$args" | tr -d '/-')
        output_dir="traces/$script_name/$args"
    fi
    mkdir -p "$output_dir"
    # Run without profiling, redirecting stdout/stderr per script.
    srun python "$@" > "$output_dir/$script_name.out" 2> "$output_dir/$script_name.err" || true
}
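A minimal usage sketch (not part of the gist itself): assuming the two helpers above are saved as profiling_functions.sh (a hypothetical file name) and that the caller exports the gpu_name and nb_gpus variables that profile_python's report path relies on, a job script could invoke them like this:

source profiling_functions.sh              # hypothetical file name for the helpers above
export gpu_name=a100 nb_gpus=8             # assumed values; profile_python's report_dir needs them
OUTPUT_FOLDER_ARGS=1                       # encode the script's arguments in the output folders
profile_python train.py --batch-size 64    # traces land under prof_traces/train/batchsize_64/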
#!/bin/bash
##########################################
## SELECT EITHER tkc@a100 OR tkc@v100 ##
##########################################
#SBATCH --account glc@a100
##########################################
#SBATCH --job-name=ProfParticle-Mesh # job name
# A partition other than the default one can be used
# by enabling one of the following directives:
##########################################
## SELECT EITHER a100 or v100-32g ##
##########################################
#SBATCH -C a100
##########################################
#******************************************
##########################################
## SELECT number of nodes and GPUs per node
## For A100, ntasks-per-node and gres=gpu should be 8
## For V100, ntasks-per-node and gres=gpu should be 4
##########################################
#SBATCH --nodes=1 # number of nodes
#SBATCH --ntasks-per-node=8 # number of MPI tasks per node (= number of GPUs per node)
#SBATCH --gres=gpu:8 # number of GPUs per node (max 8 with gpu_p2, gpu_p5)
##########################################
## The number of CPUs per task must be adapted to the partition used. Since
## only one GPU is reserved per task here (i.e. 1/4 or 1/8 of the node's GPUs
## depending on the partition), ideally reserve 1/4 or 1/8 of the node's CPUs per task:
##########################################
#SBATCH --cpus-per-task=8 # number of CPUs per task for gpu_p5 (1/8 of the 8-GPU node)
##########################################
# /!\ Caution: in Slurm terminology, "multithread" refers to hyperthreading
#SBATCH --hint=nomultithread # hyperthreading disabled
#SBATCH --time=00:01:00 # maximum requested run time (HH:MM:SS)
#SBATCH --output=%x_%N_a100.out # output file name
#SBATCH --error=%x_%N_a100.out # error file name (here merged with the output)
#SBATCH --qos=qos_gpu-dev
## SBATCH --exclusive # dedicated resources

num_nodes=$SLURM_JOB_NUM_NODES
num_gpu_per_node=$SLURM_NTASKS_PER_NODE
OUTPUT_FOLDER_ARGS=1
# Compute the total number of GPUs
nb_gpus=$(( num_nodes * num_gpu_per_node ))

# Clean up modules loaded interactively and inherited by default
module purge
# The cpuarch/amd module is needed on the "gpu_p5" partition
# to access modules compatible with that partition
if [ "$num_gpu_per_node" -eq 8 ]; then
    module load cpuarch/amd
    source /gpfsdswork/projects/rech/tkc/commun/venv/a100/bin/activate
else
    source /gpfsdswork/projects/rech/tkc/commun/venv/v100/bin/activate
fi
# Load the modules
module load nvidia-compilers/23.9 cuda/12.2.0 cudnn/8.9.7.29-cuda openmpi/4.1.5-cuda nccl/2.18.5-1-cuda cmake
module load nvidia-nsight-systems/2024.1.1.59
echo "The number of nodes allocated for this job is: $num_nodes"
echo "The number of GPUs allocated for this job is: $nb_gpus"