Skip to content

Instantly share code, notes, and snippets.

@wyphan
Created June 22, 2021 16:47
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save wyphan/02da258f98ca1a030e1dca2c220dee87 to your computer and use it in GitHub Desktop.
Summit scripts for Nsight Systems and Nsight Compute
#!/bin/bash
#
# LSF batch script (OLCF Summit): run the Exciting-Plus cRPA calculation for
# NiO (PM, GGA) under Nsight Compute kernel profiling, then post-process and
# archive the outputs.
#
#BSUB -P MAT201
#BSUB -W 2:00
#BSUB -nnodes 16
#BSUB -alloc_flags "smt4"
#BSUB -J crpa-nio-pm
# -N takes no argument: it only requests the job report by e-mail.
# The recipient address is given separately with -u.
#BSUB -N
#BSUB -u wphan@vols.utk.edu

# Executable and helper locations
export exe="elk-nv-prof-acc"
export pp="pp_u4-nv"
export exedir="${HOME}/exciting-plus-gpu/bin"
export cudacheckexe="deviceQuery"
export cudacheckdir="${PROJWORK}/mat201/cuda-samples/NVIDIA_CUDA-10.2_Samples/bin/ppc64le/linux/release"

# Output naming and final destination of the compressed archive
export jobtitle="crpa-nio-pm"
export txzfile="SUMMIT-PGI-OpenACCsparse-ncu-crpa-NiO-PM-GGA-nk8-ngsh10-ne20.tar.xz"
export logfile="crpa-acc.log"
# Deliberately a flat, unquoted-at-use string: the tar step below relies on
# word splitting and glob expansion.  Not exported — only this shell reads it.
files="${logfile} env.log modules.log cudaDeviceQuery.log *.in *.OUT q/ *.hdf5 cRPA*.dat *.nsight-cuprof-report"
export destdir="$HOME/exciting+/NiO/PM-GGA-noU"
# Load modules for the NVHPC 21.3 + OpenACC build of Exciting-Plus.
# darshan-runtime is unloaded first — presumably because its I/O
# instrumentation interferes with profiled runs; TODO confirm with OLCF docs.
# NOTE(review): load order kept as-is; some modules (nvlibs, magma) appear
# to depend on nvhpc/cuda being loaded first.
module unload darshan-runtime
module load job-step-viewer
module load nvhpc/21.3
module load cuda/10.2.89
module load netlib-lapack
module load essl
module load fftw
module load hdf5
module load nvlibs/21.3
module load magma
module load nsight-compute
# Record the final module environment; modules.log is archived at job end
module list > modules.log 2>&1
# Prepare the job: move to the submission directory and stage binaries there.
echo "$(date) Job ${LSB_JOBID} launched from $(hostname)"
# Abort early if the submission directory is unavailable — otherwise every
# later copy and the jsrun launch would happen in the wrong directory.
cd "${LS_SUBCWD}" || { echo "ERROR: cannot cd to ${LS_SUBCWD}" >&2; exit 1; }
echo "Workdir is $(pwd)"
cp "${exedir}/${exe}" ./
cp "${exedir}/${pp}" ./
# Snapshot of the runtime environment for the archive
env > env.log

# Sanity-check CUDA on a compute node before the production run
cp "${cudacheckdir}/${cudacheckexe}" ./
jsrun -n 1 -c 1 -a 1 -g 1 "./${cudacheckexe}" > cudaDeviceQuery.log
# Write elk-wrapper.sh: MPI ranks listed in ranks[] run the executable under
# Nsight Compute; all other ranks run it bare.
# Unescaped ${...} expand NOW, in the job shell (baking in the binary name);
# escaped \${...} expand later, when each rank executes the wrapper.
cat << EOF > elk-wrapper.sh
#!/bin/bash
export exe="${exe}"
export pfx="crpa-NiO-PM"
# Ranks to profile
export ranks=( 0 )
# Only profile selected ranks
if [[ " \${ranks[@]} " =~ " \${OMPI_COMM_WORLD_RANK} " ]]; then
nv-nsight-cu-cli \
--target-processes all --print-summary=per-gpu \
--metrics "regex:smsp__sass_thread_inst_executed_op_d" \
--kernel-id ::regex:fillbatch:1 \
-f -o "\${pfx}_\${OMPI_COMM_WORLD_RANK}" ./\${exe}
fi
if [[ ! " \${ranks[@]} " =~ " \${OMPI_COMM_WORLD_RANK} " ]]; then
./\${exe}
fi
EOF
chmod +x ./elk-wrapper.sh
# Launch configuration.
# Make sure this matches the bsub alloc_flags!
# Number of OpenMP threads per physical core
export smtlv=4
# Number of GPU per resource set
export gpures=1
echo "`date` Launching ${exe} with 6 resource sets per node (3 per socket)"
echo "Each resource set contains 1 rank, ${smtlv} threads, ${gpures} GPU"
# -r 6: six resource sets per node; -c 7: seven cores per set; -a 1: one MPI
# rank per set; -E exports env vars into each rank.
# NOTE(review): --smpiargs "-disable_gpu_hooks" presumably keeps Spectrum
# MPI's GPU hooks from clashing with Nsight Compute's CUDA injection —
# confirm against OLCF profiling docs.
jsrun --smpiargs "-disable_gpu_hooks" -r 6 -K 3 -c 7 -a 1 -g ${gpures} \
-E OMP_NUM_THREADS=${smtlv} -E OMP_STACKSIZE=2G \
./elk-wrapper.sh
echo "`date` Done"
# Post-process only if the main run actually produced its HDF5 output.
if [ -e u4_0000.hdf5 ]; then
# Post-process with pp_u4 (serial code)
echo "`date` Post-processing started"
jsrun -r 1 -c 1 -a 1 -g 0 ./${pp}
echo "`date` Post-processing done"
fi
# Compress outputs and send to home folder.
# NOTE(review): ${jobtitle}.${LSB_JOBID} is presumably the LSF job log file
# for this run — confirm it matches the site's output-file naming.
cp ${jobtitle}.${LSB_JOBID} ${logfile}
# -T 0: let xz use all available cores for compression
export XZ_DEFAULTS="-T 0"
# ${files} is deliberately unquoted: it must word-split and glob-expand
# into the individual paths/patterns listed near the top of the script.
tar cJf ${txzfile} ${files}
cp ${txzfile} ${destdir}/
#!/bin/bash
#
# LSF batch script (OLCF Summit): run the Exciting-Plus cRPA calculation for
# La2CuO4 (tetragonal, PM, GGA) under Nsight Systems tracing, then
# post-process and archive the outputs.
#
#BSUB -P MAT201
#BSUB -W 2:00
#BSUB -nnodes 48
#BSUB -alloc_flags "smt4 gpumps"
#BSUB -J crpa-la2cuo4-tetra-pm
# -N takes no argument: it only requests the job report by e-mail.
# The recipient address is given separately with -u.
#BSUB -N
#BSUB -u wphan@vols.utk.edu

# Executable and helper locations
export exe="elk-pgi-prof-acc"
export pp="pp_u4-pgi"
export exedir="${HOME}/exciting-plus-gpu/bin"
export cudacheckexe="deviceQuery"
export cudacheckdir="${PROJWORK}/mat201/cuda-samples/NVIDIA_CUDA-10.2_Samples/bin/ppc64le/linux/release"

# Output naming and final destination of the compressed archive
export jobtitle="crpa-la2cuo4-tetra-pm"
export txzfile="SUMMIT-PGI-OpenACCsparse-nsys-crpa-La2CuO4-tetra-PM-GGA-nk6-gordonbell.tar.xz"
export logfile="crpa-acc.log"
# Deliberately a flat, unquoted-at-use string: the tar step below relies on
# word splitting and glob expansion.  Not exported — only this shell reads it.
files="${logfile} env.log modules.log cudaDeviceQuery.log *.in *.OUT q/ *.hdf5 cRPA*.dat *.qdrep"
export destdir="$HOME/exciting+/La2CuO4-tetra/PM-LSDA-noU"
# Load modules for the PGI 20.4 + OpenACC build of Exciting-Plus.
# NOTE(review): unlike the companion Nsight Compute script, this one does
# not unload darshan-runtime first — confirm that is intentional.
module load job-step-viewer
module load pgi/20.4
module load cuda/10.2.89
module load netlib-lapack
module load essl
module load fftw
module load hdf5
module load pgilibs/20.4
module load magma
module load nsight-systems
# Record the final module environment; modules.log is archived at job end
module list > modules.log 2>&1
# Prepare the job: move to the submission directory and stage binaries there.
echo "$(date) Job ${LSB_JOBID} launched from $(hostname)"
# Abort early if the submission directory is unavailable — otherwise every
# later copy and the jsrun launch would happen in the wrong directory.
cd "${LS_SUBCWD}" || { echo "ERROR: cannot cd to ${LS_SUBCWD}" >&2; exit 1; }
echo "Workdir is $(pwd)"
cp "${exedir}/${exe}" ./
cp "${exedir}/${pp}" ./
# Snapshot of the runtime environment for the archive
env > env.log

# Sanity-check CUDA on a compute node before the production run
cp "${cudacheckdir}/${cudacheckexe}" ./
jsrun -n 1 -c 1 -a 1 -g 1 "./${cudacheckexe}" > cudaDeviceQuery.log
# Write elk-wrapper.sh: MPI ranks listed in ranks[] run the executable under
# Nsight Systems; all other ranks run it bare.
# Unescaped ${...} expand NOW, in the job shell (baking in the binary name
# and the Nsight Systems install path); escaped \${...} expand later, when
# each rank executes the wrapper.  %q{VAR} is expanded by nsys itself.
cat << EOF > elk-wrapper.sh
#!/bin/bash
export exe="${exe}"
export pfx="crpa-La2CuO4-PM"
# Use installed NSight Systems
export NSYSDIR="${OLCF_NSIGHT_SYSTEMS_ROOT}/target-linux-ppc64le"
export nsys=\${NSYSDIR}/nsys
# Ranks to profile
export ranks=( 0 )
# Only profile selected ranks
if [[ " \${ranks[@]} " =~ " \${OMPI_COMM_WORLD_RANK} " ]]; then
\${nsys} profile --stats=true --sample=none -t openacc,nvtx -f true -o "\${pfx}_%q{OMPI_COMM_WORLD_RANK}" ./\${exe}
fi
if [[ ! " \${ranks[@]} " =~ " \${OMPI_COMM_WORLD_RANK} " ]]; then
./\${exe}
fi
EOF
chmod +x ./elk-wrapper.sh
# Launch configuration.
# Make sure this matches the bsub alloc_flags!
# Number of OpenMP threads per physical core
export smtlv=4
# Number of GPU per resource set
export gpures=1
echo "`date` Launching ${exe} with 6 resource sets per node (3 per socket)"
echo "Each resource set contains 1 rank, ${smtlv} threads, ${gpures} GPU"
# Commented-out variant that passed --smpiargs="-gpu" (GPU-aware Spectrum
# MPI); kept for reference.
#jsrun --smpiargs="-gpu" -r 6 -K 3 -c 7 -a 1 -g ${gpures} \
jsrun -r 6 -K 3 -c 7 -a 1 -g ${gpures} \
-E OMP_NUM_THREADS=${smtlv} -E OMP_STACKSIZE=2G \
./elk-wrapper.sh
echo "`date` Done"
# Post-process only if the main run actually produced its HDF5 output.
if [ -e u4_0000.hdf5 ]; then
# Post-process with pp_u4 (serial code)
echo "`date` Post-processing started"
jsrun -r 1 -c 1 -a 1 -g 0 ./${pp}
echo "`date` Post-processing done"
fi
# Compress outputs and send to home folder.
# NOTE(review): ${jobtitle}.${LSB_JOBID} is presumably the LSF job log file
# for this run — confirm it matches the site's output-file naming.
cp ${jobtitle}.${LSB_JOBID} ${logfile}
# -T 0: let xz use all available cores for compression
export XZ_DEFAULTS="-T 0"
# ${files} is deliberately unquoted: it must word-split and glob-expand
# into the individual paths/patterns listed near the top of the script.
tar cJf ${txzfile} ${files}
cp ${txzfile} ${destdir}/
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment