Skip to content

Instantly share code, notes, and snippets.

@christophernhill
Last active January 28, 2020 15:34
Show Gist options
  • Save christophernhill/bff35521a2fa0c499578c98751be1b3c to your computer and use it in GitHub Desktop.
Save christophernhill/bff35521a2fa0c499578c98751be1b3c to your computer and use it in GitHub Desktop.
Satori IAP 2020 energy profiling code fragments from Florin
#BSUB -L /bin/bash
##
## Begin LSF directives (change only the number of processes/GPUs and the job name as needed)
## - "-n 4" for single AC922, "-n 8" for 2x AC922s, "-n 16" for 4x AC922s etc
##
#BSUB -J "energy-ai"
#BSUB -o "energy-ai_o.%J"
#BSUB -e "energy-ai_e.%J"
#BSUB -n 16
#BSUB -R "span[ptile=4]"
#BSUB -gpu "num=4"
#BSUB -q "normal"
##BSUB -x
#
# Load the performance tools module
#
module load perftools
#
# Set up the user environment (Python, WMLCE conda virtual environment, etc.)
#
HOME2="/nobackup/users/florin"
CONDA_ROOT="${HOME2}/anaconda3"
PYTHON_VIRTUAL_ENVIRONMENT="wmlce-1.6.2"
# Read conda's shell setup script, then activate the chosen environment.
. "${CONDA_ROOT}/etc/profile.d/conda.sh"
conda activate "${PYTHON_VIRTUAL_ENVIRONMENT}"
EGO_TOP="/opt/ibm/spectrumcomputing"
export EGO_TOP
# Write setup.sh. When run, the process whose OMPI_COMM_WORLD_LOCAL_RANK is 0
# starts ./read_inst_power_cons.sh in the background, records its PID under
# /tmp/<job id>, sets up SMT and the GPUs and deletes any existing scratch
# directory.
# The delimiter is quoted, so the body is copied literally and every ${...}
# in it is expanded when setup.sh runs, not while it is being written.
cat > setup.sh << 'EoF_s'
#! /bin/sh
##
if [ ${OMPI_COMM_WORLD_LOCAL_RANK} -eq 0 ]
then
PID_RIPC=${PID_RIPC:-0}
if [ ${PID_RIPC} -eq 0 ]; then
./read_inst_power_cons.sh > energy-consumption.out.${LSB_JOBID}_${OMPI_COMM_WORLD_RANK} 2>&1 &
PID_RIPC=$!
mkdir -p /tmp/${LSB_JOBID}
touch /tmp/${LSB_JOBID}/${PID_RIPC}
fi
sudo satori-ppc64_cpu --smt=2 # Set the SMT mode to 2
sudo satori-ppc64_cpu --smt # Verify the SMT mode
sudo satori-nvidia-smi -rac # For POWER9+V100
sudo satori-nvidia-smi --compute-mode=DEFAULT # Set the compute mode to DEFAULT
/bin/rm -rf /tmp/data.${USER} # Delete the scratch directory
fi
EoF_s
chmod +x setup.sh
#
# Write launch.sh, a wrapper invoked as `./launch.sh <command> [args...]`.
# It sets CUDA_VISIBLE_DEVICES to 0,1,2,3 and then replaces itself with the
# given command.
#
# The delimiter is quoted so the body is copied literally. "$@" hands every
# argument to the command as its own word; the unquoted $* it replaces
# re-split arguments containing whitespace and expanded glob characters.
#
cat > launch.sh << 'EoF_s'
#! /bin/sh
export CUDA_VISIBLE_DEVICES=0,1,2,3
exec "$@"
EoF_s
chmod +x launch.sh
# Run setup.sh through mpirun
mpirun --tag-output ./setup.sh
#
# Run the training/inference job
# (change only the script name and the options after the python command)
#
ddlrun -v ./launch.sh python \
  "${HOME2}/hpms/tf_cnn_benchmarks/tf_cnn_benchmarks.py" \
  --model resnet50 \
  --batch_size 128 \
  --variable_update=horovod \
  --num_batches=1000 \
  --use_fp16
# Write reset.sh. When run, the process whose OMPI_COMM_WORLD_LOCAL_RANK is 0
# stops the power logger recorded by setup.sh and resets SMT and the GPUs.
# The delimiter is quoted, so the body is copied literally and every ${...}
# in it (including LSB_JOBID) is expanded when reset.sh runs.
cat > reset.sh << 'EoF_r'
#! /bin/sh
module load perftools

# Send TERM to every power logger recorded under /tmp/${LSB_JOBID} (setup.sh
# leaves one empty file per logger, named after its PID), then delete that
# directory.
stop_power_monitor() {
  # With an empty job id the path would collapse to /tmp/ itself: do nothing.
  [ -n "${LSB_JOBID}" ] || return 0
  for pid_file in "/tmp/${LSB_JOBID}"/*; do
    # An unmatched glob stays literal, meaning nothing was recorded.
    [ -e "${pid_file}" ] || continue
    PID_RIPC=${pid_file##*/}
    case "${PID_RIPC}" in
      '' | *[!0-9]*) continue ;; # not a PID, never pass it to kill
    esac
    if [ "${PID_RIPC}" -ne 0 ]; then
      kill -TERM "${PID_RIPC}"
    fi
  done
  PID_RIPC=0
  /bin/rm -rf "/tmp/${LSB_JOBID}"
}

if [ "${OMPI_COMM_WORLD_LOCAL_RANK}" -eq 0 ]
then
  stop_power_monitor
  sudo satori-ppc64_cpu --smt=on # SMT mode back to default
  sudo satori-nvidia-smi -rac # Clocks back to default
  sudo satori-nvidia-smi --compute-mode=EXCLUSIVE_PROCESS # Compute mode back to p10 default
fi
EoF_r
chmod +x reset.sh
# Run reset.sh through mpirun
mpirun --tag-output ./reset.sh
# Clean up the generated helper scripts
for helper_script in setup.sh reset.sh launch.sh; do
  /bin/rm -f "${helper_script}"
done
#
# EoF
#
#! /bin/sh
# Wrapper invoked as `./launch.sh <command> [args...]`: sets
# CUDA_VISIBLE_DEVICES to 0,1,2,3, then replaces this shell with the command.
export CUDA_VISIBLE_DEVICES=0,1,2,3
# "$@" hands every argument to the command as its own word; the unquoted $*
# it replaces re-split arguments containing whitespace and expanded globs.
exec "$@"
# Run setup.sh through mpirun
mpirun --tag-output ./setup.sh
# Run the benchmark through the launch.sh wrapper
ddlrun -v ./launch.sh python \
  "${HOME2}/hpms/tf_cnn_benchmarks/tf_cnn_benchmarks.py" \
  --model resnet50 \
  --batch_size 128 \
  --variable_update=horovod \
  --num_batches=1000 \
  --use_fp16
# Run reset.sh through mpirun
mpirun --tag-output ./reset.sh
#! /bin/sh
# Log the instantaneous power reading every 3 seconds, one line per sample:
#   <seconds since the epoch>: <matching line of the ipmitool output>
# Loops until the process is killed.
module load perftools
while :
do
  sample_time=$(date -u +%s)
  # Capture the reading first so each sample is written as one complete line,
  # even when the command fails or prints no matching line (the old
  # `echo -n` prefix then left successive timestamps on the same line).
  power_line=$(sudo satori-ipmitool dcmi power reading | grep "Instantaneous power reading:")
  # printf rather than `echo -n`, whose option handling differs between sh
  # implementations.
  printf '%s: %s\n' "${sample_time}" "${power_line}"
  sleep 3
done
#! /bin/sh
# On the process whose OMPI_COMM_WORLD_LOCAL_RANK is 0: stop the power logger
# recorded under /tmp/${LSB_JOBID} and reset SMT and the GPUs.
module load perftools

# Send TERM to every power logger recorded under /tmp/${LSB_JOBID} (one empty
# file per logger, named after its PID), then delete that directory.
# The previous `ls` + unquoted `[ ... -ne 0 ]` form failed with a test error,
# and skipped the kill, whenever the directory was missing, empty or held more
# than one entry.
stop_power_monitor() {
  # With an empty job id the path would collapse to /tmp/ itself: do nothing.
  [ -n "${LSB_JOBID}" ] || return 0
  for pid_file in "/tmp/${LSB_JOBID}"/*; do
    # An unmatched glob stays literal, meaning nothing was recorded.
    [ -e "${pid_file}" ] || continue
    PID_RIPC=${pid_file##*/}
    case "${PID_RIPC}" in
      '' | *[!0-9]*) continue ;; # not a PID, never pass it to kill
    esac
    if [ "${PID_RIPC}" -ne 0 ]; then
      kill -TERM "${PID_RIPC}"
    fi
  done
  PID_RIPC=0
  /bin/rm -rf "/tmp/${LSB_JOBID}"
}

if [ "${OMPI_COMM_WORLD_LOCAL_RANK}" -eq 0 ]
then
  stop_power_monitor
  sudo satori-ppc64_cpu --smt=on # SMT mode back to default
  sudo satori-nvidia-smi -rac # Clocks back to default
  sudo satori-nvidia-smi --compute-mode=EXCLUSIVE_PROCESS # Compute mode back to p10 default
fi
#! /bin/sh
##
# Only the process whose OMPI_COMM_WORLD_LOCAL_RANK is 0 does anything here.
if test ${OMPI_COMM_WORLD_LOCAL_RANK} -eq 0; then
  # Start the power logger unless PID_RIPC already holds a non-zero value.
  : "${PID_RIPC:=0}"
  if test ${PID_RIPC} -eq 0; then
    ./read_inst_power_cons.sh > energy-consumption.out.${LSB_JOBID}_${OMPI_COMM_WORLD_RANK} 2>&1 &
    PID_RIPC=$!
    # Record the logger's PID as an empty file named after it.
    mkdir -p /tmp/${LSB_JOBID}
    touch /tmp/${LSB_JOBID}/${PID_RIPC}
  fi

  # Set the SMT mode to 2, then verify it
  sudo satori-ppc64_cpu --smt=2
  sudo satori-ppc64_cpu --smt

  # For POWER9+V100
  sudo satori-nvidia-smi -rac
  # Set the compute mode to DEFAULT
  sudo satori-nvidia-smi --compute-mode=DEFAULT

  # Delete the scratch directory
  /bin/rm -rf /tmp/data.${USER}
fi
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment