Created
October 27, 2021 01:41
-
-
Save torrance/85f372091cec1f180bffa63fa8063fda to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/****************************/ | |
/* Slide 6: Docker */ | |
/****************************/ | |
Check you are a member of the docker Group | |
id | |
groups | |
Find the container name you want to use: | |
docker ps -a | |
List images and running containers | |
docker image ls | |
docker container ls | |
To exit and leave your container running | |
control p and then control q | |
<Ctrl-P,Ctrl-Q> to detach from current docker | |
To reattach to a running container | |
docker attach <container id> | |
Note: You may have to start the docker container before attaching. | |
docker start <container id> | |
/*****************************/ | |
/* P15: rocBLAS */ | |
/*****************************/ | |
/opt/rocm/rocBLAS/build/release/clients/staging/rocblas-bench -f gemm -r d -m 8640 -n 8640 -k 8640 --transposeB T --initialization trig_float -i 200 --device 0 & | |
/*****************************/ | |
/* P16: rocFFT */ | |
/*****************************/ | |
//cd ~/rocFFT/build/clients/staging | |
~/rocFFT/build/release/clients/staging/rocfft-rider --length $(( 2 ** 24 )) -b 10 | |
/*****************************/ | |
/* P32: RCCL */ | |
/*****************************/ | |
cd ~/rccl-tests/build | |
ls -l | |
~//rccl-tests/build/all_reduce_perf -b 8 -e 128M -f 2 -g 1 | |
~/rccl-tests/build/all_reduce_perf -b 8 -e 128M -f 2 -g 1 | |
/*****************************/ | |
/* P37: MPI */ | |
/*****************************/ | |
https://github.com/openucx/ucx/wiki/Build-and-run-ROCM-UCX-OpenMPI#Build | |
/************************************/ | |
/* P38: MPI Works mlx5 error */ | |
/************************************/ | |
/root/mpi/ompi/bin/mpirun -np 2 -mca btl ^openib -x UCX_RNDV_THRESH=2048 --mca osc ucx --mca spml ucx -x LD_LIBRARY_PATH --allow-run-as-root -mca pml ucx -x UCX_TLS=sm,self,rocm_copy,rocm_ipc,rocm_gdr /root/mpi/osu/mpi/pt2pt/osu_latency -d rocm D D | |
/root/mpi/ompi/bin/mpirun -np 2 -mca btl ^openib -x UCX_RNDV_THRESH=2048 --mca osc ucx --mca spml ucx -x LD_LIBRARY_PATH --allow-run-as-root -mca pml ucx -x UCX_TLS=sm,self,rocm_copy,rocm_ipc,rocm_gdr /root/mpi/osu/mpi/pt2pt/osu_bw -d rocm D D | |
/*********************************/ | |
/* P47: Demo Step 1: Preparation */ | |
/*********************************/ | |
cd ~/HIP_Tutorial/Chapter3/02_Matrix_Transpose | |
cat copy.cpp | |
/*********************************************/ | |
/* P48: Demo Step 2: Compiling and Executing */ | |
/*********************************************/ | |
hipcc copy.cpp -o copy | |
./copy | |
/*************************************************************/ | |
/* P49: Demo Step 3: Listing performance counters in rocprof */ | |
/*************************************************************/ | |
rocprof --list-basic | |
rocprof --list-derived | |
/******************************************/ | |
/* P50: Demo Step 4: Running with rocprof */ | |
/******************************************/ | |
rocprof --stats ./copy | |
/**********************************************************/ | |
/* P51: Demo Step 5: Kernel profiling output */ | |
/**********************************************************/ | |
cat results.csv | |
/***************************************************************/ | |
/* P52: Demo Step 6: Specifying Performance Monitoring Counter */ | |
/***************************************************************/ | |
cat metrics_copy_kernel.txt | |
rocprof -i metrics_copy_kernel.txt -o metrics_copy_kernel.csv ./copy | |
STEP 7 - Examine the descriptions of the PMCs | |
You can use | |
rocprof --list-basic | |
to look up the descriptions of these 2 counters. Essentially, they calculate the sum of global memory reads and writes. | |
/********************************************************/ | |
/* P54: Demo Step 8: Run Command to Collect PMC metrics */ | |
/********************************************************/ | |
rocprof -i metrics_copy_kernel.txt -o metrics_copy_kernel.csv ./copy | |
/********************************************/ | |
/* P55: Demo Step 9: Reading the PMC output */ | |
/********************************************/ | |
cat metrics_copy_kernel.csv | |
/***************************************************************/ | |
/* P56: Demo Step 10: Generating JSON for Google Chrome Tracing */ | |
/***************************************************************/ | |
rocprof --hip-trace ./copy | |
/************************************************************/ | |
/* P57: Demo Step 11: Google Chrome Tracing using JSON file */ | |
/************************************************************/ | |
download: | |
https://raw.githubusercontent.com/paklui/HIP_Tutorial/master/Chapter3/02_Matrix_Transpose/results.json | |
type in Google Chrome: | |
chrome://tracing | |
https://developer.amd.com/resources/rocm-resources/rocm-learning-center/ | |
/************************************************************/ | |
/* P60: BabelStream */ | |
/************************************************************/ | |
cd ~/BabelStream | |
make -f HIP.make | |
./hip-stream -s $((256 * 1024 * 1024)) | |
/************************************************************/ | |
/* P61: rocHPCG */ | |
/************************************************************/ | |
MI50 - 16GB HBM2 Memory | |
~/rocHPCG/build/release/bin/rochpcg 280 280 280 60 | |
MI100 - 32GB HBM2 Memory | |
~/rocHPCG/build/release/bin/rochpcg 560 280 280 60 | |
RESULTS | |
Local domain: 280 x 280 x 280 | |
Global domain: 280 x 280 x 280 | |
Process domain: 1 x 1 x 1 | |
Total Time: 60.87 sec | |
Setup Time: 0.05 sec | |
Optimization Time: 0.21 sec | |
DDOT = 131.2 GFlop/s ( 1049.3 GB/s) 131.2 GFlop/s per process ( 1049.3 GB/s per process) | |
WAXPBY = 61.4 GFlop/s ( 737.4 GB/s) 61.4 GFlop/s per process ( 737.4 GB/s per process) | |
SpMV = 114.2 GFlop/s ( 719.1 GB/s) 114.2 GFlop/s per process ( 719.1 GB/s per process) | |
MG = 161.2 GFlop/s ( 1244.0 GB/s) 161.2 GFlop/s per process ( 1244.0 GB/s per process) | |
Total = 147.7 GFlop/s ( 1120.4 GB/s) 147.7 GFlop/s per process ( 1120.4 GB/s per process) | |
Final = 146.4 GFlop/s ( 1110.0 GB/s) 146.4 GFlop/s per process ( 1110.0 GB/s per process) | |
/************************************************************/ | |
/* P62: rocHPL */ | |
/************************************************************/ | |
cd ~/rocHPL/build/rochpl-install/bin | |
./run_rochpl run_rochpl -P 1 -Q 1 -N 64128 --NB 512 --ppn 1 -v | |
RESULTS | |
Final Score: 4.3277e+03 GFLOPS | |
Residual Check: PASSED | |
Optional HPC Applications | |
/************************************************************/ | |
/* P63: NAMD */ | |
/************************************************************/ | |
<Ctrl-P,Ctrl-Q> to detach from current docker | |
docker run -it --privileged --ipc=host --network=host --device=/dev/kfd $GPU --group-add video --cap-add=SYS_PTRACE --security-opt seccomp=unconfined amddcgpuce/namd-rocm410-ubuntu18 bash | |
Add the NAMD binaries to PATH | |
export PATH=$PATH:/opt/namd/Linux-x86_64-g++.hip | |
cp -r /opt/namd/ . | |
cd ~/namd/NAMD_benchmarks/ | |
./get_all_numbers.sh -m 0 | |
/************************************************************/ | |
/* P64: LAMMPS */ | |
/************************************************************/ | |
HIP-LAMMPS - https://github.com/amddcgpuce/rocmcontainers/tree/main/lammps/rocm410/ubuntu18 | |
docker run -it --privileged --ipc=host --network=host --device=/dev/kfd $GPU --group-add video --cap-add=SYS_PTRACE --security-opt seccomp=unconfined amddcgpuce/lammps-rocm410-ubuntu18 bash | |
cd /opt/lammps_install/lammps | |
cp -r examples/ ~/lammps | |
cd ~/lammps/melt | |
/opt/ompi/bin/mpirun --allow-run-as-root -np 1 lmp -in in.melt -sf gpu -pk gpu 1 | |
cd ~/lammps/KAPPA | |
/opt/ompi/bin/mpirun --allow-run-as-root -np 1 lmp -in in.heat -sf gpu -pk gpu 1 | |
/************************************************************/ | |
/* P65: GROMACS */ | |
/************************************************************/ | |
https://github.com/amddcgpuce/rocmcontainers/tree/main/gromacs/rocm410/ubuntu18 | |
docker run -it --privileged --ipc=host --network=host --device=/dev/kfd $GPU --group-add video --cap-add=SYS_PTRACE --security-opt seccomp=unconfined amddcgpuce/gromacs-rocm410-ubuntu18 bash | |
source /usr/local/gromacs/bin/GMXRC | |
cd /opt/gromacs/benchmark/adh_dodec | |
/usr/local/gromacs/bin/gmx_mpi grompp -f pme_verlet.mdp -c conf.gro -p topol.top -maxwarn 20 | |
mpirun --allow-run-as-root -np 2 gmx_mpi mdrun -nsteps 100000 -resetstep 90000 -ntomp 24 -noconfout -nb gpu -bonded cpu -pme gpu -npme 1 -v -nstlist 400 -gpu_id 0 -s topol.tpr | |
/************************************************************/ | |
/* : OpenMM - From New AMD Infinity Hub */ | |
/************************************************************/ | |
Docker pull command - from AMD Infinity Hub | |
docker pull amdih/openmm:7.4.2 | |
docker run -it --privileged --ipc=host --network=host --device=/dev/kfd $GPU --group-add video --cap-add=SYS_PTRACE --security-opt seccomp=unconfined amdih/openmm:7.4.2 /bin/bash | |
cd /benchmarks | |
./run-benchmarks | |
MPI Notes | |
cd to the MPI directory, e.g. cd /root/mpi | |
export the following: | |
export INSTALL_DIR=$PWD | |
export MY_UCX_DIR=$INSTALL_DIR/ucx | |
export OMPI_DIR=$INSTALL_DIR/ompi | |
export GDR_DIR=$INSTALL_DIR/gdrcopy | |
export LD_LIBRARY_PATH=$GDR_DIR/lib64:$LD_LIBRARY_PATH | |
export MPIRUN=$OMPI_DIR/bin/mpirun | |
ROCm-4.1.0 - https://github.com/amddcgpuce/rocmcontainers/tree/main/rocm/rocm410/ubuntu18 | |
Gromacs-2020.3 - https://github.com/amddcgpuce/rocmcontainers/tree/main/gromacs/rocm410/ubuntu18 | |
NAMD3 - https://github.com/amddcgpuce/rocmcontainers/tree/main/namd/rocm410/ubuntu18 | |
OpenMM-HIP - https://github.com/amddcgpuce/rocmcontainers/tree/main/openmm/rocm410/ubuntu18 | |
HIP-LAMMPS - https://github.com/amddcgpuce/rocmcontainers/tree/main/lammps/rocm410/ubuntu18 | |
Pytorch: - https://github.com/amddcgpuce/rocmcontainers/tree/main/pytorch/rocm410/ubuntu18 | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment