Skip to content

Instantly share code, notes, and snippets.

@vfdev-5
Created July 25, 2023 23:25
Show Gist options
  • Save vfdev-5/6155f9521100ef03a1d16078fcaf9638 to your computer and use it in GitHub Desktop.
Save vfdev-5/6155f9521100ef03a1d16078fcaf9638 to your computer and use it in GitHub Desktop.
ignite circle-ci yml
version: 2.1

# Pipeline parameters; read elsewhere in this config as << pipeline.parameters.NAME >>.
parameters:
  # Image pulled and started by the pull_pytorch_stable_image / run_pytorch_container steps.
  pytorch_stable_image:
    type: string
    # https://hub.docker.com/r/pytorch/pytorch/tags
    default: "pytorch/pytorch:2.0.1-cuda11.7-cudnn8-runtime"

  # Image pulled and started by the *_devel_* steps (used by the Horovod job).
  pytorch_stable_image_devel:
    type: string
    # https://hub.docker.com/r/pytorch/pytorch/tags
    default: "pytorch/pytorch:2.0.1-cuda11.7-cudnn8-devel"

  # Used as working_directory of every job and mounted into the container at /ignite.
  workingdir:
    type: string
    default: "/tmp/ignite"

  # Selects the workflow: `docker_images` when true, `gpu_tests` otherwise.
  should_build_docker_images:
    type: boolean
    default: false

  # Gates the "Push all PyTorch-Ignite Docker images" step.
  should_publish_docker_images:
    type: boolean
    default: false
# -------------------------------------------------------------------------------------
# Environments to run the jobs in
# -------------------------------------------------------------------------------------
# Linux GPU machine settings; merged into a job with `<<: *one_gpu`.
one_gpu: &one_gpu
  machine:
    # https://circleci.com/docs/2.0/configuration-reference/#available-linux-gpu-images
    image: ubuntu-2004-cuda-11.4:202110-01  # CUDA v11.4.2, Docker v20.10.7, nvidia-container-toolkit v1.5.1-1
    docker_layer_caching: true
  # https://circleci.com/product/features/resource-classes/#linux-vm
  resource_class: gpu.nvidia.small
# Windows GPU machine settings; merged into a job with `<<: *one_gpu_windows`.
# Steps of such a job are executed with bash.exe.
one_gpu_windows: &one_gpu_windows
  machine:
    resource_class: windows.gpu.nvidia.medium
    image: windows-server-2019-nvidia:stable
    shell: bash.exe
# Same Linux GPU image as `one_gpu`, with the gpu.nvidia.medium resource class;
# merged into a job with `<<: *two_gpus`.
two_gpus: &two_gpus
  machine:
    # https://circleci.com/docs/2.0/configuration-reference/#available-linux-gpu-images
    image: ubuntu-2004-cuda-11.4:202110-01  # CUDA v11.4.2, Docker v20.10.7, nvidia-container-toolkit v1.5.1-1
    docker_layer_caching: true
  # https://circleci.com/product/features/resource-classes/#linux-vm
  resource_class: gpu.nvidia.medium
# -------------------------------------------------------------------------------------
# Re-usable commands
# -------------------------------------------------------------------------------------
# Step list that swaps the installed NVIDIA packages for nvidia-driver-470,
# installs nvidia-container-runtime and reloads the kernel modules.
# Include it in a job with `- <<: *install_latest_nvidia`.
install_latest_nvidia: &install_latest_nvidia
  - run:
      name: Install latest NVidia-driver and CUDA
      command: |
        # The package glob is quoted so the shell hands it to apt-get untouched
        # (an unquoted nvidia* would expand to matching files in the current directory),
        # and -y keeps purge/autoremove from stopping at the confirmation prompt.
        sudo apt-get purge -y 'nvidia*' && sudo apt-get autoremove -y
        sudo apt-get update && sudo apt-get install -y --no-install-recommends nvidia-driver-470
        # Install nvidia-container-runtime
        sudo apt-get install -y nvidia-container-runtime
        # Reload driver : https://stackoverflow.com/a/45319156/6309199
        # lsof | grep nvidia -> kill Xvfb
        sudo lsof | grep "/usr/bin/Xvfb" | head -1 | awk '{print $2}' | xargs -I {} sudo kill -9 {} || echo "Command 'sudo lsof ...' is failed"
        # lsmod | grep nvidia
        sudo rmmod nvidia_uvm && sudo rmmod nvidia_drm && sudo rmmod nvidia_modeset && sudo rmmod nvidia
        # reload driver
        nvidia-smi
# Step list that pulls the image named by the `pytorch_stable_image` pipeline parameter.
pull_pytorch_stable_image: &pull_pytorch_stable_image
  - run:
      name: Pull PyTorch Stable Image
      command: |
        docker pull << pipeline.parameters.pytorch_stable_image >>
# Step list that pulls the image named by the `pytorch_stable_image_devel` pipeline parameter.
pull_pytorch_stable_devel_image: &pull_pytorch_stable_devel_image
  - run:
      name: Pull PyTorch Stable Develop Image
      command: |
        docker pull << pipeline.parameters.pytorch_stable_image_devel >>
# Step list that starts a detached container named `pthd` from the stable image,
# with the working directory mounted at /ignite, then runs nvidia-smi and ls inside it.
run_pytorch_container: &run_pytorch_container
  - run:
      name: Start Pytorch container
      environment:
        wd: << pipeline.parameters.workingdir >>
      command: |
        docker run --gpus=all --rm -itd --shm-size 16G -v ${wd}:/ignite -w /ignite --name pthd << pipeline.parameters.pytorch_stable_image >>
        docker exec -it pthd nvidia-smi
        docker exec -it pthd ls
# Step list that starts a detached container named `pthd` from the devel image,
# with the working directory mounted at /ignite, then runs nvidia-smi and ls inside it.
run_pytorch_devel_container: &run_pytorch_devel_container
  - run:
      name: Start Pytorch dev container
      environment:
        wd: << pipeline.parameters.workingdir >>
      command: |
        docker run --gpus=all --rm -itd --shm-size 16G -v ${wd}:/ignite -w /ignite --name pthd << pipeline.parameters.pytorch_stable_image_devel >>
        docker exec -it pthd nvidia-smi
        docker exec -it pthd ls
# Step list that installs requirements-dev.txt and then the checked-out package
# (python setup.py install) inside the running `pthd` container.
install_dependencies: &install_dependencies
  - run:
      name: Install dependencies
      command: |
        docker exec -it pthd pip install -r requirements-dev.txt
        # Commented APEX installation, https://github.com/pytorch/ignite/issues/2299
        # export install_apex_cmd='pip install -v --disable-pip-version-check --no-cache-dir git+https://github.com/NVIDIA/apex'
        # export install_git_apex_cmd="apt-get update && apt-get install -y --no-install-recommends git && ${install_apex_cmd}"
        # docker exec -it pthd /bin/bash -c "$install_git_apex_cmd"
        export install_ignite_cmd='python setup.py install'
        docker exec -it pthd /bin/bash -c "$install_ignite_cmd"
# https://github.com/pytorch/ignite/issues/1737
# Step list that installs git in the `pthd` container, clones the MNIST download
# repository to /tmp/mnist and runs its cp.py twice, targeting /tmp/MNIST/raw
# and ./MNIST/raw.
download_mnist: &download_mnist
  - run:
      name: Download MNIST
      command: |
        export install_git_cmd="apt-get update && apt-get install -y --no-install-recommends git"
        docker exec -it pthd /bin/bash -c "$install_git_cmd"
        export tmp_mnist_dir='/tmp/mnist'
        export tests_mnist_dir='/tmp'
        export examples_mnist_dir='.'
        export download_mnist_cmd="git clone https://github.com/pytorch-ignite/download-mnist-github-action.git $tmp_mnist_dir"
        docker exec -it pthd /bin/bash -c "$download_mnist_cmd"
        export get_mnist_cmd_tests="python $tmp_mnist_dir/cp.py $tmp_mnist_dir $tests_mnist_dir/MNIST/raw"
        docker exec -it pthd /bin/bash -c "$get_mnist_cmd_tests"
        export get_mnist_cmd_examples="python $tmp_mnist_dir/cp.py $tmp_mnist_dir $examples_mnist_dir/MNIST/raw"
        docker exec -it pthd /bin/bash -c "$get_mnist_cmd_examples"
# -------------------------------------------------------------------------------------
# Jobs to run
# -------------------------------------------------------------------------------------
jobs:
  # Runs tests/run_gpu_tests.sh and the MNIST example scripts inside the `pthd` container.
  one_gpu_tests:
    <<: *one_gpu

    working_directory: << pipeline.parameters.workingdir >>

    steps:
      - checkout

      - run:
          name: Trigger job if modified
          command: |
            bash .circleci/trigger_if_modified.sh "^(ignite|tests|examples|\.circleci).*"

      - <<: *pull_pytorch_stable_image
      - <<: *run_pytorch_container
      - <<: *install_dependencies
      - <<: *download_mnist

      - run:
          name: Run GPU Unit Tests and Examples
          command: |
            # pytest on cuda
            export test_cmd='bash tests/run_gpu_tests.sh'
            docker exec -it pthd /bin/bash -c "${test_cmd}"
            # MNIST tests
            # 1) mnist.py
            export minst1_cmd='CUDA_VISIBLE_DEVICES=0 python examples/mnist/mnist.py --epochs=1'
            docker exec -it pthd /bin/bash -c "$minst1_cmd"
            # 2) mnist_with_visdom.py
            export visdom_script_cmd='python -c "from visdom.server.build import download_scripts; download_scripts()"'
            export visdom_cmd='python -m visdom.server'
            docker exec -d pthd /bin/bash -c "$visdom_script_cmd && $visdom_cmd"
            export sleep_cmd='sleep 10'
            export mnist2_cmd='python examples/mnist/mnist_with_visdom.py --epochs=1'
            docker exec -it pthd /bin/bash -c "$sleep_cmd && $mnist2_cmd"
            # 3.1) mnist_with_tensorboard.py with tbX
            export mnist3_cmd='CUDA_VISIBLE_DEVICES=0 python examples/mnist/mnist_with_tensorboard.py --epochs=1'
            docker exec -it pthd /bin/bash -c "$mnist3_cmd"
            # uninstall tensorboardX
            export pip_cmd='pip uninstall -y tensorboardX'
            docker exec -it pthd /bin/bash -c "$pip_cmd"
            # 3.2) mnist_with_tensorboard.py with native torch tb
            docker exec -it pthd /bin/bash -c "$mnist3_cmd"
            # 4) mnist_save_resume_engine.py
            # save
            export mnist4_cmd='CUDA_VISIBLE_DEVICES=0 python examples/mnist/mnist_save_resume_engine.py --epochs=2 --crash_iteration 1100'
            docker exec -it pthd /bin/bash -c "$mnist4_cmd"
            # resume
            export mnist4_cmd='CUDA_VISIBLE_DEVICES=0 python examples/mnist/mnist_save_resume_engine.py --epochs=2 --resume_from=/tmp/mnist_save_resume/checkpoint_1.pt'
            docker exec -it pthd /bin/bash -c "$mnist4_cmd"

  # Installs PyTorch with conda directly on the Windows GPU machine (no container)
  # and runs tests/run_gpu_tests.sh with SKIP_DISTRIB_TESTS=1.
  one_gpu_windows_tests:
    <<: *one_gpu_windows

    working_directory: << pipeline.parameters.workingdir >>

    steps:
      - checkout

      - run:
          name: Trigger job if modified
          command: |
            bash .circleci/trigger_if_modified.sh "^(ignite|tests|examples|\.circleci).*"

      # - run:
      #     name: Update CUDA Driver for Windows
      #     command: |
      #       curl -O https://raw.githubusercontent.com/pytorch/pytorch/master/.circleci/scripts/windows_cuda_install.sh
      #       mkdir -p "C:/Program Files (x86)/Microsoft Visual Studio/2019/BuildTools/MSBuild/Microsoft/VC/v160/BuildCustomizations/"
      #       JOB_EXECUTOR="windows-with-nvidia-gpu" CUDA_VERSION="11.3" VC_PRODUCT="BuildTools" VC_YEAR="2019" bash ./windows_cuda_install.sh
      #       bash -c "'/c/Program Files/NVIDIA Corporation/NVSMI/nvidia-smi.exe'"

      - run:
          name: Install dependencies
          command: |
            conda --version
            # We have to use cuda 10.2 on Windows:
            # https://github.com/pytorch/ignite/issues/1843
            conda install -y pytorch==1.9.1 torchvision cudatoolkit=10.2 -c pytorch
            pip install -r requirements-dev.txt
            pip install .
            python -c "import torch; print(torch.__version__, torch.version.cuda, torch.cuda.is_available())"
            python -c "import torch; torch.cuda.is_available()"

      - run:
          # https://github.com/pytorch/ignite/issues/1737
          name: Download MNIST
          command: |
            git clone https://github.com/pytorch-ignite/download-mnist-github-action.git /tmp/mnist
            python /tmp/mnist/cp.py /tmp/mnist /tmp/MNIST/raw

      - run:
          name: Run GPU Unit Tests
          command: |
            # pytest on cuda
            SKIP_DISTRIB_TESTS=1 bash tests/run_gpu_tests.sh

  # Runs `tests/run_gpu_tests.sh 2` inside the `pthd` container.
  two_gpus_tests:
    <<: *two_gpus

    working_directory: << pipeline.parameters.workingdir >>

    steps:
      - checkout

      - run:
          name: Trigger job if modified
          command: |
            bash .circleci/trigger_if_modified.sh "^(ignite|tests|examples|\.circleci).*"

      - <<: *pull_pytorch_stable_image
      - <<: *run_pytorch_container
      - <<: *install_dependencies
      - <<: *download_mnist

      - run:
          name: Run 1 Node 2 GPUs Unit Tests
          command: |
            export test_cmd='bash tests/run_gpu_tests.sh 2'
            docker exec -it pthd /bin/bash -c "${test_cmd}"

  # Runs examples/contrib/cifar10/main.py three ways (no backend, NCCL via torchrun,
  # NCCL via spawn); each variant is stopped at iteration 500 and then resumed
  # from training_checkpoint_400.pt.
  two_gpus_check_dist_cifar10_example:
    <<: *two_gpus

    working_directory: << pipeline.parameters.workingdir >>

    steps:
      - checkout

      - run:
          name: Trigger job if modified
          command: |
            bash .circleci/trigger_if_modified.sh "^(ignite|tests|examples|\.circleci).*"

      - <<: *pull_pytorch_stable_image
      - <<: *run_pytorch_container
      - <<: *install_dependencies

      - run:
          name: "Install additional example dependencies"
          command: |
            docker exec -it pthd pip install fire

      - run:
          name: "Run without backend"
          command: |
            export example_path="examples/contrib/cifar10"
            # initial run
            export stop_cmd="--stop_iteration=500"
            export test_cmd="CI=1 python ${example_path}/main.py run --checkpoint_every=200"
            docker exec -it pthd /bin/bash -c "${test_cmd} ${stop_cmd}"
            # resume
            export resume_opt="--resume-from=/tmp/output-cifar10/resnet18_backend-None-1_stop-on-500/training_checkpoint_400.pt"
            docker exec -it pthd /bin/bash -c "${test_cmd} --num_epochs=7 ${resume_opt}"

      - run:
          name: "Run with NCCL backend using torchrun"
          command: |
            export example_path="examples/contrib/cifar10"
            # initial run
            export stop_cmd="--stop_iteration=500"
            export test_cmd="CI=1 torchrun --nproc_per_node=2 ${example_path}/main.py run --backend=nccl --checkpoint_every=200"
            docker exec -it pthd /bin/bash -c "${test_cmd} ${stop_cmd}"
            # resume
            export resume_opt="--resume-from=/tmp/output-cifar10/resnet18_backend-nccl-2_stop-on-500/training_checkpoint_400.pt"
            docker exec -it pthd /bin/bash -c "${test_cmd} --num_epochs=7 ${resume_opt}"

      - run:
          name: "Run with NCCL backend using spawn"
          command: |
            export example_path="examples/contrib/cifar10"
            # initial run
            export stop_cmd="--stop_iteration=500"
            export test_cmd="CI=1 python -u ${example_path}/main.py run --backend=nccl --nproc_per_node=2 --checkpoint_every=200"
            docker exec -it pthd /bin/bash -c "${test_cmd} ${stop_cmd}"
            # resume
            export resume_opt="--resume-from=/tmp/output-cifar10/resnet18_backend-nccl-2_stop-on-500/training_checkpoint_400.pt"
            docker exec -it pthd /bin/bash -c "${test_cmd} --num_epochs=7 ${resume_opt}"

  # Builds Horovod v0.23.0 with NCCL GPU ops inside the devel container, then runs
  # the `hvd` unit tests and the CIFAR10 example with the horovod backend.
  two_gpus_hvd_tests:
    <<: *two_gpus

    working_directory: << pipeline.parameters.workingdir >>

    steps:
      - checkout

      - run:
          name: Trigger job if modified
          command: |
            bash .circleci/trigger_if_modified.sh "^(ignite|tests|examples|\.circleci).*"

      - <<: *pull_pytorch_stable_devel_image
      - <<: *run_pytorch_devel_container
      - <<: *install_dependencies
      - <<: *download_mnist

      - run:
          name: "Install Horovod with NCCL GPU ops"
          command: |
            # Following https://github.com/horovod/horovod/blob/master/Dockerfile.test.gpu
            # and https://github.com/horovod/horovod/issues/1944#issuecomment-628192778
            docker exec -it pthd /bin/bash -c "apt-get update && apt-get install -y git"
            docker exec -it pthd /bin/bash -c "git clone --recursive https://github.com/horovod/horovod.git -b v0.23.0 /horovod && cd /horovod && python setup.py sdist"
            docker exec -it pthd /bin/bash -c "conda install -y cmake nccl=2.11 -c conda-forge"
            docker exec -it pthd /bin/bash -c 'cd /horovod && HOROVOD_GPU_OPERATIONS=NCCL HOROVOD_NCCL_LINK=SHARED HOROVOD_WITHOUT_MPI=1 HOROVOD_WITH_PYTORCH=1 pip install -v $(ls /horovod/dist/horovod-*.tar.gz) && ldconfig'
            docker exec -it pthd horovodrun --check-build

      - run:
          name: Run 1 Node 2 GPUs Unit Tests
          command: |
            export test_cmd='bash tests/run_gpu_tests.sh 2 hvd'
            docker exec -it pthd /bin/bash -c "${test_cmd}"
            # no CUDA devices Horovod tests
            export test_cmd='CUDA_VISIBLE_DEVICES="" pytest --cov ignite --cov-append --cov-report term-missing --cov-report xml -vvv tests/ -m distributed -k hvd'
            docker exec -it pthd /bin/bash -c "${test_cmd}"

      - run:
          name: "Check CIFAR10 using horovodrun"
          command: |
            docker exec -it pthd pip install fire
            export example_path="examples/contrib/cifar10"
            # initial run
            export stop_cmd="--stop_iteration=500"
            export test_cmd="cd ${example_path} && CI=1 horovodrun -np 2 python -u main.py run --backend=horovod --checkpoint_every=200"
            docker exec -it pthd /bin/bash -c "${test_cmd} ${stop_cmd}"
            # resume
            export resume_opt="--resume-from=/tmp/output-cifar10/resnet18_backend-horovod-2_stop-on-500/training_checkpoint_400.pt"
            docker exec -it pthd /bin/bash -c "${test_cmd} --num_epochs=7 ${resume_opt}"

      - run:
          name: "Check CIFAR10 using spawn"
          command: |
            export example_path="examples/contrib/cifar10"
            # initial run
            export stop_cmd="--stop_iteration=500"
            export test_cmd="cd ${example_path} && CI=1 python -u main.py run --backend=horovod --nproc_per_node=2 --checkpoint_every=200"
            docker exec -it pthd /bin/bash -c "${test_cmd} ${stop_cmd}"
            # resume
            export resume_opt="--resume-from=/tmp/output-cifar10/resnet18_backend-horovod-2_stop-on-500/training_checkpoint_400.pt"
            docker exec -it pthd /bin/bash -c "${test_cmd} --num_epochs=7 ${resume_opt}"

  # Builds the hvd, main and msdp image families with docker/build.sh and pushes
  # them only when the `should_publish_docker_images` parameter is true.
  build_publish_docker_images:
    # https://circleci.com/docs/2.0/building-docker-images/
    docker:
      - image: cimg/python:3.8.8
    # https://circleci.com/docs/2.0/executor-types/#available-docker-resource-classes
    resource_class: 2xlarge

    working_directory: << pipeline.parameters.workingdir >>

    steps:
      - checkout

      - setup_remote_docker:
          version: 19.03.14
          docker_layer_caching: true

      - run:
          name: Install deps
          command: |
            pip --version
            pip install docker

      - run:
          name: Build all Horovod flavoured PyTorch-Ignite images
          command: |
            cd docker
            export PTH_VERSION=`python -c "import configparser; cfg=configparser.ConfigParser(); cfg.read('docker.cfg'); print(cfg.get('DEFAULT', 'build_docker_image_pytorch_version'))"`
            export HVD_VERSION=`python -c "import configparser; cfg=configparser.ConfigParser(); cfg.read('docker.cfg'); print(cfg.get('DEFAULT', 'build_docker_image_hvd_version'))"`
            bash build.sh hvd hvd-base
            bash build.sh hvd hvd-vision
            bash build.sh hvd hvd-nlp
            bash build.sh hvd hvd-apex
            bash build.sh hvd hvd-apex-vision
            bash build.sh hvd hvd-apex-nlp

      - run:
          name: Build all PyTorch-Ignite images
          command: |
            cd docker
            export PTH_VERSION=`python -c "import configparser; cfg=configparser.ConfigParser(); cfg.read('docker.cfg'); print(cfg.get('DEFAULT', 'build_docker_image_pytorch_version'))"`
            bash build.sh main base
            bash build.sh main vision
            bash build.sh main nlp
            bash build.sh main apex
            bash build.sh main apex-vision
            bash build.sh main apex-nlp

      - run:
          name: Build all MS DeepSpeed flavoured PyTorch-Ignite images
          command: |
            cd docker
            export PTH_VERSION=`python -c "import configparser; cfg=configparser.ConfigParser(); cfg.read('docker.cfg'); print(cfg.get('DEFAULT', 'build_docker_image_pytorch_version'))"`
            export MSDP_VERSION=`python -c "import configparser; cfg=configparser.ConfigParser(); cfg.read('docker.cfg'); print(cfg.get('DEFAULT', 'build_docker_image_msdp_version'))"`
            bash build.sh msdp msdp-apex
            bash build.sh msdp msdp-apex-vision
            bash build.sh msdp msdp-apex-nlp

      - run:
          name: List built images
          command: docker images | grep pytorchignite

      - when:
          condition: << pipeline.parameters.should_publish_docker_images >>
          steps:
            - run:
                name: Push all PyTorch-Ignite Docker images
                command: |
                  cd docker
                  sh ./push_all.sh
# -------------------------------------------------------------------------------------
# Workflows
# -------------------------------------------------------------------------------------
# Exactly one of the two workflows runs per pipeline, chosen by the
# `should_build_docker_images` parameter.
workflows:
  version: 2

  # Runs unless Docker images are being built.
  gpu_tests:
    unless: << pipeline.parameters.should_build_docker_images >>
    jobs:
      - one_gpu_tests
      # Disabled windows tests as NVidia driver is too old
      # > c:\tools\miniconda3\lib\site-packages\torch\cuda\__init__.py:52: UserWarning: CUDA initialization: The NVIDIA driver on your system is too old (found version 10010). Please update your GPU driver by downloading and installing a new version from the URL: http://www.nvidia.com/Download/index.aspx Alternatively, go to: https://pytorch.org to install a PyTorch version that has been compiled with your version of the CUDA driver. (Triggered internally at ..\c10\cuda\CUDAFunctions.cpp:115.)
      # > return torch._C._cuda_getDeviceCount() > 0
      # - one_gpu_windows_tests
      # Can not run tests on 2 GPUs on Circle-CI
      # Now, they are running on GHA self-hosted
      # - two_gpus_tests
      # - two_gpus_check_dist_cifar10_example
      # - two_gpus_hvd_tests

  # Runs only when Docker images are being built.
  docker_images:
    when: << pipeline.parameters.should_build_docker_images >>
    jobs:
      - build_publish_docker_images
#!/bin/bash
# Halts the current CircleCI job (circleci-agent step halt) unless at least one file
# listed by `git diff --name-only origin/<base-branch>..HEAD` matches <pattern>.
# When CIRCLE_BRANCH is "master" the check is skipped and the job continues.
#
# Script is taken from https://circleci.com/developer/orbs/orb/roopakv/swissknife#commands-run_if_modified
#
# Usage: bash trigger_if_modified.sh <pattern> [base-branch]
#   - for example: bash trigger_if_modified.sh "^(ignite|tests|examples|\.circleci).*"
#   <pattern>      extended regular expression, matched case-insensitively (grep -i -E)
#   [base-branch]  branch to diff against; defaults to master
#
# Exit status: 1 on a missing pattern, a non-bash shell, or missing git / circleci-agent.

if [ -z "$1" ]; then
    # The script refuses to run without bash (see the $BASH check below),
    # so the usage text advertises `bash` rather than `sh`.
    echo "Pattern should be provided. Usage: bash trigger_if_modified.sh <pattern>"
    exit 1
fi

pattern=$1
# An unset or empty second argument falls back to master.
base_branch=${2:-master}

echo "- Pattern: ${pattern}"
echo "- Base branch: ${base_branch}"

if [ -z "$BASH" ]; then
    echo Bash not installed.
    exit 1
fi

git status >/dev/null 2>&1 || { echo >&2 "Not in a git directory or no git"; exit 1; }
# Look the agent up with `command -v` instead of executing it just to test for its presence.
command -v circleci-agent >/dev/null 2>&1 || { echo >&2 "No Circle CI agent. These are in all Circle CI containers"; exit 1; }

if [ "$CIRCLE_BRANCH" = "master" ]; then
    echo "Skip checking modified files if on master"
    exit 0
fi

# Both expansions are quoted: an unquoted pattern such as "^(a|b).*" would be subject to
# word splitting and pathname expansion before grep ever sees it. `--` stops a pattern
# that starts with "-" from being parsed as a grep option.
# grep exits non-zero when nothing matches; `|| true` keeps that from counting as a failure.
FILES_MODIFIED=$(git diff --name-only "origin/${base_branch}..HEAD" | grep -i -E -- "${pattern}") || true

if [ -z "$FILES_MODIFIED" ]; then
    echo "Files not modified. Halting job"
    circleci-agent step halt
else
    echo "Files modified: ${FILES_MODIFIED}, continuing steps"
fi
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment