-
-
Save vfdev-5/6155f9521100ef03a1d16078fcaf9638 to your computer and use it in GitHub Desktop.
ignite circle-ci yml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
version: 2.1

# Pipeline parameters; the rest of this file reads them as
# << pipeline.parameters.NAME >>.
parameters:
  # Image pulled and started as the `pthd` container by the GPU test jobs.
  # Tag list: https://hub.docker.com/r/pytorch/pytorch/tags
  pytorch_stable_image:
    type: string
    default: "pytorch/pytorch:2.0.1-cuda11.7-cudnn8-runtime"

  # "devel" flavour of the same release, used by the Horovod job.
  # Tag list: https://hub.docker.com/r/pytorch/pytorch/tags
  pytorch_stable_image_devel:
    type: string
    default: "pytorch/pytorch:2.0.1-cuda11.7-cudnn8-devel"

  # working_directory of every job; also bind-mounted to /ignite in the container.
  workingdir:
    type: string
    default: "/tmp/ignite"

  # Workflow switch: the gpu_tests workflow runs unless this is true,
  # the docker_images workflow runs when it is true.
  should_build_docker_images:
    type: boolean
    default: false

  # Gates the "Push all PyTorch-Ignite Docker images" step of the build job.
  should_publish_docker_images:
    type: boolean
    default: false
# ------------------------------------------------------------------------------------- | |
# Environments to run the jobs in | |
# ------------------------------------------------------------------------------------- | |
# Executor settings shared by the single-GPU Linux job; merged with `<<: *one_gpu`.
# NOTE(review): indentation was lost in this copy. `resource_class` is placed at
# job level, next to `machine`, as the CircleCI schema expects - confirm.
one_gpu: &one_gpu
  machine:
    # Image list: https://circleci.com/docs/2.0/configuration-reference/#available-linux-gpu-images
    # This tag provides CUDA v11.4.2, Docker v20.10.7, nvidia-container-toolkit v1.5.1-1
    image: ubuntu-2004-cuda-11.4:202110-01
    docker_layer_caching: true
  # Resource classes: https://circleci.com/product/features/resource-classes/#linux-vm
  resource_class: gpu.nvidia.small
# Executor settings for the Windows GPU job; merged with `<<: *one_gpu_windows`.
# NOTE(review): indentation was lost in this copy. `resource_class` and `image`
# must sit under `machine` given their order; `shell` is assumed to as well - confirm.
one_gpu_windows: &one_gpu_windows
  machine:
    resource_class: windows.gpu.nvidia.medium
    image: windows-server-2019-nvidia:stable
    shell: bash.exe
# Executor settings shared by the two-GPU Linux jobs; merged with `<<: *two_gpus`.
# NOTE(review): indentation was lost in this copy. `resource_class` is placed at
# job level, next to `machine`, as the CircleCI schema expects - confirm.
two_gpus: &two_gpus
  machine:
    # Image list: https://circleci.com/docs/2.0/configuration-reference/#available-linux-gpu-images
    # This tag provides CUDA v11.4.2, Docker v20.10.7, nvidia-container-toolkit v1.5.1-1
    image: ubuntu-2004-cuda-11.4:202110-01
    docker_layer_caching: true
  # Resource classes: https://circleci.com/product/features/resource-classes/#linux-vm
  resource_class: gpu.nvidia.medium
# ------------------------------------------------------------------------------------- | |
# Re-usable commands | |
# ------------------------------------------------------------------------------------- | |
# Reusable step: removes the installed NVIDIA packages, installs
# nvidia-driver-470 plus nvidia-container-runtime, then unloads the old kernel
# modules so that `nvidia-smi` loads the new driver.
# No job visible in this file references this anchor.
install_latest_nvidia: &install_latest_nvidia
  - run:
      name: Install latest NVidia-driver and CUDA
      command: |
        # -y: the step runs non-interactively, so apt must not wait for confirmation.
        # The package pattern is quoted so the shell cannot expand it against
        # files in the working directory before apt-get sees it.
        sudo apt-get purge -y 'nvidia*' && sudo apt-get autoremove -y
        sudo apt-get update && sudo apt-get install -y --no-install-recommends nvidia-driver-470
        # Install nvidia-container-runtime
        sudo apt-get install -y nvidia-container-runtime
        # Reload driver : https://stackoverflow.com/a/45319156/6309199
        # lsof | grep nvidia -> kill Xvfb
        # Best effort: a failure here is reported but does not fail the step.
        sudo lsof | grep "/usr/bin/Xvfb" | head -1 | awk '{print $2}' | xargs -I {} sudo kill -9 {} || echo "Command 'sudo lsof ...' is failed"
        # lsmod | grep nvidia
        sudo rmmod nvidia_uvm && sudo rmmod nvidia_drm && sudo rmmod nvidia_modeset && sudo rmmod nvidia
        # reload driver
        nvidia-smi
# Reusable step: pulls the image named by the `pytorch_stable_image` pipeline
# parameter. Used in a job's steps as `- <<: *pull_pytorch_stable_image`.
pull_pytorch_stable_image: &pull_pytorch_stable_image
  - run:
      name: Pull PyTorch Stable Image
      command: |
        docker pull << pipeline.parameters.pytorch_stable_image >>
# Reusable step: pulls the image named by the `pytorch_stable_image_devel`
# pipeline parameter. Used as `- <<: *pull_pytorch_stable_devel_image`.
pull_pytorch_stable_devel_image: &pull_pytorch_stable_devel_image
  - run:
      name: Pull PyTorch Stable Develop Image
      command: |
        docker pull << pipeline.parameters.pytorch_stable_image_devel >>
# Reusable step: starts the runtime image as a detached container named `pthd`
# with all GPUs, 16G of shared memory, and the job's working directory mounted
# at /ignite (also the container's working directory). Later steps reach the
# container through `docker exec ... pthd`.
run_pytorch_container: &run_pytorch_container
  - run:
      name: Start Pytorch container
      environment:
        wd: << pipeline.parameters.workingdir >>
      command: |
        # The bind-mount argument is quoted so a working directory containing
        # whitespace is still passed to docker as a single -v value.
        docker run --gpus=all --rm -itd --shm-size 16G -v "${wd}:/ignite" -w /ignite --name pthd << pipeline.parameters.pytorch_stable_image >>
        # Smoke checks: the GPU is visible and the checkout is mounted.
        docker exec -it pthd nvidia-smi
        docker exec -it pthd ls
# Reusable step: starts the devel image as a detached container named `pthd`
# with all GPUs, 16G of shared memory, and the job's working directory mounted
# at /ignite (also the container's working directory). Later steps reach the
# container through `docker exec ... pthd`.
run_pytorch_devel_container: &run_pytorch_devel_container
  - run:
      name: Start Pytorch dev container
      environment:
        wd: << pipeline.parameters.workingdir >>
      command: |
        # The bind-mount argument is quoted so a working directory containing
        # whitespace is still passed to docker as a single -v value.
        docker run --gpus=all --rm -itd --shm-size 16G -v "${wd}:/ignite" -w /ignite --name pthd << pipeline.parameters.pytorch_stable_image_devel >>
        # Smoke checks: the GPU is visible and the checkout is mounted.
        docker exec -it pthd nvidia-smi
        docker exec -it pthd ls
# Reusable step: inside the `pthd` container, installs the development
# requirements and then installs ignite itself from the mounted checkout.
install_dependencies: &install_dependencies
  - run:
      name: Install dependencies
      command: |
        docker exec -it pthd pip install -r requirements-dev.txt
        # Commented APEX installation, https://github.com/pytorch/ignite/issues/2299
        # export install_apex_cmd='pip install -v --disable-pip-version-check --no-cache-dir git+https://github.com/NVIDIA/apex'
        # export install_git_apex_cmd="apt-get update && apt-get install -y --no-install-recommends git && ${install_apex_cmd}"
        # docker exec -it pthd /bin/bash -c "$install_git_apex_cmd"
        # Install ignite from the sources mounted at the container's working directory.
        docker exec -it pthd /bin/bash -c 'python setup.py install'
# Reusable step: clones the MNIST mirror repository inside the `pthd` container
# and runs its cp.py once for the tests (/tmp/MNIST/raw) and once for the
# examples (./MNIST/raw).
# Background: https://github.com/pytorch/ignite/issues/1737
download_mnist: &download_mnist
  - run:
      name: Download MNIST
      command: |
        # Run one command line through bash inside the container.
        in_container() {
          docker exec -it pthd /bin/bash -c "$1"
        }
        mnist_src='/tmp/mnist'

        # git is needed for the clone below.
        in_container "apt-get update && apt-get install -y --no-install-recommends git"
        in_container "git clone https://github.com/pytorch-ignite/download-mnist-github-action.git ${mnist_src}"

        # First destination is used by the tests, second by the examples.
        for dest_root in '/tmp' '.'; do
          in_container "python ${mnist_src}/cp.py ${mnist_src} ${dest_root}/MNIST/raw"
        done
# ------------------------------------------------------------------------------------- | |
# Jobs to run | |
# ------------------------------------------------------------------------------------- | |
# Job definitions.
# Every test job checks out the repository into the `workingdir` parameter and
# first runs .circleci/trigger_if_modified.sh, which halts the job when no
# changed file matches the given pattern. The Linux GPU jobs then run their
# commands inside the `pthd` container through `docker exec`.
jobs:
  # Unit tests and MNIST examples on a single GPU.
  one_gpu_tests:
    <<: *one_gpu
    working_directory: << pipeline.parameters.workingdir >>
    steps:
      - checkout
      - run:
          name: Trigger job if modified
          command: |
            bash .circleci/trigger_if_modified.sh "^(ignite|tests|examples|\.circleci).*"
      - <<: *pull_pytorch_stable_image
      - <<: *run_pytorch_container
      - <<: *install_dependencies
      - <<: *download_mnist
      - run:
          name: Run GPU Unit Tests and Examples
          command: |
            # pytest on cuda
            export test_cmd='bash tests/run_gpu_tests.sh'
            docker exec -it pthd /bin/bash -c "${test_cmd}"
            # MNIST tests
            # 1) mnist.py
            export mnist1_cmd='CUDA_VISIBLE_DEVICES=0 python examples/mnist/mnist.py --epochs=1'
            docker exec -it pthd /bin/bash -c "$mnist1_cmd"
            # 2) mnist_with_visdom.py
            export visdom_script_cmd='python -c "from visdom.server.build import download_scripts; download_scripts()"'
            export visdom_cmd='python -m visdom.server'
            # -d: the visdom server keeps running in the background of the container
            docker exec -d pthd /bin/bash -c "$visdom_script_cmd && $visdom_cmd"
            # give the server time to start before the example connects
            export sleep_cmd='sleep 10'
            export mnist2_cmd='python examples/mnist/mnist_with_visdom.py --epochs=1'
            docker exec -it pthd /bin/bash -c "$sleep_cmd && $mnist2_cmd"
            # 3.1) mnist_with_tensorboard.py with tbX
            export mnist3_cmd='CUDA_VISIBLE_DEVICES=0 python examples/mnist/mnist_with_tensorboard.py --epochs=1'
            docker exec -it pthd /bin/bash -c "$mnist3_cmd"
            # uninstall tensorboardX
            export pip_cmd='pip uninstall -y tensorboardX'
            docker exec -it pthd /bin/bash -c "$pip_cmd"
            # 3.2) mnist_with_tensorboard.py with native torch tb
            docker exec -it pthd /bin/bash -c "$mnist3_cmd"
            # 4) mnist_save_resume_engine.py
            # save
            export mnist4_cmd='CUDA_VISIBLE_DEVICES=0 python examples/mnist/mnist_save_resume_engine.py --epochs=2 --crash_iteration 1100'
            docker exec -it pthd /bin/bash -c "$mnist4_cmd"
            # resume
            export mnist4_cmd='CUDA_VISIBLE_DEVICES=0 python examples/mnist/mnist_save_resume_engine.py --epochs=2 --resume_from=/tmp/mnist_save_resume/checkpoint_1.pt'
            docker exec -it pthd /bin/bash -c "$mnist4_cmd"

  # Unit tests on a Windows GPU machine; runs directly on the host, no container.
  one_gpu_windows_tests:
    <<: *one_gpu_windows
    working_directory: << pipeline.parameters.workingdir >>
    steps:
      - checkout
      - run:
          name: Trigger job if modified
          command: |
            bash .circleci/trigger_if_modified.sh "^(ignite|tests|examples|\.circleci).*"
      # - run:
      #     name: Update CUDA Driver for Windows
      #     command: |
      #       curl -O https://raw.githubusercontent.com/pytorch/pytorch/master/.circleci/scripts/windows_cuda_install.sh
      #       mkdir -p "C:/Program Files (x86)/Microsoft Visual Studio/2019/BuildTools/MSBuild/Microsoft/VC/v160/BuildCustomizations/"
      #       JOB_EXECUTOR="windows-with-nvidia-gpu" CUDA_VERSION="11.3" VC_PRODUCT="BuildTools" VC_YEAR="2019" bash ./windows_cuda_install.sh
      #       bash -c "'/c/Program Files/NVIDIA Corporation/NVSMI/nvidia-smi.exe'"
      - run:
          name: Install dependencies
          command: |
            conda --version
            # We have to use cuda 10.2 on Windows:
            # https://github.com/pytorch/ignite/issues/1843
            conda install -y pytorch==1.9.1 torchvision cudatoolkit=10.2 -c pytorch
            pip install -r requirements-dev.txt
            pip install .
            python -c "import torch; print(torch.__version__, torch.version.cuda, torch.cuda.is_available())"
            # Fail the step when CUDA is unusable; a bare is_available() call
            # discards its result and can never fail.
            python -c "import torch; assert torch.cuda.is_available()"
      - run:
          # https://github.com/pytorch/ignite/issues/1737
          name: Download MNIST
          command: |
            git clone https://github.com/pytorch-ignite/download-mnist-github-action.git /tmp/mnist
            python /tmp/mnist/cp.py /tmp/mnist /tmp/MNIST/raw
      - run:
          name: Run GPU Unit Tests
          command: |
            # pytest on cuda
            SKIP_DISTRIB_TESTS=1 bash tests/run_gpu_tests.sh

  # Unit tests on one node with two GPUs.
  two_gpus_tests:
    <<: *two_gpus
    working_directory: << pipeline.parameters.workingdir >>
    steps:
      - checkout
      - run:
          name: Trigger job if modified
          command: |
            bash .circleci/trigger_if_modified.sh "^(ignite|tests|examples|\.circleci).*"
      - <<: *pull_pytorch_stable_image
      - <<: *run_pytorch_container
      - <<: *install_dependencies
      - <<: *download_mnist
      - run:
          name: Run 1 Node 2 GPUs Unit Tests
          command: |
            export test_cmd='bash tests/run_gpu_tests.sh 2'
            docker exec -it pthd /bin/bash -c "${test_cmd}"

  # CIFAR10 example on two GPUs: each variant runs to iteration 500, then
  # resumes from the checkpoint written at iteration 400.
  two_gpus_check_dist_cifar10_example:
    <<: *two_gpus
    working_directory: << pipeline.parameters.workingdir >>
    steps:
      - checkout
      - run:
          name: Trigger job if modified
          command: |
            bash .circleci/trigger_if_modified.sh "^(ignite|tests|examples|\.circleci).*"
      - <<: *pull_pytorch_stable_image
      - <<: *run_pytorch_container
      - <<: *install_dependencies
      - run:
          name: "Install additional example dependencies"
          command: |
            docker exec -it pthd pip install fire
      - run:
          name: "Run without backend"
          command: |
            export example_path="examples/contrib/cifar10"
            # initial run
            export stop_cmd="--stop_iteration=500"
            export test_cmd="CI=1 python ${example_path}/main.py run --checkpoint_every=200"
            docker exec -it pthd /bin/bash -c "${test_cmd} ${stop_cmd}"
            # resume
            export resume_opt="--resume-from=/tmp/output-cifar10/resnet18_backend-None-1_stop-on-500/training_checkpoint_400.pt"
            docker exec -it pthd /bin/bash -c "${test_cmd} --num_epochs=7 ${resume_opt}"
      - run:
          name: "Run with NCCL backend using torchrun"
          command: |
            export example_path="examples/contrib/cifar10"
            # initial run
            export stop_cmd="--stop_iteration=500"
            export test_cmd="CI=1 torchrun --nproc_per_node=2 ${example_path}/main.py run --backend=nccl --checkpoint_every=200"
            docker exec -it pthd /bin/bash -c "${test_cmd} ${stop_cmd}"
            # resume
            export resume_opt="--resume-from=/tmp/output-cifar10/resnet18_backend-nccl-2_stop-on-500/training_checkpoint_400.pt"
            docker exec -it pthd /bin/bash -c "${test_cmd} --num_epochs=7 ${resume_opt}"
      - run:
          name: "Run with NCCL backend using spawn"
          command: |
            export example_path="examples/contrib/cifar10"
            # initial run
            export stop_cmd="--stop_iteration=500"
            export test_cmd="CI=1 python -u ${example_path}/main.py run --backend=nccl --nproc_per_node=2 --checkpoint_every=200"
            docker exec -it pthd /bin/bash -c "${test_cmd} ${stop_cmd}"
            # resume
            export resume_opt="--resume-from=/tmp/output-cifar10/resnet18_backend-nccl-2_stop-on-500/training_checkpoint_400.pt"
            docker exec -it pthd /bin/bash -c "${test_cmd} --num_epochs=7 ${resume_opt}"

  # Horovod tests on two GPUs; uses the devel image and builds Horovod v0.23.0
  # from source inside the container.
  two_gpus_hvd_tests:
    <<: *two_gpus
    working_directory: << pipeline.parameters.workingdir >>
    steps:
      - checkout
      - run:
          name: Trigger job if modified
          command: |
            bash .circleci/trigger_if_modified.sh "^(ignite|tests|examples|\.circleci).*"
      - <<: *pull_pytorch_stable_devel_image
      - <<: *run_pytorch_devel_container
      - <<: *install_dependencies
      - <<: *download_mnist
      - run:
          name: "Install Horovod with NCCL GPU ops"
          command: |
            # Following https://github.com/horovod/horovod/blob/master/Dockerfile.test.gpu
            # and https://github.com/horovod/horovod/issues/1944#issuecomment-628192778
            docker exec -it pthd /bin/bash -c "apt-get update && apt-get install -y git"
            docker exec -it pthd /bin/bash -c "git clone --recursive https://github.com/horovod/horovod.git -b v0.23.0 /horovod && cd /horovod && python setup.py sdist"
            docker exec -it pthd /bin/bash -c "conda install -y cmake nccl=2.11 -c conda-forge"
            # single quotes: $(ls ...) must be expanded inside the container, not on the host
            docker exec -it pthd /bin/bash -c 'cd /horovod && HOROVOD_GPU_OPERATIONS=NCCL HOROVOD_NCCL_LINK=SHARED HOROVOD_WITHOUT_MPI=1 HOROVOD_WITH_PYTORCH=1 pip install -v $(ls /horovod/dist/horovod-*.tar.gz) && ldconfig'
            docker exec -it pthd horovodrun --check-build
      - run:
          name: Run 1 Node 2 GPUs Unit Tests
          command: |
            export test_cmd='bash tests/run_gpu_tests.sh 2 hvd'
            docker exec -it pthd /bin/bash -c "${test_cmd}"
            # no CUDA devices Horovod tests
            export test_cmd='CUDA_VISIBLE_DEVICES="" pytest --cov ignite --cov-append --cov-report term-missing --cov-report xml -vvv tests/ -m distributed -k hvd'
            docker exec -it pthd /bin/bash -c "${test_cmd}"
      - run:
          name: "Check CIFAR10 using horovodrun"
          command: |
            docker exec -it pthd pip install fire
            export example_path="examples/contrib/cifar10"
            # initial run
            export stop_cmd="--stop_iteration=500"
            export test_cmd="cd ${example_path} && CI=1 horovodrun -np 2 python -u main.py run --backend=horovod --checkpoint_every=200"
            docker exec -it pthd /bin/bash -c "${test_cmd} ${stop_cmd}"
            # resume
            export resume_opt="--resume-from=/tmp/output-cifar10/resnet18_backend-horovod-2_stop-on-500/training_checkpoint_400.pt"
            docker exec -it pthd /bin/bash -c "${test_cmd} --num_epochs=7 ${resume_opt}"
      - run:
          name: "Check CIFAR10 using spawn"
          command: |
            export example_path="examples/contrib/cifar10"
            # initial run
            export stop_cmd="--stop_iteration=500"
            export test_cmd="cd ${example_path} && CI=1 python -u main.py run --backend=horovod --nproc_per_node=2 --checkpoint_every=200"
            docker exec -it pthd /bin/bash -c "${test_cmd} ${stop_cmd}"
            # resume
            export resume_opt="--resume-from=/tmp/output-cifar10/resnet18_backend-horovod-2_stop-on-500/training_checkpoint_400.pt"
            docker exec -it pthd /bin/bash -c "${test_cmd} --num_epochs=7 ${resume_opt}"

  # Builds every PyTorch-Ignite Docker image flavour with docker/build.sh and,
  # when should_publish_docker_images is true, pushes them with docker/push_all.sh.
  build_publish_docker_images:
    # https://circleci.com/docs/2.0/building-docker-images/
    docker:
      - image: cimg/python:3.8.8
    # https://circleci.com/docs/2.0/executor-types/#available-docker-resource-classes
    resource_class: 2xlarge
    working_directory: << pipeline.parameters.workingdir >>
    steps:
      - checkout
      # NOTE(review): pinned remote Docker version - confirm CircleCI still offers it.
      - setup_remote_docker:
          version: 19.03.14
          docker_layer_caching: true
      - run:
          name: Install deps
          command: |
            pip --version
            pip install docker
      - run:
          name: Build all Horovod flavoured PyTorch-Ignite images
          command: |
            cd docker
            # Versions are read from docker.cfg. Assignment and export are kept
            # separate so a failing lookup is not masked by `export`'s own exit
            # status (with an errexit shell the step then stops instead of
            # building with an empty version).
            PTH_VERSION=$(python -c "import configparser; cfg=configparser.ConfigParser(); cfg.read('docker.cfg'); print(cfg.get('DEFAULT', 'build_docker_image_pytorch_version'))")
            HVD_VERSION=$(python -c "import configparser; cfg=configparser.ConfigParser(); cfg.read('docker.cfg'); print(cfg.get('DEFAULT', 'build_docker_image_hvd_version'))")
            export PTH_VERSION HVD_VERSION
            bash build.sh hvd hvd-base
            bash build.sh hvd hvd-vision
            bash build.sh hvd hvd-nlp
            bash build.sh hvd hvd-apex
            bash build.sh hvd hvd-apex-vision
            bash build.sh hvd hvd-apex-nlp
      - run:
          name: Build all PyTorch-Ignite images
          command: |
            cd docker
            # See the Horovod step for why assignment and export are separate.
            PTH_VERSION=$(python -c "import configparser; cfg=configparser.ConfigParser(); cfg.read('docker.cfg'); print(cfg.get('DEFAULT', 'build_docker_image_pytorch_version'))")
            export PTH_VERSION
            bash build.sh main base
            bash build.sh main vision
            bash build.sh main nlp
            bash build.sh main apex
            bash build.sh main apex-vision
            bash build.sh main apex-nlp
      - run:
          name: Build all MS DeepSpeed flavoured PyTorch-Ignite images
          command: |
            cd docker
            # See the Horovod step for why assignment and export are separate.
            PTH_VERSION=$(python -c "import configparser; cfg=configparser.ConfigParser(); cfg.read('docker.cfg'); print(cfg.get('DEFAULT', 'build_docker_image_pytorch_version'))")
            MSDP_VERSION=$(python -c "import configparser; cfg=configparser.ConfigParser(); cfg.read('docker.cfg'); print(cfg.get('DEFAULT', 'build_docker_image_msdp_version'))")
            export PTH_VERSION MSDP_VERSION
            bash build.sh msdp msdp-apex
            bash build.sh msdp msdp-apex-vision
            bash build.sh msdp msdp-apex-nlp
      - run:
          name: List built images
          command: docker images | grep pytorchignite
      - when:
          condition: << pipeline.parameters.should_publish_docker_images >>
          steps:
            - run:
                name: Push all PyTorch-Ignite Docker images
                command: |
                  cd docker
                  sh ./push_all.sh
# ------------------------------------------------------------------------------------- | |
# Workflows | |
# ------------------------------------------------------------------------------------- | |
# Two mutually exclusive workflows, selected by the
# should_build_docker_images pipeline parameter.
workflows:
  version: 2

  # Runs unless should_build_docker_images is true.
  gpu_tests:
    unless: << pipeline.parameters.should_build_docker_images >>
    jobs:
      - one_gpu_tests
      # Disabled windows tests as NVidia driver is too old
      # > c:\tools\miniconda3\lib\site-packages\torch\cuda\__init__.py:52: UserWarning: CUDA initialization: The NVIDIA driver on your system is too old (found version 10010). Please update your GPU driver by downloading and installing a new version from the URL: http://www.nvidia.com/Download/index.aspx Alternatively, go to: https://pytorch.org to install a PyTorch version that has been compiled with your version of the CUDA driver. (Triggered internally at ..\c10\cuda\CUDAFunctions.cpp:115.)
      # > return torch._C._cuda_getDeviceCount() > 0
      # - one_gpu_windows_tests
      # Can not run tests on 2 GPUs on Circle-CI
      # Now, they are running on GHA self-hosted
      # - two_gpus_tests
      # - two_gpus_check_dist_cifar10_example
      # - two_gpus_hvd_tests

  # Runs only when should_build_docker_images is true.
  docker_images:
    when: << pipeline.parameters.should_build_docker_images >>
    jobs:
      - build_publish_docker_images
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
# Script is taken from https://circleci.com/developer/orbs/orb/roopakv/swissknife#commands-run_if_modified
#
# Halts the current CircleCI job (circleci-agent step halt) when no file that
# differs between origin/<base-branch> and HEAD matches the given pattern.
#
# Usage: bash trigger_if_modified.sh <pattern> [base-branch]
#  - for example: bash trigger_if_modified.sh "^(ignite|tests|examples|\.circleci).*"
#
# Arguments:
#   $1 - extended regular expression, matched case-insensitively (grep -i -E)
#        against the changed file paths
#   $2 - base branch on origin to diff against (default: master)
# Environment:
#   CIRCLE_BRANCH - when equal to "master" the check is skipped
# Exit status:
#   1 when the pattern is missing, or bash, git, or circleci-agent is unavailable;
#   0 when the check is skipped or modified files were found.

if [ -z "$1" ]; then
  echo >&2 "Pattern should be provided. Usage: bash trigger_if_modified.sh <pattern> [base-branch]"
  exit 1
fi

pattern=$1
base_branch=${2:-master}

echo "- Pattern: ${pattern}"
echo "- Base branch: ${base_branch}"

# Only POSIX syntax is used up to this point so that another interpreter still
# reaches this check and reports the problem instead of a syntax error.
if [ -z "$BASH" ]; then
  echo >&2 "Bash not installed."
  exit 1
fi

git status >/dev/null 2>&1 || { echo >&2 "Not in a git directory or no git"; exit 1; }
# Check for presence only; do not execute the agent just to see if it exists.
command -v circleci-agent >/dev/null 2>&1 || { echo >&2 "No Circle CI agent. These are in all Circle CI containers"; exit 1; }

if [[ "${CIRCLE_BRANCH}" == "master" ]]; then
  echo "Skip checking modified files if on master"
  exit 0
fi

FILES_MODIFIED=""

# Stores in FILES_MODIFIED the changed paths that match the pattern.
# Returns grep's status: non-zero when nothing matches.
setcommit() {
  # Both expansions are quoted: the pattern contains glob characters (* and
  # parentheses) that must reach grep untouched. `--` keeps a pattern that
  # starts with '-' from being parsed as an option.
  FILES_MODIFIED=$(git diff --name-only "origin/${base_branch}..HEAD" | grep -i -E -- "${pattern}")
}

# "No match" is an expected outcome handled below, not an error.
setcommit || true

if [[ -z "${FILES_MODIFIED}" ]]; then
  echo "Files not modified. Halting job"
  circleci-agent step halt
else
  echo "Files modified: ${FILES_MODIFIED}, continuing steps"
fi
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment