jayelm/auto_gpu_example.py

## auto_gpu_example.py
"""
Auto GPU example with Hydra. To run, first

pip install hydra-core hydra-joblib-launcher --upgrade

Then run as follows

```
python auto_gpu_example.py -m n=1,2,3
```

where `n` is the parameter you want to sweep over (any arbitrary Hydra sweep is
supported).

This will launch 3 jobs in parallel, with a staggered start defined by
`sweep_sleep_time` in the config (30s in the accompanying config.yaml): job #k
sleeps for k * `sweep_sleep_time` seconds. Each job then waits until a GPU is
free, i.e. has less than `threshold_vram_usage` MiB of memory in use, and
assigns that GPU to the CUDA_VISIBLE_DEVICES environment variable.

The purpose of `sweep_sleep_time` is to give time for earlier jobs to occupy
the GPUs they selected. If all jobs started instantaneously, they would all
pick the same GPU before actually starting to utilize the GPU.

NOTE: If you are using wandb, you may need to set WANDB_START_METHOD=thread, as
discussed here: https://github.com/wandb/client/issues/1525

FIXME: Unfortunately, it's still possible to have multiple jobs waiting for a
free GPU, and for them to select the same GPU at the same time. How to solve
this is TBD...
"""

import logging
import os
import subprocess
import time

import hydra
import omegaconf
import torch
from hydra.core.hydra_config import HydraConfig

# Module-level logger shared by the helpers below; named after this module.
logger = logging.getLogger(__name__)


def assign_free_gpus(threshold_vram_usage=500, max_gpus=2, wait=False, sleep_time=10):
    """
    Assign free GPUs to the current process via the CUDA_VISIBLE_DEVICES env variable.

    This function should be called after all imports, in case
    CUDA_VISIBLE_DEVICES is being set elsewhere.

    Borrowed and fixed from https://gist.github.com/afspies/7e211b83ca5a8902849b05ded9a10696

    Args:
        threshold_vram_usage (int, optional): A GPU is considered free if its used
                                              memory is below this threshold.
                                              Defaults to 500 (MiB).
        max_gpus (int, optional): Maximum number of GPUs to assign.
                                  Defaults to 2.
        wait (bool, optional): Whether to keep polling until a GPU is free. Default False.
        sleep_time (int, optional): Seconds to sleep between polls, if wait=True. Default 10.

    Raises:
        RuntimeError: If no GPU is free and wait is False.
        subprocess.CalledProcessError: If nvidia-smi exits with a non-zero status.
        FileNotFoundError: If the nvidia-smi executable cannot be found.
    """

    def _check():
        # Query the machine-readable CSV interface instead of scraping the
        # human-readable report: one "<index>, <used MiB>" line per GPU.
        # Passing an argument list also avoids spawning a shell.
        smi_query_result = subprocess.check_output(
            [
                "nvidia-smi",
                "--query-gpu=index,memory.used",
                "--format=csv,noheader,nounits",
            ]
        )
        free_gpus = []
        for line in smi_query_result.decode("utf-8").splitlines():
            fields = [field.strip() for field in line.split(",")]
            if len(fields) != 2:
                continue  # Blank or unexpected line
            index, used = fields
            try:
                used_mib = int(used)
            except ValueError:
                # Usage was not reported as a number, so the GPU cannot be
                # shown to be free; leave it alone.
                continue
            # Keep gpus under threshold only
            if used_mib < threshold_vram_usage:
                free_gpus.append(index)
        return ",".join(free_gpus[:max_gpus])

    while True:
        gpus_to_use = _check()
        if gpus_to_use or not wait:
            break
        logger.info("No free GPUs found, retrying in %ds", sleep_time)
        time.sleep(sleep_time)

    if not gpus_to_use:
        raise RuntimeError("No free GPUs found")
    # NOTE(review): these are nvidia-smi indices; CUDA's own device ordering may
    # differ unless CUDA_DEVICE_ORDER=PCI_BUS_ID is set -- confirm on machines
    # with mixed GPU models.
    os.environ["CUDA_VISIBLE_DEVICES"] = gpus_to_use
    logger.info("Using GPU(s): %s", gpus_to_use)


def setup_gpus(cfg):
    """
    Resolve ``cfg.device`` when it is set to "auto"; otherwise do nothing.

    Inside a Hydra sweep, job number k first sleeps for
    ``k * cfg.sweep_sleep_time`` seconds so that earlier jobs have time to
    occupy the GPUs they picked. Then one free GPU is claimed through
    ``assign_free_gpus`` (blocking until one is available) and ``cfg.device``
    is rewritten to "cuda:0".

    Args:
        cfg: The run configuration; reads ``device`` and ``sweep_sleep_time``
             and overwrites ``device`` in place.
    """
    if cfg.device != "auto":
        return

    hydra_settings = HydraConfig.get()
    try:
        sweep_index = hydra_settings.job.num
        stagger_seconds = sweep_index * cfg.sweep_sleep_time
        logger.info(
            "Detected that I am job #%s in a Hydra sweep. Sleeping for %d seconds to let other jobs occupy GPUs.",
            sweep_index,
            stagger_seconds,
        )
        time.sleep(stagger_seconds)
    except omegaconf.errors.MissingMandatoryValue:
        # job.num is unset outside of a sweep, so there is nothing to stagger
        logger.info("Not part of a hydra sweep. Searching for a free GPU")

    assign_free_gpus(wait=True, max_gpus=1, threshold_vram_usage=500)
    # Exactly one GPU was requested above, hence device index 0
    cfg.device = "cuda:0"


@hydra.main(config_path=".", config_name="config")
def main(cfg):
    """
    Demo job: pick a device, hold a tensor on it for a minute, then report.

    Args:
        cfg: Hydra config; reads ``n`` and ``device`` (``device`` may be
             rewritten by ``setup_gpus``).
    """
    setup_gpus(cfg)
    # Keep a reference so the allocation stays alive on the device during the sleep
    _ = torch.zeros((4000, 4000)).to(cfg.device)
    time.sleep(60)
    # CUDA_VISIBLE_DEVICES is only guaranteed to be set on the "auto" path;
    # fall back to the configured device instead of raising KeyError.
    device_label = os.environ.get("CUDA_VISIBLE_DEVICES", cfg.device)
    print(f"Finished run {cfg.n} on device {device_label}")


# Script entry point: hand control to the Hydra-decorated main().
if __name__ == "__main__":
    main()

## config.yaml
defaults:
  - override /hydra/launcher: joblib


# Dummy sweep parameter
n: 1

device: auto  # either a torch device string (e.g. "cuda:0") or "auto", which will by default find 1 free GPU
sweep_sleep_time: 30  # start stagger in seconds: job #k of a sweep sleeps k * sweep_sleep_time before looking for a GPU

## stdout
[2022-04-09 18:28:18,133][HYDRA] Joblib.Parallel(n_jobs=-1,backend=loky,prefer=processes,require=None,verbose=0,time
out=None,pre_dispatch=2*n_jobs,batch_size=auto,temp_folder=None,max_nbytes=None,mmap_mode=r) is launching 3 jobs
[2022-04-09 18:28:18,148][HYDRA] Launching jobs, sweep output dir : multirun/2022-04-09/18-28-16
[2022-04-09 18:28:18,150][HYDRA]        #0 : n=1
[2022-04-09 18:28:18,152][HYDRA]        #1 : n=2
[2022-04-09 18:28:18,154][HYDRA]        #2 : n=3
[2022-04-09 18:28:22,669][__main__][INFO] - Detected that I am job #2 in a Hydra sweep. Sleeping for 60 seconds to l
et other jobs occupy GPUs.
[2022-04-09 18:28:22,756][__main__][INFO] - Detected that I am job #0 in a Hydra sweep. Sleeping for 0 seconds to le
t other jobs occupy GPUs.
[2022-04-09 18:28:22,795][__main__][INFO] - Detected that I am job #1 in a Hydra sweep. Sleeping for 30 seconds to l
et other jobs occupy GPUs.
[2022-04-09 18:28:22,931][__main__][INFO] - Using GPU(s): 4
[2022-04-09 18:28:52,956][__main__][INFO] - Using GPU(s): 5
[2022-04-09 18:29:22,905][__main__][INFO] - Using GPU(s): 6
Finished run 1 on device 4
Finished run 2 on device 5
Finished run 3 on device 6
	"""
	Auto GPU example with Hydra. To run, first

	pip install hydra-core hydra-joblib-launcher --upgrade

	Then run as follows

	```
	python auto_gpu_example.py -m n=1,2,3
	```

	where `n` is the parameter you want to sweep over (any arbitrary Hydra sweep is
	supported).

	This will launch 3 jobs in parallel, with a staggered start defined by
	`sweep_sleep_time` in the config (30s in the accompanying config.yaml): job #k
	sleeps for k * `sweep_sleep_time` seconds. Each job then waits until a GPU is
	free, i.e. has less than `threshold_vram_usage` MiB of memory in use, and
	assigns that GPU to the CUDA_VISIBLE_DEVICES environment variable.

	The purpose of `sweep_sleep_time` is to give time for earlier jobs to occupy
	the GPUs they selected. If all jobs started instantaneously, they would all
	pick the same GPU before actually starting to utilize the GPU.

	NOTE: If you are using wandb, you may need to set WANDB_START_METHOD=thread, as
	discussed here: https://github.com/wandb/client/issues/1525

	FIXME: Unfortunately, it's still possible to have multiple jobs waiting for a
	free GPU, and for them to select the same GPU at the same time. How to solve
	this is TBD...
	"""

	import logging
	import os
	import subprocess
	import time

	import hydra
	import omegaconf
	import torch
	from hydra.core.hydra_config import HydraConfig

	# Module-level logger shared by the helpers below; named after this module.
	logger = logging.getLogger(__name__)


	def assign_free_gpus(threshold_vram_usage=500, max_gpus=2, wait=False, sleep_time=10):
	    """
	    Assign free GPUs to the current process via the CUDA_VISIBLE_DEVICES env variable.

	    This function should be called after all imports, in case
	    CUDA_VISIBLE_DEVICES is being set elsewhere.

	    Borrowed and fixed from https://gist.github.com/afspies/7e211b83ca5a8902849b05ded9a10696

	    Args:
	        threshold_vram_usage (int, optional): A GPU is considered free if its used
	                                              memory is below this threshold.
	                                              Defaults to 500 (MiB).
	        max_gpus (int, optional): Maximum number of GPUs to assign.
	                                  Defaults to 2.
	        wait (bool, optional): Whether to keep polling until a GPU is free. Default False.
	        sleep_time (int, optional): Seconds to sleep between polls, if wait=True. Default 10.

	    Raises:
	        RuntimeError: If no GPU is free and wait is False.
	        subprocess.CalledProcessError: If nvidia-smi exits with a non-zero status.
	        FileNotFoundError: If the nvidia-smi executable cannot be found.
	    """

	    def _check():
	        # Query the machine-readable CSV interface instead of scraping the
	        # human-readable report: one "<index>, <used MiB>" line per GPU.
	        # Passing an argument list also avoids spawning a shell.
	        smi_query_result = subprocess.check_output(
	            [
	                "nvidia-smi",
	                "--query-gpu=index,memory.used",
	                "--format=csv,noheader,nounits",
	            ]
	        )
	        free_gpus = []
	        for line in smi_query_result.decode("utf-8").splitlines():
	            fields = [field.strip() for field in line.split(",")]
	            if len(fields) != 2:
	                continue  # Blank or unexpected line
	            index, used = fields
	            try:
	                used_mib = int(used)
	            except ValueError:
	                # Usage was not reported as a number, so the GPU cannot be
	                # shown to be free; leave it alone.
	                continue
	            # Keep gpus under threshold only
	            if used_mib < threshold_vram_usage:
	                free_gpus.append(index)
	        return ",".join(free_gpus[:max_gpus])

	    while True:
	        gpus_to_use = _check()
	        if gpus_to_use or not wait:
	            break
	        logger.info("No free GPUs found, retrying in %ds", sleep_time)
	        time.sleep(sleep_time)

	    if not gpus_to_use:
	        raise RuntimeError("No free GPUs found")
	    # NOTE(review): these are nvidia-smi indices; CUDA's own device ordering may
	    # differ unless CUDA_DEVICE_ORDER=PCI_BUS_ID is set -- confirm on machines
	    # with mixed GPU models.
	    os.environ["CUDA_VISIBLE_DEVICES"] = gpus_to_use
	    logger.info("Using GPU(s): %s", gpus_to_use)


	def setup_gpus(cfg):
	    """
	    Resolve ``cfg.device`` when it is set to "auto"; otherwise do nothing.

	    Inside a Hydra sweep, job number k first sleeps for
	    ``k * cfg.sweep_sleep_time`` seconds so that earlier jobs have time to
	    occupy the GPUs they picked. Then one free GPU is claimed through
	    ``assign_free_gpus`` (blocking until one is available) and ``cfg.device``
	    is rewritten to "cuda:0".

	    Args:
	        cfg: The run configuration; reads ``device`` and ``sweep_sleep_time``
	             and overwrites ``device`` in place.
	    """
	    if cfg.device != "auto":
	        return

	    hydra_cfg = HydraConfig.get()
	    try:
	        job_num = hydra_cfg.job.num
	        total_sleep_time = job_num * cfg.sweep_sleep_time
	        logger.info(
	            "Detected that I am job #%s in a Hydra sweep. Sleeping for %d seconds to let other jobs occupy GPUs.",
	            job_num,
	            total_sleep_time,
	        )
	        time.sleep(total_sleep_time)
	    except omegaconf.errors.MissingMandatoryValue:
	        # Not run within a hydra sweep, ignore
	        logger.info("Not part of a hydra sweep. Searching for a free GPU")

	    assign_free_gpus(
	        threshold_vram_usage=500,
	        max_gpus=1,
	        wait=True,
	    )
	    # Exactly one GPU was requested above, hence device index 0
	    cfg.device = "cuda:0"


	@hydra.main(config_path=".", config_name="config")
	def main(cfg):
	    """
	    Demo job: pick a device, hold a tensor on it for a minute, then report.

	    Args:
	        cfg: Hydra config; reads ``n`` and ``device`` (``device`` may be
	             rewritten by ``setup_gpus``).
	    """
	    setup_gpus(cfg)
	    # Keep a reference so the allocation stays alive on the device during the sleep
	    _ = torch.zeros((4000, 4000)).to(cfg.device)
	    time.sleep(60)
	    # CUDA_VISIBLE_DEVICES is only guaranteed to be set on the "auto" path;
	    # fall back to the configured device instead of raising KeyError.
	    device_label = os.environ.get("CUDA_VISIBLE_DEVICES", cfg.device)
	    print(f"Finished run {cfg.n} on device {device_label}")


	# Script entry point: hand control to the Hydra-decorated main().
	if __name__ == "__main__":
	    main()
	defaults:
	- override /hydra/launcher: joblib


	# Dummy sweep parameter
	n: 1

	device: auto # either a torch device string (e.g. "cuda:0") or "auto", which will by default find 1 free GPU
	sweep_sleep_time: 30 # start stagger in seconds: job #k of a sweep sleeps k * sweep_sleep_time before looking for a GPU
	[2022-04-09 18:28:18,133][HYDRA] Joblib.Parallel(n_jobs=-1,backend=loky,prefer=processes,require=None,verbose=0,time
	out=None,pre_dispatch=2*n_jobs,batch_size=auto,temp_folder=None,max_nbytes=None,mmap_mode=r) is launching 3 jobs
	[2022-04-09 18:28:18,148][HYDRA] Launching jobs, sweep output dir : multirun/2022-04-09/18-28-16
	[2022-04-09 18:28:18,150][HYDRA] #0 : n=1
	[2022-04-09 18:28:18,152][HYDRA] #1 : n=2
	[2022-04-09 18:28:18,154][HYDRA] #2 : n=3
	[2022-04-09 18:28:22,669][__main__][INFO] - Detected that I am job #2 in a Hydra sweep. Sleeping for 60 seconds to l
	et other jobs occupy GPUs.
	[2022-04-09 18:28:22,756][__main__][INFO] - Detected that I am job #0 in a Hydra sweep. Sleeping for 0 seconds to le
	t other jobs occupy GPUs.
	[2022-04-09 18:28:22,795][__main__][INFO] - Detected that I am job #1 in a Hydra sweep. Sleeping for 30 seconds to l
	et other jobs occupy GPUs.
	[2022-04-09 18:28:22,931][__main__][INFO] - Using GPU(s): 4
	[2022-04-09 18:28:52,956][__main__][INFO] - Using GPU(s): 5
	[2022-04-09 18:29:22,905][__main__][INFO] - Using GPU(s): 6
	Finished run 1 on device 4
	Finished run 2 on device 5
	Finished run 3 on device 6