Skip to content

Instantly share code, notes, and snippets.

@ryul99
Last active November 3, 2022 20:42
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ryul99/01c05fe49478241295f980d5c39578de to your computer and use it in GitHub Desktop.
Save ryul99/01c05fe49478241295f980d5c39578de to your computer and use it in GitHub Desktop.
Hydra DDP test
import logging
import os
import hydra
import torch
import datetime
import torch.distributed as dist
import torch.multiprocessing as mp
from omegaconf import OmegaConf
# Module-level logger named after this script's file name (not __name__).
logger = logging.getLogger(os.path.basename(__file__))
def setup(cfg, rank):
    """Initialise the default ``torch.distributed`` process group for one worker.

    Exports the rendezvous address and port from ``cfg.dist`` through the
    ``MASTER_ADDR`` / ``MASTER_PORT`` environment variables and then calls
    ``dist.init_process_group``.

    Args:
        cfg: Config object exposing ``dist.master_addr``, ``dist.master_port``,
            ``dist.mode`` (backend name passed to ``init_process_group``),
            ``dist.gpus`` (used as the world size) and ``dist.timeout``
            (seconds, or ``None`` to use the 1800 s default).
        rank: Rank of the calling process within the group.
    """
    # os.environ only accepts str values. A port supplied as a bare number
    # (for example through a command-line override) would otherwise raise
    # TypeError, so coerce both values explicitly.
    os.environ["MASTER_ADDR"] = str(cfg.dist.master_addr)
    os.environ["MASTER_PORT"] = str(cfg.dist.master_port)

    timeout_sec = 1800
    if cfg.dist.timeout is not None:
        # NOTE(review): presumably required for NCCL collectives to honour
        # the timeout below -- confirm against the PyTorch version in use.
        os.environ["NCCL_BLOCKING_WAIT"] = "1"
        timeout_sec = cfg.dist.timeout
    timeout = datetime.timedelta(seconds=timeout_sec)

    # initialize the process group
    dist.init_process_group(
        cfg.dist.mode,
        rank=rank,
        world_size=cfg.dist.gpus,
        timeout=timeout,
    )
def cleanup():
    """Destroy the default process group if one is currently initialised.

    Calling ``dist.destroy_process_group()`` without an initialised group
    raises, so the call is guarded; this makes ``cleanup`` safe to invoke
    unconditionally (e.g. from a ``finally`` clause or more than once).
    """
    if dist.is_available() and dist.is_initialized():
        dist.destroy_process_group()
def distributed_run(fn, cfg):
    """Launch ``fn`` in ``cfg.dist.gpus`` worker processes and wait for them.

    Args:
        fn: Worker callable; ``mp.spawn`` invokes it as ``fn(rank, cfg)``.
        cfg: Config object forwarded to every worker; ``cfg.dist.gpus``
            gives the number of processes to start.
    """
    worker_args = (cfg,)
    process_count = cfg.dist.gpus
    # join=True blocks until every spawned worker has exited.
    mp.spawn(
        fn,
        args=worker_args,
        nprocs=process_count,
        join=True,
    )
def train_loop(rank, cfg):
    """Worker body: join the process group, emit test log records, leave.

    Args:
        rank: Rank of this worker process, forwarded to ``setup``.
        cfg: Config object forwarded to ``setup``.
    """
    setup(cfg, rank)
    # The try block starts only after setup() has returned, so cleanup() is
    # never run for a group that failed to initialise; finally guarantees the
    # group is torn down even when the body raises.
    try:
        logger.info("Hi! I'm info from train_loop")
        logger.warning("Hi! I'm warning from train_loop")
        logger.error("Hi! I'm error from train_loop")
    finally:
        cleanup()
@hydra.main(config_path="DDP_conf.yaml")
def main(hydra_cfg):
    """Hydra entry point: log at three levels, then start the workers.

    Args:
        hydra_cfg: Config object supplied by the ``hydra.main`` decorator and
            handed on to ``distributed_run``.
    """
    # One record per level from the parent process, emitted in the same
    # order as before: info, warning, error.
    greetings = (
        (logger.info, "Hi! I'm info from main function"),
        (logger.warning, "Hi! I'm warning from main function"),
        (logger.error, "Hi! I'm error from main function"),
    )
    for emit, message in greetings:
        emit(message)

    distributed_run(train_loop, hydra_cfg)
# Run the Hydra-decorated entry point only when executed as a script.
if __name__ == "__main__":
    main()
defaults:
  - hydra/job_logging: colorlog
  - hydra/hydra_logging: colorlog
dist:
  master_addr: 'localhost'
  master_port: '12355'
  mode: 'nccl'
  gpus: 1
  timeout: 30
@omry
Copy link

omry commented Nov 6, 2020

You do not call setup; is that function related to the problem?

@omry
Copy link

omry commented Nov 6, 2020

Try to initialize the logging at the top of your training loop with something like this:

hydra_cfg = HydraConfig.instance().get()
configure_log(hydra_cfg.job_logging, hydra_cfg.verbose)

@ryul99
Copy link
Author

ryul99 commented Nov 7, 2020

I fixed the file. I forgot to add setup and cleanup to train_loop, but I don't think this is related to the problem.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment