"""When using multiple GPUs, getting weird error regarding logger. It creates a DummyLogger for every GPU except
the main one, which messes with the code. This script is a minimal reproducible example for this bug"""
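# Environment for this repro: wandb==0.12.7 (taken from the "Analyze Outputs" section at the
# bottom of this file). The pytorch_lightning version is my inference from the `strategy=` and
# `enable_model_summary=` Trainer arguments, which appeared around PL 1.5.x.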
import os
import warnings

import torch
import wandb
from torch.utils.data import DataLoader, Dataset

from pytorch_lightning import LightningModule, Trainer
from pytorch_lightning import loggers as pl_loggers
from pytorch_lightning.loggers.base import DummyLogger  # the class Lightning substitutes on rank != 0
from pytorch_lightning.plugins import DDPPlugin
from pytorch_lightning.utilities import rank_zero_only

class RandomDataset(Dataset):
    def __init__(self, size, length):
        self.len = length
        self.data = torch.randn(length, size)

    def __getitem__(self, index):
        return self.data[index]

    def __len__(self):
        return self.len

class BoringModel(LightningModule):
    def __init__(self):
        super().__init__()
        self.layer = torch.nn.Linear(32, 2)

    def forward(self, x):
        return self.layer(x)

    def training_step(self, batch, batch_idx):
        loss = self(batch).sum()
        self.log("train_loss", loss)
        return {"loss": loss}

    def validation_step(self, batch, batch_idx):
        loss = self(batch).sum()
        self.log("valid_loss", loss)

    def test_step(self, batch, batch_idx):
        loss = self(batch).sum()
        self.log("test_loss", loss)

    def configure_optimizers(self):
        return torch.optim.SGD(self.layer.parameters(), lr=0.1)

def get_logger(save_dir: str = "./"):
    wb_logger = pl_loggers.WandbLogger(name=None, id=None, save_dir=str(save_dir))

    # Does not work: checking the type of the logger. I should be able to check for DummyLogger here.
    if isinstance(wb_logger, pl_loggers.WandbLogger):
        print("Debug logger 1 (A): ", type(wb_logger))
    else:
        print("Debug logger 1 (B): Warning - got DummyLogger", type(wb_logger))

    # Workaround: catch DummyLogger instances by the weird path they return.
    if isinstance(wb_logger.experiment.dir, str):
        print("Debug logger 2 (A): Got path to exp dir from WandB logger: ", wb_logger.experiment.dir)
        # In my training script, I would save a config file to the exp dir created by WandbLogger.
    else:
        print("Debug logger 2 (B): Warning - got DummyLogger obj instead of path: ", wb_logger.experiment.dir)

    return wb_logger
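
# Sketch of how this guard would be used in a real training script (the `config` dict,
# the `yaml` usage, and the "config.yaml" filename are hypothetical illustrations, not
# part of this repro): only write files once `experiment.dir` is a real string path.
#
#     import yaml
#
#     def save_config(wb_logger, config: dict):
#         exp_dir = wb_logger.experiment.dir
#         if isinstance(exp_dir, str):  # rank 0: a real wandb run dir
#             with open(os.path.join(exp_dir, "config.yaml"), "w") as f:
#                 yaml.safe_dump(config, f)
#         # On other ranks, `experiment.dir` is DummyExperiment.nop, so skip writing.
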
# Alternate workaround: run the function only on rank zero.
@rank_zero_only
def get_logger_2(save_dir: str = "./"):
    wb_logger = pl_loggers.WandbLogger(name=None, id=None, save_dir=str(save_dir))

    # Normally, I would save a file to wb_logger.experiment.dir.
    # Workaround: catch DummyLogger instances by the weird path they return.
    if isinstance(wb_logger.experiment.dir, str):
        print("Debug logger 3 (A): Got path to exp dir from WandB logger: ", wb_logger.experiment.dir)
    else:
        print("Debug logger 3 (B): Warning - got DummyLogger obj instead of path: ", wb_logger.experiment.dir)

    return wb_logger
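
# Caveat with this workaround (my understanding of `rank_zero_only`, not something the
# repro itself demonstrates): the decorator only executes the wrapped function on global
# rank 0 and returns None on every other rank, so in the non-zero-rank processes
# `get_logger_2()` hands `logger=None` to the Trainer. For example:
#
#     @rank_zero_only
#     def current_run_dir(wb_logger):
#         return wb_logger.experiment.dir  # real path on rank 0; the call yields None elsewhere
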
def run():
    warnings.filterwarnings("ignore", "^The dataloader, train_dataloader.*, does not have many workers.*")
    warnings.filterwarnings("ignore", "^The dataloader, val_dataloader.*, does not have many workers.*")
    warnings.filterwarnings("ignore", "^The dataloader, test_dataloader.*, does not have many workers.*")
    warnings.filterwarnings("ignore", "^The number of training samples .* is smaller than the logging interval.*")

    train_data = DataLoader(RandomDataset(32, 64), batch_size=2)
    val_data = DataLoader(RandomDataset(32, 64), batch_size=2)
    test_data = DataLoader(RandomDataset(32, 64), batch_size=2)

    # Does not work: enabling an experimental feature behind a flag to see if it resolves the issue.
    wandb.require(experiment="service")

    wb_logger = get_logger()
    # wb_logger = get_logger_2()  # Alternate workaround - use rank_zero_only

    model = BoringModel()
    trainer = Trainer(
        default_root_dir=os.getcwd(),
        limit_train_batches=1,
        limit_val_batches=1,
        num_sanity_val_steps=0,
        max_epochs=1,
        enable_model_summary=False,
        gpus=2,
        # DDPPlugin is used to disable `find_unused_parameters`. That annoying warning pops up
        # even in the BoringModel example.
        strategy=DDPPlugin(find_unused_parameters=False),
        logger=wb_logger,
    )
    trainer.fit(model, train_dataloaders=train_data, val_dataloaders=val_data)
    trainer.test(model, dataloaders=test_data)


if __name__ == "__main__":
    run()
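
# To reproduce (assumptions about the setup, not stated in the gist): run this file directly
# on a machine with 2 GPUs after `wandb login`, e.g. `python repro.py` (filename hypothetical).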
# ---
""" Analyze Outputs (wandb==0.12.7, latest as of Nov 21 2021):
Expected Output:
Debug logger 1 (A): <class 'pytorch_lightning.loggers.wandb.WandbLogger'>
Debug logger 2 (A): Got path to exp dir from WandB logger: ./wandb/run-20211121_012407-3of6fq7j/files
Debug logger 1 (B): Debug logger 1 (B): Warning - got DummyLogger <class 'pytorch_lightning.loggers.base.DummyLogger'>
Debug logger 2 (B): Warning - got DummyLogger obj instead of path: <bound method DummyExperiment.nop of <pytorch_lightning.loggers.base.DummyExperiment object at 0x7f9293cf9490>>
Got Output:
Expected Output:
Debug logger 1 (A): <class 'pytorch_lightning.loggers.wandb.WandbLogger'>
Debug logger 2 (A): Got path to exp dir from WandB logger: ./wandb/run-20211121_012407-3of6fq7j/files
Debug logger 1 (A): <class 'pytorch_lightning.loggers.wandb.WandbLogger'>
Debug logger 2 (B): Warning - got DummyLogger obj instead of path: <bound method DummyExperiment.nop of <pytorch_lightning.loggers.base.DummyExperiment object at 0x7f9293cf9490>>
Somehow, 2nd GPU's logger is getting detected as an WandbLogger instance.
"""
# ---
""" Full Sample Output:
Debug logger 1 (A): <class 'pytorch_lightning.loggers.wandb.WandbLogger'>
wandb: Currently logged in as: ... <ignored>
Debug logger 2 (A): Got path to exp dir from WandB logger: ./wandb/run-20211121_012407-3of6fq7j/files
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
Debug logger 1 (A): <class 'pytorch_lightning.loggers.wandb.WandbLogger'>
Debug logger 2 (B): Warning - got DummyLogger obj instead of path: <bound method DummyExperiment.nop of <pytorch_lightning.loggers.base.DummyExperiment object at 0x7f9293cf9490>>
initializing distributed: GLOBAL_RANK: 1, MEMBER: 2/2
initializing distributed: GLOBAL_RANK: 0, MEMBER: 1/2
----------------------------------------------------------------------------------------------------
distributed_backend=nccl
All distributed processes registered. Starting with 2 processes
----------------------------------------------------------------------------------------------------
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]
LOCAL_RANK: 1 - CUDA_VISIBLE_DEVICES: [0,1]
Epoch 0: 100%|██████████| 2/2 [00:00<00:00, 7.35it/s, loss=-1.15, v_num=mira]
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]
LOCAL_RANK: 1 - CUDA_VISIBLE_DEVICES: [0,1]
Testing: 0it [00:00, ?it/s]--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{'test_loss': -0.055130764842033386}
--------------------------------------------------------------------------------
Testing: 100%|██████████| 16/16 [00:00<00:00, 1187.89it/s]
wandb: Waiting for W&B process to finish, PID 20907... (success).
"""