@3outeille
Last active December 11, 2023 11:48
RuntimeError: a leaf Variable that requires grad is being used in an in-place operation.
# Plain torch.distributed reproduction: broadcasting a leaf tensor that requires grad runs without error here.
import torch
from torch.nn import functional as F
from torch import distributed as dist
import os
import numpy as np
import random


def set_random_seed(seed: int):
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)


def init_distributed(backend: str):
    # Relies on the RANK, WORLD_SIZE and LOCAL_RANK environment variables set by the launcher (e.g. torchrun).
    print(f"Initializing distributed backend: {backend}")
    print(f"RANK: {os.environ['RANK']}")
    print(f"WORLD_SIZE: {os.environ['WORLD_SIZE']}")
    dist.init_process_group(backend, rank=int(os.environ["RANK"]), world_size=int(os.environ["WORLD_SIZE"]))
    torch.cuda.set_device(int(os.environ["LOCAL_RANK"]))
    # Set a common seed on every rank.
    torch.manual_seed(42)
    torch.cuda.manual_seed(42)


if __name__ == "__main__":
    init_distributed(backend="nccl")
    rank = dist.get_rank()
    set_random_seed(42 + rank)

    batch_size = 1
    in_features = 4
    out_features = 6

    X = torch.randn(batch_size, in_features, device="cuda", requires_grad=True)

    # Rank 0 broadcasts X and W to the other ranks.
    dist.broadcast(X, src=0)  # OK
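
The plain torch.distributed script above happens to go through, but the broadcast still copies into a leaf tensor that requires grad in-place. A minimal sketch of a more defensive variant, hypothetical and not part of the original gist, keeps the collective out of autograd by wrapping it in torch.no_grad():

    # Hypothetical variant of the broadcast above: under torch.no_grad(), in-place
    # writes into a leaf tensor that requires grad are permitted, so the autograd
    # check behind the RuntimeError is never triggered.
    X = torch.randn(batch_size, in_features, device="cuda", requires_grad=True)
    with torch.no_grad():
        dist.broadcast(X, src=0)  # in-place copy on the receiving ranks, not recorded by autograd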
# pipegoose reproduction: the same broadcast fails with "a leaf Variable that requires grad is being used in an in-place operation".
import torch
import numpy as np
import random
import torch.distributed as dist

from pipegoose.distributed.parallel_context import ParallelContext


def set_random_seed(seed: int):
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)


if __name__ == "__main__":
    DATA_PARALLEL_SIZE = 1
    TENSOR_PARALLEL_SIZE = 2
    PIPELINE_PARALLEL_SIZE = 1
    SEED = 42

    torch.cuda.empty_cache()

    parallel_context = ParallelContext.from_torch(
        data_parallel_size=DATA_PARALLEL_SIZE,
        tensor_parallel_size=TENSOR_PARALLEL_SIZE,
        pipeline_parallel_size=PIPELINE_PARALLEL_SIZE,
    )
    rank = parallel_context.get_global_rank()
    set_random_seed(SEED + rank)

    batch_size = 1
    in_features = 4

    X = torch.randn(batch_size, in_features, device="cuda", requires_grad=True)

    # Rank 0 broadcasts X and W to the other ranks.
    dist.broadcast(X, src=0)  # RuntimeError: a leaf Variable that requires grad is being used in an in-place operation.
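
One way around the error, sketched below under the assumption that the broadcast itself does not need to be part of the autograd graph, is to create X without requires_grad, broadcast it, and only then mark it as requiring grad (wrapping the collective in torch.no_grad(), as in the sketch after the first script, works as well):

    # Hypothetical workaround, not from the original gist:
    X = torch.randn(batch_size, in_features, device="cuda")  # leaf tensor, gradients not tracked yet
    dist.broadcast(X, src=0)  # plain in-place copy, no autograd involvement
    X.requires_grad_()        # start tracking gradients only after the broadcast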