@3outeille
Last active December 11, 2023 11:48
RuntimeError: a leaf Variable that requires grad is being used in an in-place operation.
# Plain torch.distributed reproduction: broadcasting a leaf tensor that requires grad runs without error here.
import torch
from torch.nn import functional as F
from torch import distributed as dist
import os
import numpy as np
import random


def set_random_seed(seed: int):
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)


def init_distributed(backend: str):
    # Relies on the RANK, WORLD_SIZE and LOCAL_RANK environment variables set by the launcher (e.g. torchrun).
    print(f"Initializing distributed backend: {backend}")
    print(f"RANK: {os.environ['RANK']}")
    print(f"WORLD_SIZE: {os.environ['WORLD_SIZE']}")
    dist.init_process_group(backend, rank=int(os.environ["RANK"]), world_size=int(os.environ["WORLD_SIZE"]))
    torch.cuda.set_device(int(os.environ["LOCAL_RANK"]))
    # Set a common seed on every rank.
    torch.manual_seed(42)
    torch.cuda.manual_seed(42)


if __name__ == "__main__":
    init_distributed(backend="nccl")
    rank = dist.get_rank()
    set_random_seed(42 + rank)

    batch_size = 1
    in_features = 4
    out_features = 6

    X = torch.randn(batch_size, in_features, device="cuda", requires_grad=True)

    # Rank 0 broadcasts X and W to the other ranks.
    dist.broadcast(X, src=0)  # OK
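
The plain torch.distributed script above happens to go through, but the broadcast still copies into a leaf tensor that requires grad in-place. A minimal sketch of a more defensive variant, hypothetical and not part of the original gist, keeps the collective out of autograd by wrapping it in torch.no_grad():

    # Hypothetical variant of the broadcast above: under torch.no_grad(), in-place
    # writes into a leaf tensor that requires grad are permitted, so the autograd
    # check behind the RuntimeError is never triggered.
    X = torch.randn(batch_size, in_features, device="cuda", requires_grad=True)
    with torch.no_grad():
        dist.broadcast(X, src=0)  # in-place copy on the receiving ranks, not recorded by autograd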
# pipegoose reproduction: the same broadcast fails with "a leaf Variable that requires grad is being used in an in-place operation".
import torch
import numpy as np
import random
import torch.distributed as dist

from pipegoose.distributed.parallel_context import ParallelContext


def set_random_seed(seed: int):
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)


if __name__ == "__main__":
    DATA_PARALLEL_SIZE = 1
    TENSOR_PARALLEL_SIZE = 2
    PIPELINE_PARALLEL_SIZE = 1
    SEED = 42

    torch.cuda.empty_cache()

    parallel_context = ParallelContext.from_torch(
        data_parallel_size=DATA_PARALLEL_SIZE,
        tensor_parallel_size=TENSOR_PARALLEL_SIZE,
        pipeline_parallel_size=PIPELINE_PARALLEL_SIZE,
    )
    rank = parallel_context.get_global_rank()
    set_random_seed(SEED + rank)

    batch_size = 1
    in_features = 4

    X = torch.randn(batch_size, in_features, device="cuda", requires_grad=True)

    # Rank 0 broadcasts X and W to the other ranks.
    dist.broadcast(X, src=0)  # RuntimeError: a leaf Variable that requires grad is being used in an in-place operation.
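
One way around the error, sketched below under the assumption that the broadcast itself does not need to be part of the autograd graph, is to create X without requires_grad, broadcast it, and only then mark it as requiring grad (wrapping the collective in torch.no_grad(), as in the sketch after the first script, works as well):

    # Hypothetical workaround, not from the original gist:
    X = torch.randn(batch_size, in_features, device="cuda")  # leaf tensor, gradients not tracked yet
    dist.broadcast(X, src=0)  # plain in-place copy, no autograd involvement
    X.requires_grad_()        # start tracking gradients only after the broadcast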