
@jataylo
Created April 30, 2024 14:52
DDP torch.compile hang issues

import torch
import torch.distributed as dist
import torch.multiprocessing as mp
import torch.nn as nn
from torch.nn.parameter import Parameter
from torch.nn.parallel import DistributedDataParallel as DDP
from typing import Union, Optional, Callable, Tuple, List, Dict, Any
import os
os.environ["MASTER_ADDR"]="localhost"
os.environ["MASTER_PORT"]="9003"
@torch.compile
def bias_gelu_fused_(inp: torch.Tensor, bias: torch.Tensor) -> torch.Tensor:
"""Bias-GeLU fused"""
x = inp + bias
return x * 0.5 * (1.0 + torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x)))
@torch.compile
def gelu_fused_(inp: torch.Tensor) -> torch.Tensor:
    """GeLU fused (no-bias path; defined here so the branch below resolves, not hit in this repro)"""
    return inp * 0.5 * (1.0 + torch.tanh(0.79788456 * inp * (1 + 0.044715 * inp * inp)))

def bias_gelu_fused(inp: torch.Tensor, bias: torch.Tensor) -> torch.Tensor:
    """Disable native AMP for bias_gelu_fused_"""
    with torch.cuda.amp.autocast(enabled=False):
        if bias.numel() != 0:
            return bias_gelu_fused_(inp, bias)
        return gelu_fused_(inp)
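# Added sketch (not part of the original gist): the hand-written tanh formula above
# should match PyTorch's built-in tanh-approximate GeLU, which makes for a quick
# single-process sanity check before involving DDP / torch.compile. Helper name is hypothetical.
def _check_bias_gelu_matches_reference(inp: torch.Tensor, bias: torch.Tensor) -> None:
    ref = torch.nn.functional.gelu(inp + bias, approximate="tanh")
    torch.testing.assert_close(bias_gelu_fused_(inp, bias), ref, rtol=1e-3, atol=1e-3)
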
class _BiasGelu(torch.autograd.Function):
"""BiasGelu semi-top level module
Calls custom cuda extensions.
"""
@staticmethod
def forward(
ctx,
fc1_output: torch.Tensor,
fc1_bias: torch.Tensor,
) -> torch.Tensor:
gelu_out = bias_gelu_fused(fc1_output, fc1_bias)
gelu_out.requires_grad=True
ctx.save_for_backward(
fc1_output,
fc1_bias,
gelu_out,
)
return gelu_out
class BiasGelu(nn.Module):
    def __init__(
        self,
        fc1_output_features: int,
        params_dtype: Optional[torch.dtype] = None,
        device: Union[torch.device, str] = "cuda",
    ) -> None:
        super().__init__()
        self.fc1_bias = Parameter(
            torch.empty(fc1_output_features, device=device, dtype=params_dtype)
        )

    def forward(
        self, fc1_output: torch.Tensor,
    ) -> torch.Tensor:
        return _BiasGelu.apply(fc1_output, self.fc1_bias)

def example(rank, world_size):
    # Problem sizes for the repro (only ffn_hidden_size, batch_size,
    # sequence_length and dtype are actually used below).
    hidden_size = 4096
    ffn_hidden_size = 16384
    num_attention_heads = 32
    dtype = torch.float16
    batch_size = 4
    sequence_length = 128
    attention_dropout = 0.1
    layernorm_epsilon = 1e-5

    torch.cuda.set_device(rank)
    # create default process group
    dist.init_process_group("nccl", rank=rank, world_size=world_size)
    # create local model
    te_bias_gelu = BiasGelu(
        ffn_hidden_size,
        params_dtype=torch.get_default_dtype(),
        device="cuda",
    )
    te_bias_gelu.to(dtype=dtype).to(rank).cuda()
    model = te_bias_gelu
    # construct DDP model
    ddp_model = DDP(model, device_ids=[rank])
    # forward through DDP; this exercises the torch.compile'd bias-GeLU under DDP
    fc1_output = torch.randn(sequence_length * batch_size, ffn_hidden_size).to(rank).cuda().to(dtype=dtype)
    gelu = ddp_model(
        fc1_output,
    )
    print(f'rank:{rank}, finished')

def main():
    world_size = 2
    mp.spawn(example,
             args=(world_size,),
             nprocs=world_size,
             join=True)

if __name__ == "__main__":
    main()
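
For reference, the same repro can be driven by torchrun instead of mp.spawn. This is a minimal sketch, not part of the original gist: the filename ddp_bias_gelu_repro.py and the torchrun_main name are assumptions, at least two GPUs are assumed visible, and the __main__ guard above would need to call torchrun_main() instead of main(). torchrun supplies RANK, LOCAL_RANK, WORLD_SIZE, MASTER_ADDR and MASTER_PORT, so the hard-coded MASTER_ADDR/MASTER_PORT at the top are not needed in this path.

# Hypothetical alternative entry point; launch with:
#   torchrun --nproc_per_node=2 ddp_bias_gelu_repro.py
def torchrun_main():
    local_rank = int(os.environ["LOCAL_RANK"])
    torch.cuda.set_device(local_rank)
    # env:// initialization reads RANK/WORLD_SIZE/MASTER_ADDR/MASTER_PORT set by torchrun
    dist.init_process_group("nccl")
    model = BiasGelu(16384, params_dtype=torch.get_default_dtype(), device="cuda").to(dtype=torch.float16)
    ddp_model = DDP(model, device_ids=[local_rank])
    inp = torch.randn(128 * 4, 16384, device="cuda", dtype=torch.float16)
    out = ddp_model(inp)
    print(f"rank:{dist.get_rank()}, finished")
    dist.destroy_process_group()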