
@jataylo
Created April 30, 2024 14:52
DDP torch.compile hang issues

import torch
import torch.distributed as dist
import torch.multiprocessing as mp
import torch.nn as nn
from torch.nn.parameter import Parameter
from torch.nn.parallel import DistributedDataParallel as DDP
from typing import Union, Optional, Callable, Tuple, List, Dict, Any
import os
os.environ["MASTER_ADDR"]="localhost"
os.environ["MASTER_PORT"]="9003"
@torch.compile
def bias_gelu_fused_(inp: torch.Tensor, bias: torch.Tensor) -> torch.Tensor:
"""Bias-GeLU fused"""
x = inp + bias
return x * 0.5 * (1.0 + torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x)))
@torch.compile
def gelu_fused_(inp: torch.Tensor) -> torch.Tensor:
    """GeLU fused (no-bias path; defined here so the branch below resolves, not hit in this repro)"""
    return inp * 0.5 * (1.0 + torch.tanh(0.79788456 * inp * (1 + 0.044715 * inp * inp)))

def bias_gelu_fused(inp: torch.Tensor, bias: torch.Tensor) -> torch.Tensor:
    """Disable native AMP for bias_gelu_fused_"""
    with torch.cuda.amp.autocast(enabled=False):
        if bias.numel() != 0:
            return bias_gelu_fused_(inp, bias)
        return gelu_fused_(inp)
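# Added sketch (not part of the original gist): the hand-written tanh formula above
# should match PyTorch's built-in tanh-approximate GeLU, which makes for a quick
# single-process sanity check before involving DDP / torch.compile. Helper name is hypothetical.
def _check_bias_gelu_matches_reference(inp: torch.Tensor, bias: torch.Tensor) -> None:
    ref = torch.nn.functional.gelu(inp + bias, approximate="tanh")
    torch.testing.assert_close(bias_gelu_fused_(inp, bias), ref, rtol=1e-3, atol=1e-3)
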
class _BiasGelu(torch.autograd.Function):
"""BiasGelu semi-top level module
Calls custom cuda extensions.
"""
@staticmethod
def forward(
ctx,
fc1_output: torch.Tensor,
fc1_bias: torch.Tensor,
) -> torch.Tensor:
gelu_out = bias_gelu_fused(fc1_output, fc1_bias)
gelu_out.requires_grad=True
ctx.save_for_backward(
fc1_output,
fc1_bias,
gelu_out,
)
return gelu_out
class BiasGelu(nn.Module):
    def __init__(
        self,
        fc1_output_features: int,
        params_dtype: Optional[torch.dtype] = None,
        device: Union[torch.device, str] = "cuda",
    ) -> None:
        super().__init__()
        self.fc1_bias = Parameter(
            torch.empty(fc1_output_features, device=device, dtype=params_dtype)
        )

    def forward(
        self, fc1_output: torch.Tensor,
    ) -> torch.Tensor:
        return _BiasGelu.apply(fc1_output, self.fc1_bias)

def example(rank, world_size):
    # Problem sizes for the repro (only ffn_hidden_size, batch_size,
    # sequence_length and dtype are actually used below).
    hidden_size = 4096
    ffn_hidden_size = 16384
    num_attention_heads = 32
    dtype = torch.float16
    batch_size = 4
    sequence_length = 128
    attention_dropout = 0.1
    layernorm_epsilon = 1e-5

    torch.cuda.set_device(rank)
    # create default process group
    dist.init_process_group("nccl", rank=rank, world_size=world_size)
    # create local model
    te_bias_gelu = BiasGelu(
        ffn_hidden_size,
        params_dtype=torch.get_default_dtype(),
        device="cuda",
    )
    te_bias_gelu.to(dtype=dtype).to(rank).cuda()
    model = te_bias_gelu
    # construct DDP model
    ddp_model = DDP(model, device_ids=[rank])
    # forward through DDP; this exercises the torch.compile'd bias-GeLU under DDP
    fc1_output = torch.randn(sequence_length * batch_size, ffn_hidden_size).to(rank).cuda().to(dtype=dtype)
    gelu = ddp_model(
        fc1_output,
    )
    print(f'rank:{rank}, finished')

def main():
    world_size = 2
    mp.spawn(example,
             args=(world_size,),
             nprocs=world_size,
             join=True)

if __name__ == "__main__":
    main()
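
For reference, the same repro can be driven by torchrun instead of mp.spawn. This is a minimal sketch, not part of the original gist: the filename ddp_bias_gelu_repro.py and the torchrun_main name are assumptions, at least two GPUs are assumed visible, and the __main__ guard above would need to call torchrun_main() instead of main(). torchrun supplies RANK, LOCAL_RANK, WORLD_SIZE, MASTER_ADDR and MASTER_PORT, so the hard-coded MASTER_ADDR/MASTER_PORT at the top are not needed in this path.

# Hypothetical alternative entry point; launch with:
#   torchrun --nproc_per_node=2 ddp_bias_gelu_repro.py
def torchrun_main():
    local_rank = int(os.environ["LOCAL_RANK"])
    torch.cuda.set_device(local_rank)
    # env:// initialization reads RANK/WORLD_SIZE/MASTER_ADDR/MASTER_PORT set by torchrun
    dist.init_process_group("nccl")
    model = BiasGelu(16384, params_dtype=torch.get_default_dtype(), device="cuda").to(dtype=torch.float16)
    ddp_model = DDP(model, device_ids=[local_rank])
    inp = torch.randn(128 * 4, 16384, device="cuda", dtype=torch.float16)
    out = ddp_model(inp)
    print(f"rank:{dist.get_rank()}, finished")
    dist.destroy_process_group()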