# rizar/residual_mlp.py

## residual_mlp.py
import os
import time
import torch
from torch.nn import Linear, ReLU
from torch.profiler import profile, ProfilerActivity

from codeparrot.build_table import build_table

# Everything (input batch and model) lives on the GPU; the script needs CUDA.
device = torch.device('cuda')

# gpt-2 v2
# Benchmark dimensions: batch of 2**13 rows, model width d, hidden width 4*d.
# NOTE(review): d = 1600 presumably mirrors the GPT-2 XL hidden size - confirm.
bs = 2 ** 13
d = 1600
d_ff = 4 * d

# USE_FP16=1 runs the forward pass under CUDA autocast.
# USE_FP16_WEIGHTS=1 creates the Linear parameters in float16 instead of float32.
fp16_training = os.environ.get('USE_FP16') == '1'
weight_dtype = torch.float16 if os.environ.get('USE_FP16_WEIGHTS') == '1' else torch.float32

# Fixed random input batch reused by every step. It is allocated directly on
# the device (no host tensor plus copy) and in the same dtype as the weights:
# with autocast disabled, a float32 input fed into float16 Linear layers
# fails with a dtype-mismatch error.
x = torch.rand((bs, d), device=device, dtype=weight_dtype)
class MLP(torch.nn.Module):
    """Residual two-layer MLP followed by a projection to a single output.

    The layer widths (``d``, ``d_ff``) and the parameter dtype
    (``weight_dtype``) are read from module-level globals at construction.
    """

    def __init__(self):
        super().__init__()
        # Expand d -> d_ff, contract d_ff -> d, then project d -> 1.
        self.lin1 = Linear(in_features=d, out_features=d_ff, dtype=weight_dtype)
        self.lin2 = Linear(in_features=d_ff, out_features=d, dtype=weight_dtype)
        self.out = Linear(in_features=d, out_features=1, dtype=weight_dtype)
        self.relu = ReLU()

    def forward(self, x):
        """Return ``out(x + lin2(relu(lin1(x))))`` for the input batch ``x``."""
        hidden = self.relu(self.lin1(x))
        # Skip connection: the MLP branch is added back onto its own input.
        residual = x + self.lin2(hidden)
        return self.out(residual)
# Build the network and move its parameters onto the benchmark device.
model = MLP().to(device)
# Number of trainable scalars; this drives the FLOP estimate below.
model_size = sum(param.numel() for param in model.parameters() if param.requires_grad)
torch.cuda.synchronize()
optimizer = torch.optim.SGD(model.parameters(), lr=1e-6)

# One training step is costed at 6 FLOPs per parameter per sample, and the
# run length is derived from a fixed total FLOP budget.
step_flops = 6 * model_size * bs
total_flops = 1000 * 10 ** 12
steps = int(total_flops / step_flops)
# Half of the steps are set aside as burn-in.
burnin_steps = steps // 2
assert steps > burnin_steps

# Report the configuration before the run starts.
print(model)
print(f'{steps} steps of with batch size {bs}')

print(f"Model_size: {model_size / 10 ** 9} billions")
print(f"One step is {step_flops / 10 ** 12} teraFLOPs")
print(f'fp16 is {fp16_training}')

def compute_loss():
    """Run one forward pass on the fixed batch and return the scalar loss.

    The loss is the mean of the squared model outputs. The forward pass runs
    under CUDA autocast when ``fp16_training`` is set.
    """
    with torch.cuda.amp.autocast(enabled=fp16_training):
        out = model(x)
        loss = (out ** 2).mean()
    return loss


def train(profile_step=False):
    """Run one optimisation step: zero grads, forward, backward, SGD update.

    Args:
        profile_step: when true, the forward and backward passes are recorded
            with the PyTorch profiler (CUDA activity only).

    Returns:
        The profiler object if the step was profiled, otherwise ``None``.
    """
    optimizer.zero_grad()
    if profile_step:
        # `activities` already selects CUDA; the deprecated `use_cuda` flag
        # is not needed.
        with profile(activities=[ProfilerActivity.CUDA], record_shapes=True, with_flops=True) as prof:
            compute_loss().backward()
    else:
        prof = None
        compute_loss().backward()
    optimizer.step()
    return prof


# Read the switch once rather than querying the environment on every step.
profiling_enabled = os.environ.get("PROFILE") == '1'

for i in range(steps):
    if i == burnin_steps:
        # Burn-in is over: start the clock. The previous step has already
        # been synchronised, so no earlier GPU work leaks into the timing.
        after_burnin = time.time()
    prof = train(profile_step=profiling_enabled and i == burnin_steps)
    # Block until the GPU is done so wall-clock time covers the whole step.
    torch.cuda.synchronize()
    if prof:
        # A profiled run records the first post-burn-in step and then stops.
        break

# Wall-clock time elapsed since the end of burn-in.
time_took = time.time() - after_burnin
# Count only the steps that were actually timed. `i` is the index of the last
# step executed: a full run ends at steps - 1, whereas a profiled run breaks
# right after step `burnin_steps`, i.e. after a single timed step. Assuming
# the full post-burn-in count in that case would inflate the throughput.
timed_steps = i - burnin_steps + 1
flops = 6 * model_size * bs * timed_steps
print(f"{time_took} seconds")
print(f"{flops / time_took / 10 ** 12} teraFLOPs per second")

if prof:
    # Write the profiler summary, sorted by total CUDA time, to a
    # timestamped text file in the current directory.
    with open(f"profile_mlp_{int(time.time())}.txt", 'w') as prof_dst:
        print(
            build_table(
                prof.key_averages(), sort_by="cuda_time_total", max_src_column_width=150, top_level_events_only=True),
            file=prof_dst)
	import os
	import time
	import torch
	from torch.nn import Linear, ReLU
	from torch.profiler import profile, ProfilerActivity

	from codeparrot.build_table import build_table

	# Everything (input batch and model) lives on the GPU; the script needs CUDA.
	device = torch.device('cuda')

	# gpt-2 v2
	# Benchmark dimensions: batch of 2**13 rows, model width d, hidden width 4*d.
	# NOTE(review): d = 1600 presumably mirrors the GPT-2 XL hidden size - confirm.
	bs = 2 ** 13
	d = 1600
	d_ff = 4 * d

	# USE_FP16=1 runs the forward pass under CUDA autocast.
	# USE_FP16_WEIGHTS=1 creates the Linear parameters in float16 instead of float32.
	fp16_training = os.environ.get('USE_FP16') == '1'
	weight_dtype = torch.float16 if os.environ.get('USE_FP16_WEIGHTS') == '1' else torch.float32

	# Fixed random input batch reused by every step. It is allocated directly on
	# the device (no host tensor plus copy) and in the same dtype as the weights:
	# with autocast disabled, a float32 input fed into float16 Linear layers
	# fails with a dtype-mismatch error.
	x = torch.rand((bs, d), device=device, dtype=weight_dtype)
	class MLP(torch.nn.Module):
	    """Residual two-layer MLP followed by a projection to a single output.

	    The layer widths (``d``, ``d_ff``) and the parameter dtype
	    (``weight_dtype``) are read from the enclosing scope at construction.
	    """

	    def __init__(self):
	        super().__init__()
	        # Expand d -> d_ff, contract d_ff -> d, then project d -> 1.
	        self.lin1 = Linear(d, d_ff, dtype=weight_dtype)
	        self.lin2 = Linear(d_ff, d, dtype=weight_dtype)
	        self.out = Linear(d, 1, dtype=weight_dtype)
	        self.relu = ReLU()

	    def forward(self, x):
	        """Return ``out(x + lin2(relu(lin1(x))))`` for the input batch ``x``."""
	        # Skip connection: the MLP branch is added back onto its own input.
	        return self.out(x + self.lin2(self.relu(self.lin1(x))))
	# Build the network and move its parameters onto the benchmark device.
	model = MLP().to(device)
	# Number of trainable scalars; this drives the FLOP estimate below.
	model_size = sum(param.numel() for param in model.parameters() if param.requires_grad)
	torch.cuda.synchronize()
	optimizer = torch.optim.SGD(model.parameters(), lr=1e-6)

	# One training step is costed at 6 FLOPs per parameter per sample, and the
	# run length is derived from a fixed total FLOP budget.
	step_flops = 6 * model_size * bs
	total_flops = 1000 * 10 ** 12
	steps = int(total_flops / step_flops)
	# Half of the steps are set aside as burn-in.
	burnin_steps = steps // 2
	assert steps > burnin_steps

	# Report the configuration before the run starts.
	print(model)
	print(f'{steps} steps of with batch size {bs}')

	print(f"Model_size: {model_size / 10 ** 9} billions")
	print(f"One step is {step_flops / 10 ** 12} teraFLOPs")
	print(f'fp16 is {fp16_training}')

	def compute_loss():
	    """Run one forward pass on the fixed batch and return the scalar loss.

	    The loss is the mean of the squared model outputs. The forward pass runs
	    under CUDA autocast when ``fp16_training`` is set.
	    """
	    with torch.cuda.amp.autocast(enabled=fp16_training):
	        out = model(x)
	        loss = (out ** 2).mean()
	    return loss


	def train(profile_step=False):
	    """Run one optimisation step: zero grads, forward, backward, SGD update.

	    Args:
	        profile_step: when true, the forward and backward passes are recorded
	            with the PyTorch profiler (CUDA activity only).

	    Returns:
	        The profiler object if the step was profiled, otherwise ``None``.
	    """
	    optimizer.zero_grad()
	    if profile_step:
	        # `activities` already selects CUDA; the deprecated `use_cuda` flag
	        # is not needed.
	        with profile(activities=[ProfilerActivity.CUDA], record_shapes=True, with_flops=True) as prof:
	            compute_loss().backward()
	    else:
	        prof = None
	        compute_loss().backward()
	    optimizer.step()
	    return prof


	# Read the switch once rather than querying the environment on every step.
	profiling_enabled = os.environ.get("PROFILE") == '1'

	for i in range(steps):
	    if i == burnin_steps:
	        # Burn-in is over: start the clock. The previous step has already
	        # been synchronised, so no earlier GPU work leaks into the timing.
	        after_burnin = time.time()
	    prof = train(profile_step=profiling_enabled and i == burnin_steps)
	    # Block until the GPU is done so wall-clock time covers the whole step.
	    torch.cuda.synchronize()
	    if prof:
	        # A profiled run records the first post-burn-in step and then stops.
	        break

	# Wall-clock time elapsed since the end of burn-in.
	time_took = time.time() - after_burnin
	# Count only the steps that were actually timed. `i` is the index of the last
	# step executed: a full run ends at steps - 1, whereas a profiled run breaks
	# right after step `burnin_steps`, i.e. after a single timed step. Assuming
	# the full post-burn-in count in that case would inflate the throughput.
	timed_steps = i - burnin_steps + 1
	flops = 6 * model_size * bs * timed_steps
	print(f"{time_took} seconds")
	print(f"{flops / time_took / 10 ** 12} teraFLOPs per second")

	if prof:
	    # Write the profiler summary, sorted by total CUDA time, to a
	    # timestamped text file in the current directory.
	    with open(f"profile_mlp_{int(time.time())}.txt", 'w') as prof_dst:
	        print(
	            build_table(
	                prof.key_averages(), sort_by="cuda_time_total", max_src_column_width=150, top_level_events_only=True),
	            file=prof_dst)