Skip to content

Instantly share code, notes, and snippets.

@rizar
Created January 2, 2022 21:16
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save rizar/be08b9b5b2a8aed1a24cc316ba6a6a96 to your computer and use it in GitHub Desktop.
Save rizar/be08b9b5b2a8aed1a24cc316ba6a6a96 to your computer and use it in GitHub Desktop.
Bespoke MLP Implementation for Measuring Throughput
import os
import time
import torch
from torch.nn import Linear, ReLU
from torch.profiler import profile, ProfilerActivity
from codeparrot.build_table import build_table
# All benchmark tensors and the model live on the CUDA device.
device = torch.device('cuda')

# gpt-2 v2
bs = 2 ** 13   # batch size (rows of the input matrix)
d = 1600       # input/output width of the MLP
d_ff = 4 * d   # hidden width of the MLP

# USE_FP16=1 runs the forward pass under CUDA autocast.
fp16_training = os.environ.get('USE_FP16') == '1'
# USE_FP16_WEIGHTS=1 stores the Linear parameters in float16 instead of float32.
weight_dtype = torch.float16 if os.environ.get('USE_FP16_WEIGHTS') == '1' else torch.float32

# Under autocast the float32 input is cast as needed, so it is kept in float32.
# Without autocast the input must have the same dtype as the weights;
# otherwise float16 weights with a float32 input make the first Linear raise
# a dtype-mismatch error.
input_dtype = torch.float32 if fp16_training else weight_dtype

# One fixed random batch, reused for every step of the benchmark.
x = torch.rand((bs, d)).to(device=device, dtype=input_dtype)
class MLP(torch.nn.Module):
    """Feed-forward block with a residual connection and a scalar output head.

    Layer widths (``d``, ``d_ff``) and the parameter dtype (``weight_dtype``)
    are read from module-level names when an instance is constructed.
    """

    def __init__(self):
        """Create the two hidden projections, the output head and the ReLU."""
        super().__init__()
        self.lin1 = Linear(d, d_ff, dtype=weight_dtype)
        self.lin2 = Linear(d_ff, d, dtype=weight_dtype)
        self.out = Linear(d, 1, dtype=weight_dtype)
        self.relu = ReLU()

    def forward(self, x):
        """Return ``out(x + lin2(relu(lin1(x))))`` for the input ``x``."""
        hidden = self.relu(self.lin1(x))
        with_residual = x + self.lin2(hidden)
        return self.out(with_residual)
def _count_trainable(module):
    """Return the total element count of the parameters that require gradients."""
    return sum(p.numel() for p in module.parameters() if p.requires_grad)


# Build the model, move it to the benchmark device and size it.
model = MLP()
model.to(device)
model_size = _count_trainable(model)
torch.cuda.synchronize()

optimizer = torch.optim.SGD(model.parameters(), lr=0.000001)

# FLOPs of one training step are estimated as 6 * parameters * batch size.
step_flops = 6 * model_size * bs
# Total compute budget for the run: 1000 teraFLOPs.
total_flops = 1000 * 10 ** 12
steps = int(total_flops / step_flops)
# The first half of the steps is warm-up; timing starts at step `burnin_steps`.
burnin_steps = steps // 2
assert steps > burnin_steps

# Summary of the run configuration.
print(model)
print(f'{steps} steps of with batch size {bs}')
print(f"Model_size: {model_size / 10 ** 9} billions")
print(f"One step is {step_flops / 10 ** 12} teraFLOPs")
print(f'fp16 is {fp16_training}')
def compute_loss():
    """Run one forward pass on the fixed batch ``x`` and return the loss.

    The loss is the mean of the squared model outputs. The forward pass runs
    inside ``torch.cuda.amp.autocast``, which is enabled only when
    ``fp16_training`` is true.
    """
    with torch.cuda.amp.autocast(enabled=fp16_training):
        out = model(x)
        loss = (out ** 2).mean()
    return loss


def train(profile_step=False):
    """Perform one optimizer step; optionally profile forward + backward.

    Args:
        profile_step: when true, the forward and backward passes run under
            the torch profiler with CUDA activity recording.

    Returns:
        The profiler object when ``profile_step`` is true, otherwise ``None``.
    """
    optimizer.zero_grad()
    if profile_step:
        # `activities` already selects CUDA; the deprecated `use_cuda` flag
        # is redundant and therefore not passed.
        with profile(activities=[ProfilerActivity.CUDA], record_shapes=True, with_flops=True) as step_prof:
            loss = compute_loss()
            loss.backward()
    else:
        step_prof = None
        loss = compute_loss()
        loss.backward()
    optimizer.step()
    return step_prof


# Read the profiling switch once instead of on every step.
profiling_enabled = os.environ.get("PROFILE") == '1'
# Number of steps that actually ran inside the timed window.
measured_steps = 0

for i in range(steps):
    if i == burnin_steps:
        # Warm-up is over: start the clock (monotonic, suited for intervals).
        after_burnin = time.perf_counter()
    prof = train(profile_step=profiling_enabled and i == burnin_steps)
    # Wait for the GPU so the elapsed time covers the queued kernels.
    torch.cuda.synchronize()
    if i >= burnin_steps:
        measured_steps += 1
    if i == burnin_steps and prof is not None:
        # In profiling mode only the single profiled step is measured.
        break

time_took = time.perf_counter() - after_burnin
# Count only the steps that were really timed; in profiling mode the loop
# stops early, so assuming `steps - burnin_steps` would inflate the result.
flops = step_flops * measured_steps
print(f"{time_took} seconds")
print(f"{flops / time_took / 10 ** 12} teraFLOPs per second")
# When a step was profiled, dump the per-op summary table to a timestamped file.
if prof:
    report_path = f"profile_mlp_{int(time.time())}.txt"
    with open(report_path, 'w') as prof_dst:
        op_averages = prof.key_averages()
        summary = build_table(
            op_averages,
            sort_by="cuda_time_total",
            max_src_column_width=150,
            top_level_events_only=True,
        )
        print(summary, file=prof_dst)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment