Skip to content

Instantly share code, notes, and snippets.

@soulitzer
Created October 23, 2023 19:20
Show Gist options
  • Save soulitzer/9fb272ade0442f83f79c8e79b1fa5e43 to your computer and use it in GitHub Desktop.
Save soulitzer/9fb272ade0442f83f79c8e79b1fa5e43 to your computer and use it in GitHub Desktop.
import os

import torch
from torch.nested._internal.nested_tensor import NestedTensor, jagged_from_list
from torch.profiler import profile, record_function, ProfilerActivity
# Benchmark script: profiles forward + backward of a small sin/cos/linear
# stack on nested tensors in three configurations and exports one Chrome
# trace per configuration and problem size:
#   1. jagged nested tensors run through torch.compile,
#   2. the same jagged nested tensors run eagerly,
#   3. nested tensors built with torch.nested.nested_tensor, run eagerly
#      (the "cpp" label comes from the trace file name).

# NOTE: the target GPU index is hard-coded; adjust it for the local machine.
device = "cuda:5"

# Every trace below is written under "traces/". Create the folder up front so
# an expensive profiling run is not lost because the output path is missing.
os.makedirs("traces", exist_ok=True)

for nb_unit in (10, 1, 2, 5, 20):
    lin = torch.nn.functional.linear

    def sin(x):
        """Return ``x.sin().cos()``."""
        return x.sin().cos()

    def unit(x):
        """Apply one benchmark unit: sin/cos -> linear(e) -> sin/cos -> linear(d).

        ``e`` and ``d`` are not parameters: they are looked up at call time
        from the enclosing scope, where the size loop below rebinds them for
        every ``D``.
        """
        return lin(sin(lin(sin(x), e)), d)

    def fn1(nt1, nt2):
        """Add the two inputs, then apply ``unit`` ``nb_unit`` times.

        ``nb_unit`` is read from the enclosing loop at call time.
        """
        out = nt1 + nt2
        for i in range(nb_unit):
            out = unit(out)
        return out

    # The 10-unit configuration sweeps six sizes (2**20 down to 2**10, in
    # steps of a factor of 4); every other configuration uses two sizes.
    if nb_unit == 10:
        tensor_sizes = [2**(n*2) for n in range(10, 4, -1)]
    else:
        tensor_sizes = [2**14, 2**20]

    for D in tensor_sizes:
        print(f"{nb_unit} units, {D} elements")

        # Three components with 20 / 30 / 40 rows and D columns each.
        a = torch.randn(20, D, dtype=torch.float32, device=device)
        b = torch.randn(30, D, dtype=torch.float32, device=device)
        c = torch.randn(40, D, dtype=torch.float32, device=device)
        # Weights of the two linear layers used by ``unit``:
        # ``e`` maps D -> 256 features and ``d`` maps 256 -> D features.
        d = torch.randn(D, 256, dtype=torch.float32, device=device)
        e = torch.randn(256, D, dtype=torch.float32, device=device)

        # The first call returns the offsets; the later calls are given those
        # same offsets (presumably so all three nested tensors are treated as
        # having the same ragged structure -- confirm against the
        # jagged_from_list implementation).
        nt, offsets = jagged_from_list([a, b, c], None)
        nt = nt.detach().requires_grad_(True)
        nt2, _ = jagged_from_list([a, b, c], offsets)
        nt2 = nt2.detach().requires_grad_(True)
        # nt3 is only used as the incoming gradient (grad_outputs).
        nt3, _ = jagged_from_list([a, b, c], offsets)

        # 1) Jagged nested tensors, compiled. torch.compile is called inside
        #    the profiler context, as in the original script.
        with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof:
            compiled_f = torch.compile(fn1, fullgraph=True, dynamic=True)
            for _ in range(10):
                out = compiled_f(nt, nt2)
                ga, gb = torch.autograd.grad(out, grad_outputs=(nt3,), inputs=(nt, nt2))
            # Wait for queued GPU work before the profiler context closes.
            torch.cuda.synchronize(device=device)
        prof.export_chrome_trace(f"traces/nt_compile_trace_{nb_unit}_{D}.json")

        # 2) Same jagged nested tensors, eager execution.
        with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof:
            for _ in range(10):
                out_ref = fn1(nt, nt2)
                ga_ref, gb_ref = torch.autograd.grad(out_ref, grad_outputs=(nt3,), inputs=(nt, nt2))
            torch.cuda.synchronize(device=device)
        prof.export_chrome_trace(f"traces/nt_no_compile_trace_{nb_unit}_{D}.json")

        # 3) Nested tensors built with torch.nested.nested_tensor, eager
        #    execution. Note this section runs 20 iterations, not 10.
        nt = torch.nested.nested_tensor([a, b, c], requires_grad=True)
        nt2 = torch.nested.nested_tensor([a, b, c], requires_grad=True)
        nt3 = torch.nested.nested_tensor([a, b, c], requires_grad=True)
        with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof:
            for _ in range(20):
                out = fn1(nt, nt2)
                ga, gb = torch.autograd.grad(out, grad_outputs=(nt3,), inputs=(nt, nt2))
            torch.cuda.synchronize(device=device)
        prof.export_chrome_trace(f"traces/nt_cpp_trace_{nb_unit}_{D}.json")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment