# soulitzer/compile_nt_prof.py

## compile_nt_prof.py
import os

import torch
from torch.nested._internal.nested_tensor import NestedTensor, jagged_from_list
from torch.profiler import profile, record_function, ProfilerActivity

device = "cuda:5"

# Short alias for torch.nn.functional.linear, shared by the helpers below.
lin = torch.nn.functional.linear


def sin(x):
    """Return ``x.sin().cos()``, the elementwise nonlinearity used by ``unit``."""
    return x.sin().cos()


def unit(x):
    """Apply one ``sin -> linear(e) -> sin -> linear(d)`` stage to ``x``.

    ``e`` and ``d`` are module-level weight tensors; the size loop below
    rebinds them before any call is made for that size.
    """
    return lin(sin(lin(sin(x), e)), d)


def fn1(nt1, nt2):
    """Add the two inputs and push the sum through ``nb_unit`` chained units.

    ``nb_unit`` is read from module scope, where the outer sweep loop sets it.
    """
    out = nt1 + nt2
    for _ in range(nb_unit):
        out = unit(out)
    return out


def _profile_fwd_bwd(fn, lhs, rhs, grad_out, n_iters, trace_path):
    """Profile repeated forward/backward passes of ``fn`` and export the trace.

    Args:
        fn: Callable invoked as ``fn(lhs, rhs)``.
        lhs: First input; gradients are requested with respect to it.
        rhs: Second input; gradients are requested with respect to it.
        grad_out: Passed as the single ``grad_outputs`` entry to
            ``torch.autograd.grad``.
        n_iters: Number of forward/backward iterations to record.
        trace_path: Destination handed to ``prof.export_chrome_trace``.
    """
    with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof:
        for _ in range(n_iters):
            out = fn(lhs, rhs)
            torch.autograd.grad(out, grad_outputs=(grad_out,), inputs=(lhs, rhs))
            # Block until the device has finished before starting the next
            # iteration.
            torch.cuda.synchronize(device=device)

    prof.export_chrome_trace(trace_path)


# Every trace is written below "traces/"; create it up front so the run does
# not depend on the directory already existing.
os.makedirs("traces", exist_ok=True)

for nb_unit in (10, 1, 2, 5, 20):
    # The 10-unit configuration sweeps 2**20 down to 2**10 (steps of 4x);
    # the other unit counts only sample two sizes.
    if nb_unit == 10:
        tensor_sizes = [2 ** (n * 2) for n in range(10, 4, -1)]
    else:
        tensor_sizes = [2 ** 14, 2 ** 20]

    for D in tensor_sizes:
        print(f"{nb_unit} units, {D} elements")
        # Three components with 20/30/40 rows of width D.
        a = torch.randn(20, D, dtype=torch.float32, device=device)
        b = torch.randn(30, D, dtype=torch.float32, device=device)
        c = torch.randn(40, D, dtype=torch.float32, device=device)
        # Weights read by unit(): e maps D -> 256, d maps 256 -> D.
        d = torch.randn(D, 256, dtype=torch.float32, device=device)
        e = torch.randn(256, D, dtype=torch.float32, device=device)

        # Nested tensors built with jagged_from_list; the second and third
        # reuse the offsets returned for the first.
        nt, offsets = jagged_from_list([a, b, c], None)
        nt = nt.detach().requires_grad_(True)
        nt2, _ = jagged_from_list([a, b, c], offsets)
        nt2 = nt2.detach().requires_grad_(True)
        nt3, _ = jagged_from_list([a, b, c], offsets)

        # 1) torch.compile'd function.
        # NOTE(review): torch.compile is expected to only wrap fn1 here, with
        # the actual compilation triggered by the first call inside the
        # profiled loop -- confirm if compile time must appear in the trace.
        _profile_fwd_bwd(
            torch.compile(fn1, fullgraph=True, dynamic=True),
            nt,
            nt2,
            nt3,
            10,
            f"traces/nt_compile_trace_{nb_unit}_{D}.json",
        )

        # 2) Same inputs, uncompiled function.
        _profile_fwd_bwd(
            fn1,
            nt,
            nt2,
            nt3,
            10,
            f"traces/nt_no_compile_trace_{nb_unit}_{D}.json",
        )

        # 3) Nested tensors built with torch.nested.nested_tensor instead
        # (presumably the C++-backed implementation, going by the trace name).
        nt = torch.nested.nested_tensor([a, b, c], requires_grad=True)
        nt2 = torch.nested.nested_tensor([a, b, c], requires_grad=True)
        nt3 = torch.nested.nested_tensor([a, b, c], requires_grad=True)

        # NOTE(review): this variant records 20 iterations while the two
        # above record 10; kept as in the original -- confirm it is intended.
        _profile_fwd_bwd(
            fn1,
            nt,
            nt2,
            nt3,
            20,
            f"traces/nt_cpp_trace_{nb_unit}_{D}.json",
        )
	# NOTE(review): this block repeats the script above. Its nesting and its
	# `**` operators had been lost (`2*(n2)`, `214, 220`); both are restored
	# here from that first copy.
	import torch
	from torch.nested._internal.nested_tensor import NestedTensor, jagged_from_list
	from torch.profiler import profile, record_function, ProfilerActivity

	device = "cuda:5"

	for nb_unit in (10, 1, 2, 5, 20):
		lin = torch.nn.functional.linear

		def sin(x):
			"""Return ``x.sin().cos()``."""
			return x.sin().cos()

		def unit(x):
			"""Apply sin -> linear(e) -> sin -> linear(d); ``e``/``d`` are set in the size loop."""
			return lin(sin(lin(sin(x), e)), d)

		def fn1(nt1, nt2):
			"""Add the inputs and run the sum through ``nb_unit`` chained units."""
			out = nt1 + nt2
			for i in range(nb_unit):
				out = unit(out)
			return out

		# The 10-unit configuration sweeps 2**20 down to 2**10; the others
		# only sample two sizes.
		if nb_unit == 10:
			tensor_sizes = [2**(n*2) for n in range(10, 4, -1)]
		else:
			tensor_sizes = [2**14, 2**20]

		for D in tensor_sizes:
			print(f"{nb_unit} units, {D} elements")
			a = torch.randn(20, D, dtype=torch.float32, device=device)
			b = torch.randn(30, D, dtype=torch.float32, device=device)
			c = torch.randn(40, D, dtype=torch.float32, device=device)
			d = torch.randn(D, 256, dtype=torch.float32, device=device)
			e = torch.randn(256, D, dtype=torch.float32, device=device)

			# The second and third nested tensors reuse the offsets returned
			# for the first.
			nt, offsets = jagged_from_list([a, b, c], None)
			nt = nt.detach().requires_grad_(True)
			nt2, _ = jagged_from_list([a, b, c], offsets)
			nt2 = nt2.detach().requires_grad_(True)
			nt3, _ = jagged_from_list([a, b, c], offsets)
			lin = torch.nn.functional.linear

			# 1) torch.compile'd function.
			with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof:
				compiled_f = torch.compile(fn1, fullgraph=True, dynamic=True)
				for i in range(10):
					out = compiled_f(nt, nt2)
					ga, gb = torch.autograd.grad(out, grad_outputs=(nt3,), inputs=(nt, nt2))
					torch.cuda.synchronize(device=device)

			prof.export_chrome_trace(f"traces/nt_compile_trace_{nb_unit}_{D}.json")

			# 2) Same inputs, uncompiled function.
			with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof:
				for i in range(10):
					out_ref = fn1(nt, nt2)
					ga_ref, gb_ref = torch.autograd.grad(out_ref, grad_outputs=(nt3,), inputs=(nt, nt2))
					torch.cuda.synchronize(device=device)

			prof.export_chrome_trace(f"traces/nt_no_compile_trace_{nb_unit}_{D}.json")

			# 3) Nested tensors built with torch.nested.nested_tensor instead.
			nt = torch.nested.nested_tensor([a, b, c], requires_grad=True)
			nt2 = torch.nested.nested_tensor([a, b, c], requires_grad=True)
			nt3 = torch.nested.nested_tensor([a, b, c], requires_grad=True)

			with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof:
				for i in range(20):
					out = fn1(nt, nt2)
					ga, gb = torch.autograd.grad(out, grad_outputs=(nt3,), inputs=(nt, nt2))
					torch.cuda.synchronize(device=device)

			prof.export_chrome_trace(f"traces/nt_cpp_trace_{nb_unit}_{D}.json")