class GraphModule(torch.nn.Module):
    def forward(self, primals_3: "f32[768]", primals_9: "f32[768]", primals_15: "f32[768]", primals_21: "f32[768]", primals_27: "f32[768]", primals_33: "f32[768]", primals_39: "f32[768]", primals_45: "f32[768]", primals_51: "f32[768]", primals_57: "f32[768]", primals_63: "f32[768]", primals_69: "f32[768]", primals_75: "f32[768]", primals_81: "f32[768]", primals_87: "f32[768]", primals_93: "f32[768]", primals_99: "f32[768]", primals_105: "f32[768]", primals_111: "f32[768]", primals_117: "f32[768]", primals_123: "f32[768]", primals_129: "f32[768]", primals_135: "f32[768]", primals_141: "f32[768]", primals_147: "f32[768]", primals_150: "i64[32, 1024]", primals_151: "i64[32, 1024]", iota: "i64[1024]", embedding: "f32[32, 1024, 768]", embedding_1: "f32[1024, 768]", getitem_1: "f32[32, 1024, 1]", rsqrt: "f32[32, 1024, 1]", view: "bf16[32768, 768]", permute_1: "bf16[32, 12, 1024, 64]", permute_2: "bf16[32, 12, 1024, 64]", permute_3: "bf16[32, 12, 1024, 64]", getitem_5: "bf16[32, 1
import torch

aten = torch.ops.aten
prims = torch.ops.prims

def fuse_scatter_upon_allzero(graph):
    # Stubbed out for now: the early return disables the pass below.
    return  # TODO
    for cur_node in graph.nodes:
        if cur_node.op != "call_function":
            continue
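As a hedged illustration only (not the actual Inductor pass), the detection half of such a rewrite could scan the FX graph for scatter ops whose destination tensor comes from an all-zeros aten.full; the helper name below is hypothetical.

# Hypothetical sketch: find aten.scatter calls whose destination is an
# all-zeros aten.full, so the scatter could be expressed without first
# materializing and then overwriting the zero tensor.
def find_scatter_upon_allzero(graph):
    matches = []
    for cur_node in graph.nodes:
        if cur_node.op != "call_function":
            continue
        if cur_node.target not in (aten.scatter.value, aten.scatter.src):
            continue
        dst = cur_node.args[0]
        if (
            getattr(dst, "op", None) == "call_function"
            and dst.target == aten.full.default
            and len(dst.args) >= 2
            and dst.args[1] == 0
        ):
            matches.append(cur_node)
    return matches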
import torch
from triton.testing import do_bench
import torch._inductor.config as inductor_config
from torch import nn
import copy
inductor_config.benchmark_kernel = True
inductor_config.triton.unique_kernel_names = True
torch.set_default_device("cuda")
class GraphModule(torch.nn.Module):
    def forward(self, primals_4: "i64[32, 1024]", view: "bf16[32768, 768]", addmm_default: "bf16[32768, 50264]", amax: "f32[32768, 1]", log: "f32[32768, 1]", convert_element_type_5: "bf16[]", tangents_1: "bf16[]"):
        # File: /home/shunting/ws/pytorch/t.py:29 in f, code: ce(model(x).view(-1, V), label.view(-1)).backward()
        div_1: "bf16[]" = torch.ops.aten.div.Tensor(tangents_1, convert_element_type_5); tangents_1 = convert_element_type_5 = None
        view_3: "i64[32768]" = torch.ops.aten.reshape.default(primals_4, [-1]); primals_4 = None
        unsqueeze_1: "i64[32768, 1]" = torch.ops.aten.unsqueeze.default(view_3, 1); view_3 = None
        ne_3: "b8[32768, 1]" = torch.ops.aten.ne.Scalar(unsqueeze_1, -100)
        full_default: "i64[]" = torch.ops.aten.full.default([], 0, dtype = torch.int64, layout = torch.strided, device = device(type='cuda', index=0), pin_memory = False)
        where_2: "i64[32768, 1]" = torch.ops.aten.where.self(ne_3, unsqueeze_1, fu
import torch
from triton.testing import do_bench
import torch._inductor.config as inductor_config
from torch import nn
import copy
inductor_config.benchmark_kernel = True
inductor_config.triton.unique_kernel_names = True
torch.set_default_device("cuda")
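For reference, a minimal, hedged harness using the settings above (the bench helper is illustrative, not from the original file): it times an eager module against a torch.compile'd copy with triton's do_bench, which returns a time in milliseconds.

# Illustrative only: compare eager vs. compiled runtime with do_bench.
def bench(model, inp, label=""):
    compiled = torch.compile(copy.deepcopy(model))
    eager_ms = do_bench(lambda: model(inp))
    compiled_ms = do_bench(lambda: compiled(inp))
    print(f"{label}: eager {eager_ms:.3f} ms, compiled {compiled_ms:.3f} ms")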
2024-06-18T08:49:53.7630888Z cuda train mobilenetv3_large_100
2024-06-18T08:50:54.3288012Z W0618 08:50:54.326000 139762820854976 torch/_logging/_internal.py:1034] [6/0] Profiler function <class 'torch.autograd.profiler.record_function'> will be ignored
2024-06-18T08:52:18.3734367Z E0618 08:52:18.370000 139762820854976 torch/_dynamo/utils.py:1478] RMSE (res-fp64): 0.22641, (ref-fp64): 0.05045 and shape=torch.Size([]). res.dtype: torch.float32, multiplier: 3.000000, tol: 0.040000
2024-06-18T08:52:18.3752389Z fail_accuracy
2024-06-18T08:30:36.4229101Z W0618 08:30:36.422000 139897809662592 torch/_logging/_internal.py:1034] [6/0] Profiler function <class 'torch.autograd.profiler.record_function'> will be ignored
2024-06-18T08:31:24.7323180Z E0618 08:31:24.731000 139897809662592 torch/_dynamo/utils.py:1478] RMSE (res-fp64): 0.02052, (ref-fp64): 0.00794 and shape=torch.Size([1000, 2048]). res.dtype: torch.float32, multiplier: 2.000000, tol: 0.010000
2024-06-18T08:31:24.7326002Z E0618 08:31:24.731000 139897809662592 torch/_dynamo/utils.py:1392] Accuracy failed for key name head.fc.weight
2024-06-18T08:45:31.1992993Z W0618 08:45:31.198000 140240420991616 torch/_logging/_internal.py:1034] [6/0] Profiler function <class 'torch.autograd.profiler.record_function'> will be ignored
2024-06-18T08:46:16.4532513Z E0618 08:46:16.452000 140240420991616 torch/_dynamo/utils.py:1478] RMSE (res-fp64): 0.03031, (ref-fp64): 0.00766 and shape=torch.Size([1000, 2048]). res.dtype: torch.float32, multiplier: 2.000000, tol: 0.010000
2024-06-18T08:46:16.4534101Z E0618 08:46:16.452000 140240420991616 torch/_dynamo/utils.py:1392] Accuracy failed for key name head.fc.weight
import torch
from triton.testing import do_bench
from torch import nn
import math
torch.set_default_device("cuda")
V = 2048
hidden_size = 64
max_seqlen = 512
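The snippet is cut off here; as a hedged sketch of how V, hidden_size, and max_seqlen are typically exercised, mirroring the ce(model(x).view(-1, V), label.view(-1)).backward() line quoted in the graph dump above (the batch size, proj module, and step function below are assumptions, not the original code):

# Assumed usage sketch: time one fwd+bwd of a projection-to-vocab +
# cross-entropy step at the shapes the constants above describe.
B = 32  # assumed batch size
x = torch.randn(B, max_seqlen, hidden_size, dtype=torch.bfloat16)
label = torch.randint(0, V, (B, max_seqlen))
proj = nn.Linear(hidden_size, V, dtype=torch.bfloat16)
ce = nn.CrossEntropyLoss()

def step():
    loss = ce(proj(x).view(-1, V), label.view(-1))
    loss.backward()

print(f"fwd+bwd: {do_bench(step):.3f} ms")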
import torch
from triton.testing import do_bench
from torch import nn
import math
torch.set_default_device("cuda")
V = 30522
hidden_size = 768