Skip to content

Instantly share code, notes, and snippets.

(pytorch) [shunting@devgpu002.lla3 ~/ws/pytorch (loaf)]$ python benchmarks/dynamo/torchbench.py --ci --accuracy --timing --explain --export-aot-inductor --device cuda --inference --bfloat16 --only sam_fast
loading model: 0it [00:00, ?it/s]INFO:root:running build_ext
INFO:root:running build_ext
INFO:root:running build_ext
INFO:root:running build_ext
INFO:root:running build_ext
INFO:root:running build_ext
INFO:root:running build_ext
INFO:root:running build_ext
INFO:root:running build_ext
Thread 227 "pt_autograd_0" received signal SIGSEGV, Segmentation fault.
[Switching to LWP 1007598]
0x00007ffff7c8cee4 in pthread_mutex_lock@@GLIBC_2.2.5 () from /lib64/libc.so.6
(gdb) bt
#0 0x00007ffff7c8cee4 in pthread_mutex_lock@@GLIBC_2.2.5 () from /lib64/libc.so.6
#1 0x00007ffe1029189c in torch::autograd::ForwardGrad::clear() () from /home/shunting/ws/vision/torchvision/_C.so
#2 0x00007ffe102a6565 in torch::autograd::CppNode<vision::ops::(anonymous namespace)::ROIAlignFunction>::release_variables() ()
from /home/shunting/ws/vision/torchvision/_C.so
#3 0x00007fffedf68dd2 in torch::autograd::Engine::evaluate_function(std::shared_ptr<torch::autograd::GraphTask>&, torch::autograd::Node*, torch::autograd::InputBuffe
(pytorch) [shunting@devgpu005.nha1 ~/ws/pytorch (acc)]$ time python benchmarks/dynamo/torchbench.py --performance --training --amp --backend inductor --disable-cudagr
aphs --device cuda --only vision_maskrcnn
loading model: 0it [00:05, ?it/s]
cuda train vision_maskrcnn
Traceback (most recent call last):
File "/home/shunting/ws/pytorch/benchmarks/dynamo/common.py", line 2335, in validate_model
self.model_iter_fn(model, example_inputs)
File "/home/shunting/ws/pytorch/benchmarks/dynamo/torchbench.py", line 466, in forward_and_backward_pass
pred = mod(*cloned_inputs)
File "/home/shunting/ws/pytorch/torch/nn/modules/module.py", line 1716, in _wrapped_call_impl
2024-07-04T23:20:39.8000729Z loading model: 0it [00:00, ?it/s]WARNING:common:Model pyhpc_turbulent_kinetic_energy does not support bfloat16, running with amp instead
2024-07-04T23:20:39.9484088Z
2024-07-04T23:20:39.9485124Z loading model: 0it [00:01, ?it/s]
2024-07-04T23:20:39.9486440Z WARNING:common:Model pyhpc_turbulent_kinetic_energy does not support bfloat16, running with amp instead
2024-07-04T23:20:39.9487446Z cuda eval pyhpc_turbulent_kinetic_energy
2024-07-04T23:20:39.9743673Z WARNING:common:Model pyhpc_turbulent_kinetic_energy does not support bfloat16, running with amp instead
2024-07-04T23:21:01.2023259Z ERROR:common:
2024-07-04T23:21:01.2024509Z Traceback (most recent call last):
2024-07-04T23:21:01.2025803Z File "/var/lib/jenkins/workspace/benchmarks/dynamo/common.py", line 2642, in check_accuracy
2024-07-04T23:21:01.2027561Z new_result = optimized_model_iter_fn(model_copy, example_inputs)
(pytorch) [shunting@devgpu005.nha1 ~/ws/pytorch (acc-sebotnet33ts_256)]$ TORCHINDUCTOR_MAX_AUTOTUNE=1 time python benchmarks/dynamo/huggingface.py --accuracy --no-translation-validation --training --amp --backend inductor --device cuda --only LayoutLMForMaskedLM
loading model: 0it [00:06, ?it/s]
cuda train LayoutLMForMaskedLM
AUTOTUNE addmm(512x3072, 512x768, 768x3072)
triton_mm_130 0.0209 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, B_PROLOGUE_CAST_TYPE=None, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4
triton_mm_131 0.0217 ms 96.5% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, B_PROLOGUE_CAST_TYPE=None, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8
triton_mm_124 0.0227 ms 92.2% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=64, BLOCK_M=64, BLOCK_N=128, B_PROLOGUE_CAST_TYPE=None, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4
triton_mm_129 0.0240 ms 87.2% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=32, BLO
# AOT ID: ['0_forward']
from ctypes import c_void_p, c_long
import torch
import math
import random
import os
import tempfile
from math import inf, nan
from torch._inductor.hooks import run_intermediate_hooks
# AOT ID: ['0_forward']
from ctypes import c_void_p, c_long
import torch
import math
import random
import os
import tempfile
from math import inf, nan
from torch._inductor.hooks import run_intermediate_hooks
(pytorch) [shunting@devgpu005.nha1 ~/ws/pytorch (dash)]$ git log --oneline a448b3ae9537c0ae233fb9199a4a221fdffbb..0e6c204642a571d5a7cd60be0caeb9b50faca030 torch/_inductor/
ffc202a1b91 Added remove_noop_ops to joint_graph_passes (#124451)
8a45cf4c64c [AOTI] align data_size of the constants (#127610)
6e5c2a1a3bc [inductor] Add missing files to torch_key (#128230)
647815049ec Inductor: Allow small sizes of m for mixed mm autotuning (#127663)
ba81c3c2909 [inductor] add cpp builder code. (take 2) (#125849)
0a6df4fca67 delete inductor config.trace.compile_profile (#127143)
0c7f4353e50 [inductor] simplify indexing (#127661)
d9696ea6248 [AOTInductor] [Tooling] Update NaN and INF Checker for AOTInductor (#127574)
852b7b4c995 [inductor] Enable subprocess-based parallel compile as the default (#126817)
diff --git a/torch/_inductor/fx_passes/joint_graph.py b/torch/_inductor/fx_passes/joint_graph.py
index ad134decd22..1d7bf8dc203 100644
--- a/torch/_inductor/fx_passes/joint_graph.py
+++ b/torch/_inductor/fx_passes/joint_graph.py
@@ -320,7 +320,7 @@ def joint_graph_passes(graph: torch.fx.GraphModule):
from .post_grad import remove_noop_ops
- remove_noop_ops(graph.graph)
+ # remove_noop_ops(graph.graph)
(pytorch) [shunting@devgpu005.nha1 ~/ws/pytorch (dash)]$ time python benchmarks/dynamo/torchbench.py --accuracy --inference --bfloat16 --backend inductor --device cuda --only moco --disable-cudagraphs
loading model: 0it [00:00, ?it/s]NCCL version 2.18.5+cuda12.0
loading model: 0it [00:04, ?it/s]
cuda eval moco
[rank0]:W0701 22:41:14.175000 139849568687104 torch/_logging/_internal.py:1040] [0/0] Profiler function <class 'torch.autograd.profiler.record_function'> will be ignored
ERROR:common:
Traceback (most recent call last):
File "/home/shunting/ws/pytorch/benchmarks/dynamo/common.py", line 2636, in check_accuracy
new_result = optimized_model_iter_fn(model_copy, example_inputs)
File "/home/shunting/ws/pytorch/torch/_dynamo/eval_frame.py", line 434, in _fn