Skip to content

Instantly share code, notes, and snippets.

View jamesjwu's full-sized avatar

James Wu jamesjwu

  • United States
  • 06:09 (UTC -04:00)
View GitHub Profile
loading model: 0it [00:00, ?it/s]
loading model: 0it [00:03, ?it/s]
ERROR:common:Backend dynamo failed in warmup()
Traceback (most recent call last):
File "/data/users/jjwu/a/pytorch/benchmarks/dynamo/common.py", line 2561, in warmup
fn(model, example_inputs)
File "/data/users/jjwu/a/pytorch/torch/_dynamo/eval_frame.py", line 804, in compile_wrapper
return fn(*args, **kwargs)
File "/data/users/jjwu/a/pytorch/benchmarks/dynamo/torchbench.py", line 469, in forward_and_backward_pass
import time
import torch
import triton
from torch._dynamo.device_interface import get_interface_for_device
from torch._inductor.runtime.static_cuda_launcher import StaticallyLaunchedCudaKernel
import sys
import os
@jamesjwu
jamesjwu / gist:7fe2723f3803ded2c2e81ba072fd16da
Created June 9, 2025 21:12
Triton benchmark launch overhead
import time
import torch
import torch._inductor.config as config
from torch import Tensor
from torch._dynamo.device_interface import get_interface_for_device
from torch._inductor.runtime.static_cuda_launcher import StaticallyLaunchedCudaKernel
from torch._inductor.runtime.triton_compat import tl, triton
# Constants
{
"cache_state": "miss",
"cache_status_detailed": "guard_miss",
"cache_status_guard_expr": "L['t1']*L['t0'] < 2147483648 and L['t2']*L['t3'] < 2147483648 and 2 <= L['t0'] and 2 <= L['t1'] and 2 <= L['t2'] and 2 <= L['t3']",
"components": [
"[f5mdxkfzriesfvzu264hheibpxmhcat7d3an75ymc4edjb5bagj] aot_config: (0, True, False, False, False, [TensorPropertySource(base=LocalSource(local_name='x', is_input=True, dynamism=None, is_derefed_cell_contents=False), prop=<TensorProperty.SIZE: 0>, idx=0), TensorPropertySource(base=LocalSource(local_name='x', is_input=True, dynamism=None, is_derefed_cell_contents=False), prop=<TensorProperty.SIZE: 0>, idx=1), LocalSource(local_name='x', is_input=True, dynamism=None, is_derefed_cell_contents=False), TensorPropertySource(base=LocalSource(local_name='y', is_input=True, dynamism=None, is_derefed_cell_contents=False), prop=<TensorProperty.SIZE: 0>, idx=0), TensorPropertySource(base=LocalSource(local_name='y', is_input=True, dynamism=None, is_derefed_cell_contents=False
@jamesjwu
jamesjwu / gist:e8678af012a4f7bf3ccc5449e8725d5e
Created March 27, 2025 16:49
Runtime compilation metric example
{
"compilation_metrics_runtime": {
"compile_id": "0/0",
"frame_key": null,
"co_name": null,
"co_filename": null,
"co_firstlineno": null,
"cache_size": null,
"accumulated_cache_size": null,
"guard_count": null,
import torch
import time
import torch._inductor.config as config
from torch import Tensor
from torch._inductor.runtime.triton_compat import tl, triton
sp = time.time()
N = 100
template = """
def nop_kernel_{i}(arg0, arg1):
Triton compilation failed: triton_poi_fused_0
defb844cc147', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': False, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False},
min_elem_per_thread=0
)
@triton.jit
def triton_(in_ptr0, out_ptr0, ynumel, xnumel, YBLOCK : tl.constexpr, XBLOCK : tl.constexpr):
ynumel = 192
xnumel = 49
yoffset = tl.program_id(1) * (tl.program_id(2) + 1) * YBLOCK
yindex = yoffset + tl.arange(0, YBLOCK)[None, :]
# If not running interactively, don't do anything
case $- in
*i*) ;;
*) return;;
esac
# don't put duplicate lines or lines starting with space in the history.
# See bash(1) for more options
HISTCONTROL=ignoreboth
# If not running interactively, don't do anything
case $- in
*i*) ;;
*) return;;
esac
# don't put duplicate lines or lines starting with space in the history.
# See bash(1) for more options
HISTCONTROL=ignoreboth