Skip to content

Instantly share code, notes, and snippets.

`--> python benchmarks/dynamo/torchbench.py --accuracy --float32 -d cpu -n10 --training --only resnet50_quantized_qat --cold-start-latency --backend=aot_eager
loading model: 0it [00:13, ?it/s]
cpu train resnet50_quantized_qat
WARNING:common:fp64 golden ref were not generated for resnet50_quantized_qat. Setting accuracy check to cosine
WARNING:common:Trying to call the empty_gpu_cache for device: cpu, which is not in list [cuda, xpu]
WARNING:common:Trying to call the empty_gpu_cache for device: cpu, which is not in list [cuda, xpu]
WARNING:common:Trying to call the empty_gpu_cache for device: cpu, which is not in list [cuda, xpu]
ERROR:common:
Traceback (most recent call last):
File "/data/users/mlazos/pytorch/benchmarks/dynamo/common.py", line 2326, in check_accuracy
linear_compiled = torch.compile(linear, backend="inductor")
with config.patch(
{
"max_autotune": True,
"max_autotune_gemm_backends": "TRITON",
"cuda.cutlass_max_profiling_configs": 2,
"fx_graph_cache": False,
"fx_graph_remote_cache": False,
"cuda.enable_caching_codegen": True,
}
def test_tuple_return(self):
def inner_fn(x, y):
x0 = x + x + 1
y0 = y + y + 1
return x0, y0
def fn(x0, x1, x2, y0, y1, y2):
x0 = inner_fn(x0, y0)
x1 = inner_fn(x1, y1)
x2 = inner_fn(x2, y2)
import torch
import triton
import triton.language as tl
import helion
import helion.language as hl
@helion.kernel
def layer_norm_fwd(
x: torch.Tensor,
nomralized_shape: list[int],
best = self._autotune()
^^^^^^^^^^^^^^^^
File "/data/users/mlazos/helion/helion/autotuner/differential_evolution.py", line 97, in _autotune
self.initial_two_generations()
File "/data/users/mlazos/helion/helion/autotuner/differential_evolution.py", line 59, in initial_two_generations
self.parallel_benchmark_flat(
File "/data/users/mlazos/helion/helion/autotuner/base_search.py", line 359, in parallel_benchmark_flat
to_check, configs, self.parallel_benchmark(configs), strict=True
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/data/users/mlazos/helion/helion/autotuner/base_search.py", line 227, in parallel_benchmark
import torch
import triton
import triton.language as tl
import helion
import helion.language as hl
@helion.kernel
def layer_norm_fwd(
x: torch.Tensor,
nomralized_shape: list[int],
`--> python helion_welford.py
Traceback (most recent call last):
File "/data/users/mlazos/test/helion_welford.py", line 53, in <module>
print(layer_norm_fwd(torch.ones(2, 2, device="cuda"), [2], torch.ones(2, device="cuda"), torch.ones(2, device="cuda")))
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/data/users/mlazos/helion/helion/runtime/kernel.py", line 272, in __call__
return self.bind(args)(*args)
^^^^^^^^^^^^^^^
File "/data/users/mlazos/helion/helion/runtime/kernel.py", line 156, in bind
bound_kernel = self.bind(normalized_args)
import torch
from torch import tensor, device
import torch.fx as fx
from torch._dynamo.testing import rand_strided
from math import inf
import torch._inductor.inductor_prims
torch.backends.cuda.matmul.fp32_precision = 'tf32'
torch.backends.cuda.matmul.allow_tf32 = True
import torch
import triton
import triton.language as tl
import helion
import helion.language as hl
@helion.kernel
def layer_norm_fwd(
x: torch.Tensor,
nomralized_shape: list[int],
`--> python benchmarks/dynamo/torchbench.py --accuracy --float32 -d cpu -n10 --training --only resnet50_quantized_qat --cold-start-latency --backend=inductor
loading model: 0it [00:22, ?it/s]
cpu train resnet50_quantized_qat
WARNING:common:fp64 golden ref were not generated for resnet50_quantized_qat. Setting accuracy check to cosine
WARNING:common:Trying to call the empty_gpu_cache for device: cpu, which is not in list [cuda, xpu]
WARNING:common:Trying to call the empty_gpu_cache for device: cpu, which is not in list [cuda, xpu]
WARNING:common:Trying to call the empty_gpu_cache for device: cpu, which is not in list [cuda, xpu]
ERROR:common:
Traceback (most recent call last):
File "/data/users/mlazos/pytorch/benchmarks/dynamo/common.py", line 2236, in check_accuracy