Michael Lazos mlazos

## gist:aa4e0515eba84a2b9c5792a8632a9aeb
`--> python benchmarks/dynamo/torchbench.py --accuracy --float32 -d cpu -n10 --training --only resnet50_quantized_qat --cold-start-latency --backend=aot_eager
loading model: 0it [00:13, ?it/s]
cpu  train resnet50_quantized_qat
WARNING:common:fp64 golden ref were not generated for resnet50_quantized_qat. Setting accuracy check to cosine
WARNING:common:Trying to call the empty_gpu_cache for device: cpu, which is not in list [cuda, xpu]
WARNING:common:Trying to call the empty_gpu_cache for device: cpu, which is not in list [cuda, xpu]
WARNING:common:Trying to call the empty_gpu_cache for device: cpu, which is not in list [cuda, xpu]
ERROR:common:
Traceback (most recent call last):
  File "/data/users/mlazos/pytorch/benchmarks/dynamo/common.py", line 2326, in check_accuracy

## gist:c4961a8bed1d0185f8400445800c434d
linear_compiled = torch.compile(linear, backend="inductor")
        with config.patch(
            {
                "max_autotune": True,
                "max_autotune_gemm_backends": "TRITON",
                "cuda.cutlass_max_profiling_configs": 2,
                "fx_graph_cache": False,
                "fx_graph_remote_cache": False,
                "cuda.enable_caching_codegen": True,
            }

## gist:43d2c5952cf7aac827d4665e29df2c3b
def test_tuple_return(self):
        def inner_fn(x, y):
            x0 = x + x + 1
            y0 = y + y + 1
            return x0, y0

        def fn(x0, x1, x2, y0, y1, y2):
            x0 = inner_fn(x0, y0)
            x1 = inner_fn(x1, y1)
            x2 = inner_fn(x2, y2)

## gist:7286727e6e987ad3e2586cef19889c98
import torch
import triton
import triton.language as tl
import helion
import helion.language as hl

@helion.kernel
def layer_norm_fwd(
    x: torch.Tensor,
    nomralized_shape: list[int],

## gist:2a2bea537e728eebee84ec76951dd65f
    best = self._autotune()
           ^^^^^^^^^^^^^^^^
  File "/data/users/mlazos/helion/helion/autotuner/differential_evolution.py", line 97, in _autotune
    self.initial_two_generations()
  File "/data/users/mlazos/helion/helion/autotuner/differential_evolution.py", line 59, in initial_two_generations
    self.parallel_benchmark_flat(
  File "/data/users/mlazos/helion/helion/autotuner/base_search.py", line 359, in parallel_benchmark_flat
    to_check, configs, self.parallel_benchmark(configs), strict=True
                       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/data/users/mlazos/helion/helion/autotuner/base_search.py", line 227, in parallel_benchmark

## gist:2ab1816e7ae55218601508af7d19a928
import torch
import triton
import triton.language as tl
import helion
import helion.language as hl

@helion.kernel
def layer_norm_fwd(
    x: torch.Tensor,
    nomralized_shape: list[int],

## gist:2d90b18d3eeb03a3a2e31c45de2ce7a5
`--> python helion_welford.py
Traceback (most recent call last):
  File "/data/users/mlazos/test/helion_welford.py", line 53, in <module>
    print(layer_norm_fwd(torch.ones(2, 2, device="cuda"), [2], torch.ones(2, device="cuda"), torch.ones(2, device="cuda")))
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/data/users/mlazos/helion/helion/runtime/kernel.py", line 272, in __call__
    return self.bind(args)(*args)
           ^^^^^^^^^^^^^^^
  File "/data/users/mlazos/helion/helion/runtime/kernel.py", line 156, in bind
    bound_kernel = self.bind(normalized_args)

## gist:f168e0307013902e94dc5689d4f5659b
import torch
from torch import tensor, device
import torch.fx as fx
from torch._dynamo.testing import rand_strided
from math import inf
import torch._inductor.inductor_prims

torch.backends.cuda.matmul.fp32_precision = 'tf32'
torch.backends.cuda.matmul.allow_tf32 = True

## sym_error_helion
import torch
import triton
import triton.language as tl
import helion
import helion.language as hl

@helion.kernel
def layer_norm_fwd(
    x: torch.Tensor,
    nomralized_shape: list[int],

## gist:ee082bbe608ac7a6d885d91388830a58
`--> python benchmarks/dynamo/torchbench.py --accuracy --float32 -d cpu -n10 --training --only resnet50_quantized_qat --cold-start-latency --backend=inductor
loading model: 0it [00:22, ?it/s]
cpu  train resnet50_quantized_qat
WARNING:common:fp64 golden ref were not generated for resnet50_quantized_qat. Setting accuracy check to cosine
WARNING:common:Trying to call the empty_gpu_cache for device: cpu, which is not in list [cuda, xpu]
WARNING:common:Trying to call the empty_gpu_cache for device: cpu, which is not in list [cuda, xpu]
WARNING:common:Trying to call the empty_gpu_cache for device: cpu, which is not in list [cuda, xpu]
ERROR:common:
Traceback (most recent call last):
  File "/data/users/mlazos/pytorch/benchmarks/dynamo/common.py", line 2236, in check_accuracy
	`--> python benchmarks/dynamo/torchbench.py --accuracy --float32 -d cpu -n10 --training --only resnet50_quantized_qat --cold-start-latency --backend=aot_eager
	loading model: 0it [00:13, ?it/s]
	cpu train resnet50_quantized_qat
	WARNING:common:fp64 golden ref were not generated for resnet50_quantized_qat. Setting accuracy check to cosine
	WARNING:common:Trying to call the empty_gpu_cache for device: cpu, which is not in list [cuda, xpu]
	WARNING:common:Trying to call the empty_gpu_cache for device: cpu, which is not in list [cuda, xpu]
	WARNING:common:Trying to call the empty_gpu_cache for device: cpu, which is not in list [cuda, xpu]
	ERROR:common:
	Traceback (most recent call last):
	File "/data/users/mlazos/pytorch/benchmarks/dynamo/common.py", line 2326, in check_accuracy
	linear_compiled = torch.compile(linear, backend="inductor")
	with config.patch(
	{
	"max_autotune": True,
	"max_autotune_gemm_backends": "TRITON",
	"cuda.cutlass_max_profiling_configs": 2,
	"fx_graph_cache": False,
	"fx_graph_remote_cache": False,
	"cuda.enable_caching_codegen": True,
	}
	def test_tuple_return(self):
	def inner_fn(x, y):
	x0 = x + x + 1
	y0 = y + y + 1
	return x0, y0

	def fn(x0, x1, x2, y0, y1, y2):
	x0 = inner_fn(x0, y0)
	x1 = inner_fn(x1, y1)
	x2 = inner_fn(x2, y2)
	import torch
	import triton
	import triton.language as tl
	import helion
	import helion.language as hl

	@helion.kernel
	def layer_norm_fwd(
	x: torch.Tensor,
	nomralized_shape: list[int],
	best = self._autotune()
	^^^^^^^^^^^^^^^^
	File "/data/users/mlazos/helion/helion/autotuner/differential_evolution.py", line 97, in _autotune
	self.initial_two_generations()
	File "/data/users/mlazos/helion/helion/autotuner/differential_evolution.py", line 59, in initial_two_generations
	self.parallel_benchmark_flat(
	File "/data/users/mlazos/helion/helion/autotuner/base_search.py", line 359, in parallel_benchmark_flat
	to_check, configs, self.parallel_benchmark(configs), strict=True
	^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
	File "/data/users/mlazos/helion/helion/autotuner/base_search.py", line 227, in parallel_benchmark
	`--> python helion_welford.py
	Traceback (most recent call last):
	File "/data/users/mlazos/test/helion_welford.py", line 53, in <module>
	print(layer_norm_fwd(torch.ones(2, 2, device="cuda"), [2], torch.ones(2, device="cuda"), torch.ones(2, device="cuda")))
	^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
	File "/data/users/mlazos/helion/helion/runtime/kernel.py", line 272, in __call__
	return self.bind(args)(*args)
	^^^^^^^^^^^^^^^
	File "/data/users/mlazos/helion/helion/runtime/kernel.py", line 156, in bind
	bound_kernel = self.bind(normalized_args)
	import torch
	from torch import tensor, device
	import torch.fx as fx
	from torch._dynamo.testing import rand_strided
	from math import inf
	import torch._inductor.inductor_prims

	torch.backends.cuda.matmul.fp32_precision = 'tf32'
	torch.backends.cuda.matmul.allow_tf32 = True
	`--> python benchmarks/dynamo/torchbench.py --accuracy --float32 -d cpu -n10 --training --only resnet50_quantized_qat --cold-start-latency --backend=inductor
	loading model: 0it [00:22, ?it/s]
	cpu train resnet50_quantized_qat
	WARNING:common:fp64 golden ref were not generated for resnet50_quantized_qat. Setting accuracy check to cosine
	WARNING:common:Trying to call the empty_gpu_cache for device: cpu, which is not in list [cuda, xpu]
	WARNING:common:Trying to call the empty_gpu_cache for device: cpu, which is not in list [cuda, xpu]
	WARNING:common:Trying to call the empty_gpu_cache for device: cpu, which is not in list [cuda, xpu]
	ERROR:common:
	Traceback (most recent call last):
	File "/data/users/mlazos/pytorch/benchmarks/dynamo/common.py", line 2236, in check_accuracy