Created
April 10, 2023 20:19
-
-
Save shunting314/c2a4d8a28b00fcb5586d0e9d9bf77f9f to your computer and use it in GitHub Desktop.
This file has been truncated, but you can view the full file.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| from ctypes import c_void_p, c_long | |
| import torch | |
| import math | |
| import random | |
| import os | |
| import tempfile | |
| from torch._inductor.utils import maybe_profile | |
| from torch import empty_strided, as_strided, device | |
| from torch._inductor.codecache import AsyncCompile | |
| from torch._inductor.select_algorithm import extern_kernels | |
| aten = torch.ops.aten | |
| assert_size_stride = torch._C._dynamo.guards.assert_size_stride | |
| async_compile = AsyncCompile() | |
| import triton | |
| import triton.language as tl | |
| from torch._inductor.triton_heuristics import grid, start_graph, end_graph | |
| from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
| # kernel path: /tmp/torchinductor_shunting/cd/ccd7accawcy3jndrehd4geosuzoh6rxkzjlqnojxb5msgrcv7udi.py | |
| # Original ATen: aten._to_copy | |
| # aten._to_copy => convert_element_type | |
| triton_poi_fused__to_copy_0 = async_compile.triton(''' | |
| import triton | |
| import triton.language as tl | |
| from torch._inductor.ir import ReductionHint | |
| from torch._inductor.ir import TileHint | |
| from torch._inductor.triton_heuristics import pointwise | |
| from torch._inductor.utils import instance_descriptor | |
| from torch._dynamo.testing import rand_strided | |
| from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
| import torch | |
| from torch._inductor.triton_heuristics import grid | |
| @pointwise(size_hints=[1024], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]}) | |
| @triton.jit | |
| def triton_poi_fused__to_copy_0(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr): | |
| xnumel = 864 | |
| xoffset = tl.program_id(0) * XBLOCK | |
| xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
| xmask = xindex < xnumel | |
| x0 = xindex | |
| tmp0 = tl.load(in_ptr0 + (x0), xmask) | |
| tmp1 = tmp0.to(tl.float32) | |
| tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask) | |
| def get_args(): | |
| arg_0 = rand_strided((32, 3, 3, 3), (27, 9, 3, 1), device='cuda:0', dtype=torch.float32) | |
| arg_1 = rand_strided((32, 3, 3, 3), (27, 9, 3, 1), device='cuda:0', dtype=torch.float16) | |
| return arg_0, arg_1, | |
| def call(args): | |
| with torch.cuda._DeviceGuard(0): | |
| torch.cuda.set_device(0) | |
| stream0 = get_cuda_stream(0) | |
| triton_poi_fused__to_copy_0.run(*args, 864, grid=grid(864), stream=stream0) | |
| def benchmark_all_configs(args): | |
| with torch.cuda._DeviceGuard(0): | |
| torch.cuda.set_device(0) | |
| return triton_poi_fused__to_copy_0.benchmark_all_configs(*args, 864, grid=grid(864)) | |
| if __name__ == '__main__': | |
| from torch._inductor.utils import get_num_bytes | |
| from triton.testing import do_bench | |
| args = get_args() | |
| ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
| num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9 | |
| gb_per_s = num_gb / (ms / 1e3) | |
| print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
| ''') | |
| # kernel path: /tmp/torchinductor_shunting/4z/c4zfgpaaduweifmtujob2ufcjjkrkqa7mb5j2qaaysmstief35xb.py | |
| # Original ATen: aten._to_copy | |
| # aten._to_copy => convert_element_type_1 | |
| triton_poi_fused__to_copy_1 = async_compile.triton(''' | |
| import triton | |
| import triton.language as tl | |
| from torch._inductor.ir import ReductionHint | |
| from torch._inductor.ir import TileHint | |
| from torch._inductor.triton_heuristics import pointwise | |
| from torch._inductor.utils import instance_descriptor | |
| from torch._dynamo.testing import rand_strided | |
| from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
| import torch | |
| from torch._inductor.triton_heuristics import grid | |
| @pointwise(size_hints=[33554432], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]}) | |
| @triton.jit | |
| def triton_poi_fused__to_copy_1(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr): | |
| xnumel = 19267584 | |
| xoffset = tl.program_id(0) * XBLOCK | |
| xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
| xmask = xindex < xnumel | |
| x0 = xindex | |
| tmp0 = tl.load(in_ptr0 + (x0), None) | |
| tmp1 = tmp0.to(tl.float32) | |
| tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, None) | |
| def get_args(): | |
| arg_0 = rand_strided((128, 3, 224, 224), (150528, 50176, 224, 1), device='cuda:0', dtype=torch.float32) | |
| arg_1 = rand_strided((128, 3, 224, 224), (150528, 50176, 224, 1), device='cuda:0', dtype=torch.float16) | |
| return arg_0, arg_1, | |
| def call(args): | |
| with torch.cuda._DeviceGuard(0): | |
| torch.cuda.set_device(0) | |
| stream0 = get_cuda_stream(0) | |
| triton_poi_fused__to_copy_1.run(*args, 19267584, grid=grid(19267584), stream=stream0) | |
| def benchmark_all_configs(args): | |
| with torch.cuda._DeviceGuard(0): | |
| torch.cuda.set_device(0) | |
| return triton_poi_fused__to_copy_1.benchmark_all_configs(*args, 19267584, grid=grid(19267584)) | |
| if __name__ == '__main__': | |
| from torch._inductor.utils import get_num_bytes | |
| from triton.testing import do_bench | |
| args = get_args() | |
| ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
| num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9 | |
| gb_per_s = num_gb / (ms / 1e3) | |
| print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
| ''') | |
| # kernel path: /tmp/torchinductor_shunting/4v/c4vmg7ssybcn2qvzgu3tqbkt62uopduyelvkcwxaqv2fc4glkb57.py | |
| # Original ATen: aten._native_batch_norm_legit_functional | |
| # aten._native_batch_norm_legit_functional => convert_element_type_2, var_mean | |
| triton_red_fused__native_batch_norm_legit_functional_2 = async_compile.triton(''' | |
| import triton | |
| import triton.language as tl | |
| from torch._inductor.ir import ReductionHint | |
| from torch._inductor.ir import TileHint | |
| from torch._inductor.triton_heuristics import reduction | |
| from torch._inductor.utils import instance_descriptor | |
| from torch._dynamo.testing import rand_strided | |
| from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
| import torch | |
| from torch._inductor.triton_heuristics import grid | |
| @reduction( | |
| size_hints=[512, 131072], | |
| reduction_hint=ReductionHint.INNER, | |
| filename=__file__, | |
| meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32', 3: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3), equal_to_1=())]} | |
| ) | |
| @triton.jit | |
| def triton_red_fused__native_batch_norm_legit_functional_2(in_ptr0, out_ptr0, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr): | |
| xnumel = 448 | |
| rnumel = 114688 | |
| xoffset = tl.program_id(0) * XBLOCK | |
| xindex = xoffset + tl.arange(0, XBLOCK)[:, None] | |
| xmask = xindex < xnumel | |
| rbase = tl.arange(0, RBLOCK)[None, :] | |
| x0 = xindex % 32 | |
| x1 = (xindex // 32) | |
| _tmp2 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0 | |
| x3 = xindex | |
| for roffset in range(0, rnumel, RBLOCK): | |
| rindex = roffset + rbase | |
| rmask = rindex < rnumel | |
| r2 = rindex | |
| tmp0 = tl.load(in_ptr0 + ((112*(((r2 + (114688*x1)) // 112) % 112)) + (12544*x0) + (401408*((r2 + (114688*x1)) // 12544)) + (r2 % 112)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32) | |
| tmp1 = tmp0.to(tl.float32) | |
| _tmp2 = tl.where(rmask & xmask, _tmp2 + tmp1, _tmp2) | |
| tmp2 = tl.sum(_tmp2, 1)[:, None] | |
| tl.store(out_ptr0 + x3, tmp2, xmask) | |
| def get_args(): | |
| arg_0 = rand_strided((128, 32, 112, 112), (401408, 12544, 112, 1), device='cuda:0', dtype=torch.float16) | |
| arg_1 = rand_strided((1, 32, 1, 1, 14), (448, 1, 448, 448, 32), device='cuda:0', dtype=torch.float32) | |
| return arg_0, arg_1, | |
| def call(args): | |
| with torch.cuda._DeviceGuard(0): | |
| torch.cuda.set_device(0) | |
| stream0 = get_cuda_stream(0) | |
| triton_red_fused__native_batch_norm_legit_functional_2.run(*args, 448, 114688, grid=grid(448), stream=stream0) | |
| def benchmark_all_configs(args): | |
| with torch.cuda._DeviceGuard(0): | |
| torch.cuda.set_device(0) | |
| return triton_red_fused__native_batch_norm_legit_functional_2.benchmark_all_configs(*args, 448, 114688, grid=grid(448)) | |
| if __name__ == '__main__': | |
| from torch._inductor.utils import get_num_bytes | |
| from triton.testing import do_bench | |
| args = get_args() | |
| ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
| num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9 | |
| gb_per_s = num_gb / (ms / 1e3) | |
| print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
| ''') | |
| # kernel path: /tmp/torchinductor_shunting/ax/caxxuenolffrdcs4fes5eiayfgrbnuyzkml277uyriuhejqsq7k4.py | |
| # Original ATen: aten._native_batch_norm_legit_functional | |
| # aten._native_batch_norm_legit_functional => add_2, convert_element_type_2, mul_1, mul_2, var_mean | |
| triton_per_fused__native_batch_norm_legit_functional_3 = async_compile.triton(''' | |
| import triton | |
| import triton.language as tl | |
| from torch._inductor.ir import ReductionHint | |
| from torch._inductor.ir import TileHint | |
| from torch._inductor.triton_heuristics import persistent_reduction | |
| from torch._inductor.utils import instance_descriptor | |
| from torch._dynamo.testing import rand_strided | |
| from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
| import torch | |
| from torch._inductor.triton_heuristics import grid | |
| @persistent_reduction( | |
| size_hints=[32, 16], | |
| reduction_hint=ReductionHint.INNER, | |
| filename=__file__, | |
| meta={'signature': {0: '*fp32', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: 'i32', 5: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4), equal_to_1=())]} | |
| ) | |
| @triton.jit | |
| def triton_per_fused__native_batch_norm_legit_functional_3(in_out_ptr0, in_ptr0, in_ptr1, out_ptr0, xnumel, rnumel, XBLOCK : tl.constexpr): | |
| xnumel = 32 | |
| rnumel = 14 | |
| RBLOCK: tl.constexpr = 16 | |
| xoffset = tl.program_id(0) * XBLOCK | |
| xindex = xoffset + tl.arange(0, XBLOCK)[:, None] | |
| xmask = xindex < xnumel | |
| rindex = tl.arange(0, RBLOCK)[None, :] | |
| rmask = rindex < rnumel | |
| r1 = rindex | |
| x0 = xindex | |
| tmp0 = tl.load(in_ptr0 + (x0 + (32*r1)), rmask & xmask, other=0) | |
| tmp8 = tl.load(in_ptr1 + (x0), xmask) | |
| tmp2 = tl.where(rmask & xmask, tmp0, 0) | |
| tmp3 = tl.sum(tmp2, 1)[:, None] | |
| tmp4 = 1605632.0 | |
| tmp5 = tmp3 / tmp4 | |
| tmp6 = 0.1 | |
| tmp7 = tmp5 * tmp6 | |
| tmp9 = 0.9 | |
| tmp10 = tmp8 * tmp9 | |
| tmp11 = tmp7 + tmp10 | |
| tl.store(in_out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp5, xmask) | |
| tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp11, xmask) | |
| def get_args(): | |
| arg_0 = rand_strided((1, 32, 1, 1), (32, 1, 32, 32), device='cuda:0', dtype=torch.float32) | |
| arg_1 = rand_strided((1, 32, 1, 1, 14), (448, 1, 448, 448, 32), device='cuda:0', dtype=torch.float32) | |
| arg_2 = rand_strided((32,), (1,), device='cuda:0', dtype=torch.float32) | |
| arg_3 = rand_strided((32,), (1,), device='cuda:0', dtype=torch.float32) | |
| return arg_0, arg_1, arg_2, arg_3, | |
| def call(args): | |
| with torch.cuda._DeviceGuard(0): | |
| torch.cuda.set_device(0) | |
| stream0 = get_cuda_stream(0) | |
| triton_per_fused__native_batch_norm_legit_functional_3.run(*args, 32, 14, grid=grid(32), stream=stream0) | |
| def benchmark_all_configs(args): | |
| with torch.cuda._DeviceGuard(0): | |
| torch.cuda.set_device(0) | |
| return triton_per_fused__native_batch_norm_legit_functional_3.benchmark_all_configs(*args, 32, 14, grid=grid(32)) | |
| if __name__ == '__main__': | |
| from torch._inductor.utils import get_num_bytes | |
| from triton.testing import do_bench | |
| args = get_args() | |
| ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
| num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9 | |
| gb_per_s = num_gb / (ms / 1e3) | |
| print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
| ''') | |
| # kernel path: /tmp/torchinductor_shunting/4s/c4sugoluux54xogsh3dgakggor26tpyllwcenrcu3hdxyyol5rhg.py | |
| # Original ATen: aten._native_batch_norm_legit_functional | |
| # aten._native_batch_norm_legit_functional => convert_element_type_2, var_mean | |
| triton_red_fused__native_batch_norm_legit_functional_4 = async_compile.triton(''' | |
| import triton | |
| import triton.language as tl | |
| from torch._inductor.ir import ReductionHint | |
| from torch._inductor.ir import TileHint | |
| from torch._inductor.triton_heuristics import reduction | |
| from torch._inductor.utils import instance_descriptor | |
| from torch._dynamo.testing import rand_strided | |
| from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
| import torch | |
| from torch._inductor.triton_heuristics import grid | |
| @reduction( | |
| size_hints=[512, 131072], | |
| reduction_hint=ReductionHint.INNER, | |
| filename=__file__, | |
| meta={'signature': {0: '*fp16', 1: '*fp32', 2: '*fp32', 3: 'i32', 4: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4), equal_to_1=())]} | |
| ) | |
| @triton.jit | |
| def triton_red_fused__native_batch_norm_legit_functional_4(in_ptr0, in_ptr1, out_ptr0, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr): | |
| xnumel = 448 | |
| rnumel = 114688 | |
| xoffset = tl.program_id(0) * XBLOCK | |
| xindex = xoffset + tl.arange(0, XBLOCK)[:, None] | |
| xmask = xindex < xnumel | |
| rbase = tl.arange(0, RBLOCK)[None, :] | |
| x0 = xindex % 32 | |
| x1 = (xindex // 32) | |
| tmp2 = tl.load(in_ptr1 + (x0), xmask) | |
| _tmp5 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0 | |
| x3 = xindex | |
| for roffset in range(0, rnumel, RBLOCK): | |
| rindex = roffset + rbase | |
| rmask = rindex < rnumel | |
| r2 = rindex | |
| tmp0 = tl.load(in_ptr0 + ((112*(((r2 + (114688*x1)) // 112) % 112)) + (12544*x0) + (401408*((r2 + (114688*x1)) // 12544)) + (r2 % 112)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32) | |
| tmp1 = tmp0.to(tl.float32) | |
| tmp3 = tmp1 - tmp2 | |
| tmp4 = tmp3 * tmp3 | |
| _tmp5 = tl.where(rmask & xmask, _tmp5 + tmp4, _tmp5) | |
| tmp5 = tl.sum(_tmp5, 1)[:, None] | |
| tl.store(out_ptr0 + x3, tmp5, xmask) | |
| def get_args(): | |
| arg_0 = rand_strided((128, 32, 112, 112), (401408, 12544, 112, 1), device='cuda:0', dtype=torch.float16) | |
| arg_1 = rand_strided((1, 32, 1, 1), (32, 1, 32, 32), device='cuda:0', dtype=torch.float32) | |
| arg_2 = rand_strided((1, 32, 1, 1, 14), (448, 1, 448, 448, 32), device='cuda:0', dtype=torch.float32) | |
| return arg_0, arg_1, arg_2, | |
| def call(args): | |
| with torch.cuda._DeviceGuard(0): | |
| torch.cuda.set_device(0) | |
| stream0 = get_cuda_stream(0) | |
| triton_red_fused__native_batch_norm_legit_functional_4.run(*args, 448, 114688, grid=grid(448), stream=stream0) | |
| def benchmark_all_configs(args): | |
| with torch.cuda._DeviceGuard(0): | |
| torch.cuda.set_device(0) | |
| return triton_red_fused__native_batch_norm_legit_functional_4.benchmark_all_configs(*args, 448, 114688, grid=grid(448)) | |
| if __name__ == '__main__': | |
| from torch._inductor.utils import get_num_bytes | |
| from triton.testing import do_bench | |
| args = get_args() | |
| ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
| num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9 | |
| gb_per_s = num_gb / (ms / 1e3) | |
| print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
| ''') | |
| # kernel path: /tmp/torchinductor_shunting/yz/cyzfzxq5m7mzh26evqlnxbs2f4624mxxzlbzx5v335w7yujg6hp5.py | |
| # Original ATen: aten._native_batch_norm_legit_functional | |
| # aten._native_batch_norm_legit_functional => add_1, add_3, convert_element_type_2, mul_3, mul_4, mul_5, rsqrt, squeeze_1, var_mean | |
| triton_per_fused__native_batch_norm_legit_functional_5 = async_compile.triton(''' | |
| import triton | |
| import triton.language as tl | |
| from torch._inductor.ir import ReductionHint | |
| from torch._inductor.ir import TileHint | |
| from torch._inductor.triton_heuristics import persistent_reduction | |
| from torch._inductor.utils import instance_descriptor | |
| from torch._dynamo.testing import rand_strided | |
| from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
| import torch | |
| from torch._inductor.triton_heuristics import grid | |
| @persistent_reduction( | |
| size_hints=[32, 16], | |
| reduction_hint=ReductionHint.INNER, | |
| filename=__file__, | |
| meta={'signature': {0: '*fp32', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: 'i32', 6: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5), equal_to_1=())]} | |
| ) | |
| @triton.jit | |
| def triton_per_fused__native_batch_norm_legit_functional_5(in_ptr0, in_ptr1, out_ptr0, out_ptr1, out_ptr2, xnumel, rnumel, XBLOCK : tl.constexpr): | |
| xnumel = 32 | |
| rnumel = 14 | |
| RBLOCK: tl.constexpr = 16 | |
| xoffset = tl.program_id(0) * XBLOCK | |
| xindex = xoffset + tl.arange(0, XBLOCK)[:, None] | |
| xmask = xindex < xnumel | |
| rindex = tl.arange(0, RBLOCK)[None, :] | |
| rmask = rindex < rnumel | |
| r1 = rindex | |
| x0 = xindex | |
| tmp0 = tl.load(in_ptr0 + (x0 + (32*r1)), rmask & xmask, other=0) | |
| tmp13 = tl.load(in_ptr1 + (x0), xmask) | |
| tmp2 = tl.where(rmask & xmask, tmp0, 0) | |
| tmp3 = tl.sum(tmp2, 1)[:, None] | |
| tmp4 = 1605632.0 | |
| tmp5 = tmp3 / tmp4 | |
| tmp6 = 1e-05 | |
| tmp7 = tmp5 + tmp6 | |
| tmp8 = tl.math.rsqrt(tmp7) | |
| tmp9 = 1.0000006228081046 | |
| tmp10 = tmp5 * tmp9 | |
| tmp11 = 0.1 | |
| tmp12 = tmp10 * tmp11 | |
| tmp14 = 0.9 | |
| tmp15 = tmp13 * tmp14 | |
| tmp16 = tmp12 + tmp15 | |
| tl.store(out_ptr1 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp8, xmask) | |
| tl.store(out_ptr2 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp16, xmask) | |
| tl.store(out_ptr0 + x0, tmp3, xmask) | |
| def get_args(): | |
| arg_0 = rand_strided((1, 32, 1, 1, 14), (448, 1, 448, 448, 32), device='cuda:0', dtype=torch.float32) | |
| arg_1 = rand_strided((32,), (1,), device='cuda:0', dtype=torch.float32) | |
| arg_2 = rand_strided((1, 32, 1, 1), (32, 1, 32, 32), device='cuda:0', dtype=torch.float32) | |
| arg_3 = rand_strided((32,), (1,), device='cuda:0', dtype=torch.float32) | |
| arg_4 = rand_strided((32,), (1,), device='cuda:0', dtype=torch.float32) | |
| return arg_0, arg_1, arg_2, arg_3, arg_4, | |
| def call(args): | |
| with torch.cuda._DeviceGuard(0): | |
| torch.cuda.set_device(0) | |
| stream0 = get_cuda_stream(0) | |
| triton_per_fused__native_batch_norm_legit_functional_5.run(*args, 32, 14, grid=grid(32), stream=stream0) | |
| def benchmark_all_configs(args): | |
| with torch.cuda._DeviceGuard(0): | |
| torch.cuda.set_device(0) | |
| return triton_per_fused__native_batch_norm_legit_functional_5.benchmark_all_configs(*args, 32, 14, grid=grid(32)) | |
| if __name__ == '__main__': | |
| from torch._inductor.utils import get_num_bytes | |
| from triton.testing import do_bench | |
| args = get_args() | |
| ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
| num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9 | |
| gb_per_s = num_gb / (ms / 1e3) | |
| print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
| ''') | |
| # kernel path: /tmp/torchinductor_shunting/tp/ctp3rh47jmesldsykk2bkq62s2n65nizu7jrip6s664llvntz6jk.py | |
| # Original ATen: aten._native_batch_norm_legit_functional, aten.relu | |
| # aten._native_batch_norm_legit_functional => add_1, add_4, convert_element_type_2, convert_element_type_3, mul, mul_6, rsqrt, sub, var_mean | |
| # aten.relu => relu | |
| triton_poi_fused__native_batch_norm_legit_functional_relu_6 = async_compile.triton(''' | |
| import triton | |
| import triton.language as tl | |
| from torch._inductor.ir import ReductionHint | |
| from torch._inductor.ir import TileHint | |
| from torch._inductor.triton_heuristics import pointwise | |
| from torch._inductor.utils import instance_descriptor | |
| from torch._dynamo.testing import rand_strided | |
| from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
| import torch | |
| from torch._inductor.triton_heuristics import grid | |
| @pointwise(size_hints=[67108864], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp16', 6: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6), equal_to_1=())]}) | |
| @triton.jit | |
| def triton_poi_fused__native_batch_norm_legit_functional_relu_6(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, out_ptr0, xnumel, XBLOCK : tl.constexpr): | |
| xnumel = 51380224 | |
| xoffset = tl.program_id(0) * XBLOCK | |
| xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
| xmask = xindex < xnumel | |
| x3 = xindex | |
| x1 = (xindex // 12544) % 32 | |
| tmp0 = tl.load(in_ptr0 + (x3), None).to(tl.float32) | |
| tmp2 = tl.load(in_ptr1 + (x1), None) | |
| tmp4 = tl.load(in_ptr2 + (x1), None) | |
| tmp11 = tl.load(in_ptr3 + (x1), None) | |
| tmp13 = tl.load(in_ptr4 + (x1), None) | |
| tmp1 = tmp0.to(tl.float32) | |
| tmp3 = tmp1 - tmp2 | |
| tmp5 = 1605632.0 | |
| tmp6 = tmp4 / tmp5 | |
| tmp7 = 1e-05 | |
| tmp8 = tmp6 + tmp7 | |
| tmp9 = tl.math.rsqrt(tmp8) | |
| tmp10 = tmp3 * tmp9 | |
| tmp12 = tmp10 * tmp11 | |
| tmp14 = tmp12 + tmp13 | |
| tmp15 = tmp14.to(tl.float32) | |
| tmp16 = tl.where(0 != 0, 0, tl.where(0 > tmp15, 0, tmp15)) | |
| tl.store(out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp16, None) | |
| def get_args(): | |
| arg_0 = rand_strided((128, 32, 112, 112), (401408, 12544, 112, 1), device='cuda:0', dtype=torch.float16) | |
| arg_1 = rand_strided((1, 32, 1, 1), (32, 1, 32, 32), device='cuda:0', dtype=torch.float32) | |
| arg_2 = rand_strided((1, 32, 1, 1), (32, 1, 32, 32), device='cuda:0', dtype=torch.float32) | |
| arg_3 = rand_strided((32,), (1,), device='cuda:0', dtype=torch.float32) | |
| arg_4 = rand_strided((32,), (1,), device='cuda:0', dtype=torch.float32) | |
| arg_5 = rand_strided((128, 32, 112, 112), (401408, 12544, 112, 1), device='cuda:0', dtype=torch.float16) | |
| return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, | |
| def call(args): | |
| with torch.cuda._DeviceGuard(0): | |
| torch.cuda.set_device(0) | |
| stream0 = get_cuda_stream(0) | |
| triton_poi_fused__native_batch_norm_legit_functional_relu_6.run(*args, 51380224, grid=grid(51380224), stream=stream0) | |
| def benchmark_all_configs(args): | |
| with torch.cuda._DeviceGuard(0): | |
| torch.cuda.set_device(0) | |
| return triton_poi_fused__native_batch_norm_legit_functional_relu_6.benchmark_all_configs(*args, 51380224, grid=grid(51380224)) | |
| if __name__ == '__main__': | |
| from torch._inductor.utils import get_num_bytes | |
| from triton.testing import do_bench | |
| args = get_args() | |
| ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
| num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9 | |
| gb_per_s = num_gb / (ms / 1e3) | |
| print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
| ''') | |
| # kernel path: /tmp/torchinductor_shunting/bt/cbt6gouhluogn74n5bu4s6rx7uqiiw6lwhnqaggedwz4abkhco2b.py | |
| # Original ATen: aten._to_copy | |
| # aten._to_copy => convert_element_type_4 | |
| triton_poi_fused__to_copy_7 = async_compile.triton(''' | |
| import triton | |
| import triton.language as tl | |
| from torch._inductor.ir import ReductionHint | |
| from torch._inductor.ir import TileHint | |
| from torch._inductor.triton_heuristics import pointwise | |
| from torch._inductor.utils import instance_descriptor | |
| from torch._dynamo.testing import rand_strided | |
| from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
| import torch | |
| from torch._inductor.triton_heuristics import grid | |
| @pointwise(size_hints=[512], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]}) | |
| @triton.jit | |
| def triton_poi_fused__to_copy_7(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr): | |
| xnumel = 288 | |
| xoffset = tl.program_id(0) * XBLOCK | |
| xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
| xmask = xindex < xnumel | |
| x0 = xindex | |
| tmp0 = tl.load(in_ptr0 + (x0), xmask) | |
| tmp1 = tmp0.to(tl.float32) | |
| tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask) | |
| def get_args(): | |
| arg_0 = rand_strided((32, 1, 3, 3), (9, 9, 3, 1), device='cuda:0', dtype=torch.float32) | |
| arg_1 = rand_strided((32, 1, 3, 3), (9, 9, 3, 1), device='cuda:0', dtype=torch.float16) | |
| return arg_0, arg_1, | |
| def call(args): | |
| with torch.cuda._DeviceGuard(0): | |
| torch.cuda.set_device(0) | |
| stream0 = get_cuda_stream(0) | |
| triton_poi_fused__to_copy_7.run(*args, 288, grid=grid(288), stream=stream0) | |
| def benchmark_all_configs(args): | |
| with torch.cuda._DeviceGuard(0): | |
| torch.cuda.set_device(0) | |
| return triton_poi_fused__to_copy_7.benchmark_all_configs(*args, 288, grid=grid(288)) | |
| if __name__ == '__main__': | |
| from torch._inductor.utils import get_num_bytes | |
| from triton.testing import do_bench | |
| args = get_args() | |
| ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
| num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9 | |
| gb_per_s = num_gb / (ms / 1e3) | |
| print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
| ''') | |
| # kernel path: /tmp/torchinductor_shunting/tq/ctqzpnsuriwiat75pazkds4p5xe54pkhlftlkhx4o3jngunwgzub.py | |
| # Original ATen: aten._to_copy | |
| # aten._to_copy => convert_element_type_7 | |
| triton_poi_fused__to_copy_8 = async_compile.triton(''' | |
| import triton | |
| import triton.language as tl | |
| from torch._inductor.ir import ReductionHint | |
| from torch._inductor.ir import TileHint | |
| from torch._inductor.triton_heuristics import pointwise | |
| from torch._inductor.utils import instance_descriptor | |
| from torch._dynamo.testing import rand_strided | |
| from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
| import torch | |
| from torch._inductor.triton_heuristics import grid | |
| @pointwise(size_hints=[1024], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]}) | |
| @triton.jit | |
| def triton_poi_fused__to_copy_8(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr): | |
| xnumel = 1024 | |
| xoffset = tl.program_id(0) * XBLOCK | |
| xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
| xmask = xindex < xnumel | |
| x0 = xindex | |
| tmp0 = tl.load(in_ptr0 + (x0), xmask) | |
| tmp1 = tmp0.to(tl.float32) | |
| tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask) | |
| def get_args(): | |
| arg_0 = rand_strided((32, 32, 1, 1), (32, 1, 1, 1), device='cuda:0', dtype=torch.float32) | |
| arg_1 = rand_strided((32, 32, 1, 1), (32, 1, 1, 1), device='cuda:0', dtype=torch.float16) | |
| return arg_0, arg_1, | |
| def call(args): | |
| with torch.cuda._DeviceGuard(0): | |
| torch.cuda.set_device(0) | |
| stream0 = get_cuda_stream(0) | |
| triton_poi_fused__to_copy_8.run(*args, 1024, grid=grid(1024), stream=stream0) | |
| def benchmark_all_configs(args): | |
| with torch.cuda._DeviceGuard(0): | |
| torch.cuda.set_device(0) | |
| return triton_poi_fused__to_copy_8.benchmark_all_configs(*args, 1024, grid=grid(1024)) | |
| if __name__ == '__main__': | |
| from torch._inductor.utils import get_num_bytes | |
| from triton.testing import do_bench | |
| args = get_args() | |
| ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
| num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9 | |
| gb_per_s = num_gb / (ms / 1e3) | |
| print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
| ''') | |
| # kernel path: /tmp/torchinductor_shunting/ma/cma4u56mzuidifjz56xk54ie6sgj6png7qekm2tx4cs7sle553zr.py | |
| # Original ATen: aten._native_batch_norm_legit_functional, aten.add | |
| # aten._native_batch_norm_legit_functional => add_11, add_14, convert_element_type_8, convert_element_type_9, mul_14, mul_20, rsqrt_2, sub_2, var_mean_2 | |
| # aten.add => add_15 | |
| triton_poi_fused__native_batch_norm_legit_functional_add_9 = async_compile.triton(''' | |
| import triton | |
| import triton.language as tl | |
| from torch._inductor.ir import ReductionHint | |
| from torch._inductor.ir import TileHint | |
| from torch._inductor.triton_heuristics import pointwise | |
| from torch._inductor.utils import instance_descriptor | |
| from torch._dynamo.testing import rand_strided | |
| from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
| import torch | |
| from torch._inductor.triton_heuristics import grid | |
| @pointwise(size_hints=[67108864], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp16', 6: '*fp16', 7: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7), equal_to_1=())]}) | |
| @triton.jit | |
| def triton_poi_fused__native_batch_norm_legit_functional_add_9(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, out_ptr0, xnumel, XBLOCK : tl.constexpr): | |
| xnumel = 51380224 | |
| xoffset = tl.program_id(0) * XBLOCK | |
| xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
| xmask = xindex < xnumel | |
| x3 = xindex | |
| x1 = (xindex // 12544) % 32 | |
| tmp0 = tl.load(in_ptr0 + (x3), None).to(tl.float32) | |
| tmp2 = tl.load(in_ptr1 + (x1), None) | |
| tmp4 = tl.load(in_ptr2 + (x1), None) | |
| tmp11 = tl.load(in_ptr3 + (x1), None) | |
| tmp13 = tl.load(in_ptr4 + (x1), None) | |
| tmp16 = tl.load(in_ptr5 + (x3), None).to(tl.float32) | |
| tmp1 = tmp0.to(tl.float32) | |
| tmp3 = tmp1 - tmp2 | |
| tmp5 = 1605632.0 | |
| tmp6 = tmp4 / tmp5 | |
| tmp7 = 1e-05 | |
| tmp8 = tmp6 + tmp7 | |
| tmp9 = tl.math.rsqrt(tmp8) | |
| tmp10 = tmp3 * tmp9 | |
| tmp12 = tmp10 * tmp11 | |
| tmp14 = tmp12 + tmp13 | |
| tmp15 = tmp14.to(tl.float32) | |
| tmp17 = tmp15 + tmp16 | |
| tl.store(out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp17, None) | |
| def get_args(): | |
| arg_0 = rand_strided((128, 32, 112, 112), (401408, 12544, 112, 1), device='cuda:0', dtype=torch.float16) | |
| arg_1 = rand_strided((1, 32, 1, 1), (32, 1, 32, 32), device='cuda:0', dtype=torch.float32) | |
| arg_2 = rand_strided((1, 32, 1, 1), (32, 1, 32, 32), device='cuda:0', dtype=torch.float32) | |
| arg_3 = rand_strided((32,), (1,), device='cuda:0', dtype=torch.float32) | |
| arg_4 = rand_strided((32,), (1,), device='cuda:0', dtype=torch.float32) | |
| arg_5 = rand_strided((128, 32, 112, 112), (401408, 12544, 112, 1), device='cuda:0', dtype=torch.float16) | |
| arg_6 = rand_strided((128, 32, 112, 112), (401408, 12544, 112, 1), device='cuda:0', dtype=torch.float16) | |
| return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, | |
| def call(args): | |
| with torch.cuda._DeviceGuard(0): | |
| torch.cuda.set_device(0) | |
| stream0 = get_cuda_stream(0) | |
| triton_poi_fused__native_batch_norm_legit_functional_add_9.run(*args, 51380224, grid=grid(51380224), stream=stream0) | |
| def benchmark_all_configs(args): | |
| with torch.cuda._DeviceGuard(0): | |
| torch.cuda.set_device(0) | |
| return triton_poi_fused__native_batch_norm_legit_functional_add_9.benchmark_all_configs(*args, 51380224, grid=grid(51380224)) | |
| if __name__ == '__main__': | |
| from torch._inductor.utils import get_num_bytes | |
| from triton.testing import do_bench | |
| args = get_args() | |
| ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
| num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9 | |
| gb_per_s = num_gb / (ms / 1e3) | |
| print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
| ''') | |
| # kernel path: /tmp/torchinductor_shunting/mf/cmfq2hg7iduys7x4z6irkzuujylw3e7gj4ysdrcbo5cemukrg3t5.py | |
| # Original ATen: aten.split_with_sizes | |
| # aten.split_with_sizes => getitem_6 | |
| # NOTE(review): auto-generated TorchInductor kernel; the Triton source string | |
| # below is left byte-identical (hand-edits there would diverge from the | |
| # compile cache that produced it). Pointwise copy implementing the FIRST | |
| # output of a channel-dim split: gathers channels 0-15 of a contiguous fp16 | |
| # (128, 32, 112, 112) input (per-sample stride 401408 = 32*12544) into a | |
| # dense (128, 16, 112, 112) output of 25690112 elements. Inside the string, | |
| # `call` launches the kernel on the current CUDA stream, `get_args` builds | |
| # synthetic benchmark inputs, and running it as __main__ prints achieved | |
| # memory bandwidth via triton's do_bench. | |
| triton_poi_fused_split_with_sizes_10 = async_compile.triton(''' | |
| import triton | |
| import triton.language as tl | |
| from torch._inductor.ir import ReductionHint | |
| from torch._inductor.ir import TileHint | |
| from torch._inductor.triton_heuristics import pointwise | |
| from torch._inductor.utils import instance_descriptor | |
| from torch._dynamo.testing import rand_strided | |
| from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
| import torch | |
| from torch._inductor.triton_heuristics import grid | |
| @pointwise(size_hints=[33554432], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]}) | |
| @triton.jit | |
| def triton_poi_fused_split_with_sizes_10(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr): | |
| xnumel = 25690112 | |
| xoffset = tl.program_id(0) * XBLOCK | |
| xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
| xmask = xindex < xnumel | |
| x0 = xindex % 200704 | |
| x1 = (xindex // 200704) | |
| x2 = xindex | |
| tmp0 = tl.load(in_ptr0 + (x0 + (401408*x1)), None).to(tl.float32) | |
| tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp0, None) | |
| def get_args(): | |
| arg_0 = rand_strided((128, 32, 112, 112), (401408, 12544, 112, 1), device='cuda:0', dtype=torch.float16) | |
| arg_1 = rand_strided((128, 16, 112, 112), (200704, 12544, 112, 1), device='cuda:0', dtype=torch.float16) | |
| return arg_0, arg_1, | |
| def call(args): | |
| with torch.cuda._DeviceGuard(0): | |
| torch.cuda.set_device(0) | |
| stream0 = get_cuda_stream(0) | |
| triton_poi_fused_split_with_sizes_10.run(*args, 25690112, grid=grid(25690112), stream=stream0) | |
| def benchmark_all_configs(args): | |
| with torch.cuda._DeviceGuard(0): | |
| torch.cuda.set_device(0) | |
| return triton_poi_fused_split_with_sizes_10.benchmark_all_configs(*args, 25690112, grid=grid(25690112)) | |
| if __name__ == '__main__': | |
| from torch._inductor.utils import get_num_bytes | |
| from triton.testing import do_bench | |
| args = get_args() | |
| ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
| num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9 | |
| gb_per_s = num_gb / (ms / 1e3) | |
| print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
| ''') | |
| # kernel path: /tmp/torchinductor_shunting/lr/clrdqz4vueql2no5zrrsw4bgmmsgoov4oautycbtlc4q5ypfrpxe.py | |
| # Original ATen: aten.split_with_sizes | |
| # aten.split_with_sizes => getitem_7 | |
| # NOTE(review): auto-generated TorchInductor kernel; string kept byte-identical. | |
| # Companion to triton_poi_fused_split_with_sizes_10: implements the SECOND | |
| # output of the same channel-dim split. Identical indexing except the load | |
| # base offset 200704 (= 16*12544), i.e. it gathers channels 16-31 of the fp16 | |
| # (128, 32, 112, 112) input into a dense (128, 16, 112, 112) output. | |
| triton_poi_fused_split_with_sizes_11 = async_compile.triton(''' | |
| import triton | |
| import triton.language as tl | |
| from torch._inductor.ir import ReductionHint | |
| from torch._inductor.ir import TileHint | |
| from torch._inductor.triton_heuristics import pointwise | |
| from torch._inductor.utils import instance_descriptor | |
| from torch._dynamo.testing import rand_strided | |
| from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
| import torch | |
| from torch._inductor.triton_heuristics import grid | |
| @pointwise(size_hints=[33554432], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]}) | |
| @triton.jit | |
| def triton_poi_fused_split_with_sizes_11(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr): | |
| xnumel = 25690112 | |
| xoffset = tl.program_id(0) * XBLOCK | |
| xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
| xmask = xindex < xnumel | |
| x0 = xindex % 200704 | |
| x1 = (xindex // 200704) | |
| x2 = xindex | |
| tmp0 = tl.load(in_ptr0 + (200704 + x0 + (401408*x1)), None).to(tl.float32) | |
| tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp0, None) | |
| def get_args(): | |
| arg_0 = rand_strided((128, 32, 112, 112), (401408, 12544, 112, 1), device='cuda:0', dtype=torch.float16) | |
| arg_1 = rand_strided((128, 16, 112, 112), (200704, 12544, 112, 1), device='cuda:0', dtype=torch.float16) | |
| return arg_0, arg_1, | |
| def call(args): | |
| with torch.cuda._DeviceGuard(0): | |
| torch.cuda.set_device(0) | |
| stream0 = get_cuda_stream(0) | |
| triton_poi_fused_split_with_sizes_11.run(*args, 25690112, grid=grid(25690112), stream=stream0) | |
| def benchmark_all_configs(args): | |
| with torch.cuda._DeviceGuard(0): | |
| torch.cuda.set_device(0) | |
| return triton_poi_fused_split_with_sizes_11.benchmark_all_configs(*args, 25690112, grid=grid(25690112)) | |
| if __name__ == '__main__': | |
| from torch._inductor.utils import get_num_bytes | |
| from triton.testing import do_bench | |
| args = get_args() | |
| ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
| num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9 | |
| gb_per_s = num_gb / (ms / 1e3) | |
| print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
| ''') | |
| # kernel path: /tmp/torchinductor_shunting/wk/cwkoecpvylkd2zd4p2qzrsxvdjpmlhjy2vqltsena3lm7tbpxnqv.py | |
| # Original ATen: aten._to_copy | |
| # aten._to_copy => convert_element_type_10 | |
| # NOTE(review): auto-generated TorchInductor kernel; string kept byte-identical. | |
| # Elementwise fp32 -> fp16 downcast of a 1536-element tensor; per get_args it | |
| # is a (96, 16, 1, 1) contiguous weight, presumably a 1x1 conv weight cast for | |
| # mixed-precision compute -- confirm against the caller graph. Uses xmask | |
| # since 1536 need not divide XBLOCK. | |
| triton_poi_fused__to_copy_12 = async_compile.triton(''' | |
| import triton | |
| import triton.language as tl | |
| from torch._inductor.ir import ReductionHint | |
| from torch._inductor.ir import TileHint | |
| from torch._inductor.triton_heuristics import pointwise | |
| from torch._inductor.utils import instance_descriptor | |
| from torch._dynamo.testing import rand_strided | |
| from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
| import torch | |
| from torch._inductor.triton_heuristics import grid | |
| @pointwise(size_hints=[2048], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]}) | |
| @triton.jit | |
| def triton_poi_fused__to_copy_12(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr): | |
| xnumel = 1536 | |
| xoffset = tl.program_id(0) * XBLOCK | |
| xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
| xmask = xindex < xnumel | |
| x0 = xindex | |
| tmp0 = tl.load(in_ptr0 + (x0), xmask) | |
| tmp1 = tmp0.to(tl.float32) | |
| tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask) | |
| def get_args(): | |
| arg_0 = rand_strided((96, 16, 1, 1), (16, 1, 1, 1), device='cuda:0', dtype=torch.float32) | |
| arg_1 = rand_strided((96, 16, 1, 1), (16, 1, 1, 1), device='cuda:0', dtype=torch.float16) | |
| return arg_0, arg_1, | |
| def call(args): | |
| with torch.cuda._DeviceGuard(0): | |
| torch.cuda.set_device(0) | |
| stream0 = get_cuda_stream(0) | |
| triton_poi_fused__to_copy_12.run(*args, 1536, grid=grid(1536), stream=stream0) | |
| def benchmark_all_configs(args): | |
| with torch.cuda._DeviceGuard(0): | |
| torch.cuda.set_device(0) | |
| return triton_poi_fused__to_copy_12.benchmark_all_configs(*args, 1536, grid=grid(1536)) | |
| if __name__ == '__main__': | |
| from torch._inductor.utils import get_num_bytes | |
| from triton.testing import do_bench | |
| args = get_args() | |
| ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
| num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9 | |
| gb_per_s = num_gb / (ms / 1e3) | |
| print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
| ''') | |
| # kernel path: /tmp/torchinductor_shunting/yj/cyjbfwxbzmq4snbyqnu5ehgb3xenlxtbixoczmyjqmdjbuu22kqa.py | |
| # Original ATen: aten.cat | |
| # aten.cat => cat | |
| # NOTE(review): auto-generated TorchInductor kernel; string kept byte-identical. | |
| # One operand of a channel-dim aten.cat: copies a dense fp16 (128, 96, 112, 112) | |
| # tensor (per-sample size 1204224) into the leading 96 channels of an output | |
| # whose per-sample stride is 2408448 (= 192 channels), i.e. offset | |
| # x0 + 2408448*x1 with channel offset 0. The other cat operand(s) are written | |
| # by separate generated kernels not visible here. | |
| triton_poi_fused_cat_13 = async_compile.triton(''' | |
| import triton | |
| import triton.language as tl | |
| from torch._inductor.ir import ReductionHint | |
| from torch._inductor.ir import TileHint | |
| from torch._inductor.triton_heuristics import pointwise | |
| from torch._inductor.utils import instance_descriptor | |
| from torch._dynamo.testing import rand_strided | |
| from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
| import torch | |
| from torch._inductor.triton_heuristics import grid | |
| @pointwise(size_hints=[268435456], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]}) | |
| @triton.jit | |
| def triton_poi_fused_cat_13(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr): | |
| xnumel = 154140672 | |
| xoffset = tl.program_id(0) * XBLOCK | |
| xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
| xmask = xindex < xnumel | |
| x2 = xindex | |
| x0 = xindex % 1204224 | |
| x1 = (xindex // 1204224) | |
| tmp0 = tl.load(in_ptr0 + (x2), None).to(tl.float32) | |
| tl.store(out_ptr0 + (x0 + (2408448*x1) + tl.zeros([XBLOCK], tl.int32)), tmp0, None) | |
| def get_args(): | |
| arg_0 = rand_strided((128, 96, 112, 112), (1204224, 12544, 112, 1), device='cuda:0', dtype=torch.float16) | |
| arg_1 = rand_strided((128, 96, 112, 112), (2408448, 12544, 112, 1), device='cuda:0', dtype=torch.float16) | |
| return arg_0, arg_1, | |
| def call(args): | |
| with torch.cuda._DeviceGuard(0): | |
| torch.cuda.set_device(0) | |
| stream0 = get_cuda_stream(0) | |
| triton_poi_fused_cat_13.run(*args, 154140672, grid=grid(154140672), stream=stream0) | |
| def benchmark_all_configs(args): | |
| with torch.cuda._DeviceGuard(0): | |
| torch.cuda.set_device(0) | |
| return triton_poi_fused_cat_13.benchmark_all_configs(*args, 154140672, grid=grid(154140672)) | |
| if __name__ == '__main__': | |
| from torch._inductor.utils import get_num_bytes | |
| from triton.testing import do_bench | |
| args = get_args() | |
| ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
| num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9 | |
| gb_per_s = num_gb / (ms / 1e3) | |
| print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
| ''') | |
| # kernel path: /tmp/torchinductor_shunting/nh/cnh3nbcj2c5nraypyukh7gkg7ocvsbjrgj7ok7wc472nt34wy65s.py | |
| # Original ATen: aten._native_batch_norm_legit_functional | |
| # aten._native_batch_norm_legit_functional => convert_element_type_12, var_mean_3 | |
| # NOTE(review): auto-generated TorchInductor kernel; string kept byte-identical. | |
| # Stage 1 of a two-stage batch-norm MEAN reduction over a fp16 | |
| # (128, 192, 112, 112) activation: each channel's 1605632 (= 128*12544) | |
| # elements are split into 13 chunks of 123511, and each of the 2496 | |
| # (= 192 channels x 13 chunks) programs accumulates one fp32 partial sum. | |
| # tmp2 guards the ragged tail (13*123511 = 1605643 > 1605632). Partials land | |
| # in a (1, 192, 1, 1, 13) fp32 buffer reduced by the _15 kernel below. | |
| triton_red_fused__native_batch_norm_legit_functional_14 = async_compile.triton(''' | |
| import triton | |
| import triton.language as tl | |
| from torch._inductor.ir import ReductionHint | |
| from torch._inductor.ir import TileHint | |
| from torch._inductor.triton_heuristics import reduction | |
| from torch._inductor.utils import instance_descriptor | |
| from torch._dynamo.testing import rand_strided | |
| from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
| import torch | |
| from torch._inductor.triton_heuristics import grid | |
| @reduction( | |
| size_hints=[4096, 131072], | |
| reduction_hint=ReductionHint.INNER, | |
| filename=__file__, | |
| meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32', 3: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]} | |
| ) | |
| @triton.jit | |
| def triton_red_fused__native_batch_norm_legit_functional_14(in_ptr0, out_ptr0, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr): | |
| xnumel = 2496 | |
| rnumel = 123511 | |
| xoffset = tl.program_id(0) * XBLOCK | |
| xindex = xoffset + tl.arange(0, XBLOCK)[:, None] | |
| xmask = xindex < xnumel | |
| rbase = tl.arange(0, RBLOCK)[None, :] | |
| x1 = (xindex // 192) | |
| x0 = xindex % 192 | |
| _tmp6 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0 | |
| x3 = xindex | |
| for roffset in range(0, rnumel, RBLOCK): | |
| rindex = roffset + rbase | |
| rmask = rindex < rnumel | |
| r2 = rindex | |
| tmp0 = r2 + (123511*x1) | |
| tmp1 = 1605632 | |
| tmp2 = tmp0 < tmp1 | |
| tmp3 = tl.load(in_ptr0 + ((12544*x0) + (2408448*(((r2 + (123511*x1)) // 12544) % 128)) + ((r2 + (123511*x1)) % 12544) + tl.zeros([XBLOCK, RBLOCK], tl.int32)), rmask & tmp2 & xmask, eviction_policy='evict_last', other=0).to(tl.float32) | |
| tmp4 = tmp3.to(tl.float32) | |
| tmp5 = tl.where(tmp2, tmp4, 0) | |
| _tmp6 = tl.where(rmask & xmask, _tmp6 + tmp5, _tmp6) | |
| tmp6 = tl.sum(_tmp6, 1)[:, None] | |
| tl.store(out_ptr0 + x3, tmp6, xmask) | |
| def get_args(): | |
| arg_0 = rand_strided((128, 192, 112, 112), (2408448, 12544, 112, 1), device='cuda:0', dtype=torch.float16) | |
| arg_1 = rand_strided((1, 192, 1, 1, 13), (2496, 1, 2496, 2496, 192), device='cuda:0', dtype=torch.float32) | |
| return arg_0, arg_1, | |
| def call(args): | |
| with torch.cuda._DeviceGuard(0): | |
| torch.cuda.set_device(0) | |
| stream0 = get_cuda_stream(0) | |
| triton_red_fused__native_batch_norm_legit_functional_14.run(*args, 2496, 123511, grid=grid(2496), stream=stream0) | |
| def benchmark_all_configs(args): | |
| with torch.cuda._DeviceGuard(0): | |
| torch.cuda.set_device(0) | |
| return triton_red_fused__native_batch_norm_legit_functional_14.benchmark_all_configs(*args, 2496, 123511, grid=grid(2496)) | |
| if __name__ == '__main__': | |
| from torch._inductor.utils import get_num_bytes | |
| from triton.testing import do_bench | |
| args = get_args() | |
| ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
| num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9 | |
| gb_per_s = num_gb / (ms / 1e3) | |
| print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
| ''') | |
| # kernel path: /tmp/torchinductor_shunting/ed/cedbp3dhcc6xvctcve537pwhmiwi2ebf5oi5f5bppm5whslbp34w.py | |
| # Original ATen: aten._native_batch_norm_legit_functional | |
| # aten._native_batch_norm_legit_functional => add_18, convert_element_type_12, mul_22, mul_23, var_mean_3 | |
| # NOTE(review): auto-generated TorchInductor kernel; string kept byte-identical. | |
| # Stage 2 of the batch-norm MEAN reduction: for each of the 192 channels it | |
| # sums the 13 partial sums from kernel _14, divides by 1605632 to get the | |
| # batch mean (written in place to in_out_ptr0, a (1, 192, 1, 1) fp32 buffer), | |
| # and writes the momentum-0.1 EMA update of the running mean to out_ptr0: | |
| # new_running_mean = 0.1*batch_mean + 0.9*old_running_mean (in_ptr1). | |
| # Persistent reduction: RBLOCK=16 covers all 13 partials in one pass. | |
| triton_per_fused__native_batch_norm_legit_functional_15 = async_compile.triton(''' | |
| import triton | |
| import triton.language as tl | |
| from torch._inductor.ir import ReductionHint | |
| from torch._inductor.ir import TileHint | |
| from torch._inductor.triton_heuristics import persistent_reduction | |
| from torch._inductor.utils import instance_descriptor | |
| from torch._dynamo.testing import rand_strided | |
| from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
| import torch | |
| from torch._inductor.triton_heuristics import grid | |
| @persistent_reduction( | |
| size_hints=[256, 16], | |
| reduction_hint=ReductionHint.INNER, | |
| filename=__file__, | |
| meta={'signature': {0: '*fp32', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: 'i32', 5: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4), equal_to_1=())]} | |
| ) | |
| @triton.jit | |
| def triton_per_fused__native_batch_norm_legit_functional_15(in_out_ptr0, in_ptr0, in_ptr1, out_ptr0, xnumel, rnumel, XBLOCK : tl.constexpr): | |
| xnumel = 192 | |
| rnumel = 13 | |
| RBLOCK: tl.constexpr = 16 | |
| xoffset = tl.program_id(0) * XBLOCK | |
| xindex = xoffset + tl.arange(0, XBLOCK)[:, None] | |
| xmask = xindex < xnumel | |
| rindex = tl.arange(0, RBLOCK)[None, :] | |
| rmask = rindex < rnumel | |
| r1 = rindex | |
| x0 = xindex | |
| tmp0 = tl.load(in_ptr0 + (x0 + (192*r1)), rmask & xmask, other=0) | |
| tmp8 = tl.load(in_ptr1 + (x0), xmask) | |
| tmp2 = tl.where(rmask & xmask, tmp0, 0) | |
| tmp3 = tl.sum(tmp2, 1)[:, None] | |
| tmp4 = 1605632.0 | |
| tmp5 = tmp3 / tmp4 | |
| tmp6 = 0.1 | |
| tmp7 = tmp5 * tmp6 | |
| tmp9 = 0.9 | |
| tmp10 = tmp8 * tmp9 | |
| tmp11 = tmp7 + tmp10 | |
| tl.store(in_out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp5, xmask) | |
| tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp11, xmask) | |
| def get_args(): | |
| arg_0 = rand_strided((1, 192, 1, 1), (192, 1, 192, 192), device='cuda:0', dtype=torch.float32) | |
| arg_1 = rand_strided((1, 192, 1, 1, 13), (2496, 1, 2496, 2496, 192), device='cuda:0', dtype=torch.float32) | |
| arg_2 = rand_strided((192,), (1,), device='cuda:0', dtype=torch.float32) | |
| arg_3 = rand_strided((192,), (1,), device='cuda:0', dtype=torch.float32) | |
| return arg_0, arg_1, arg_2, arg_3, | |
| def call(args): | |
| with torch.cuda._DeviceGuard(0): | |
| torch.cuda.set_device(0) | |
| stream0 = get_cuda_stream(0) | |
| triton_per_fused__native_batch_norm_legit_functional_15.run(*args, 192, 13, grid=grid(192), stream=stream0) | |
| def benchmark_all_configs(args): | |
| with torch.cuda._DeviceGuard(0): | |
| torch.cuda.set_device(0) | |
| return triton_per_fused__native_batch_norm_legit_functional_15.benchmark_all_configs(*args, 192, 13, grid=grid(192)) | |
| if __name__ == '__main__': | |
| from torch._inductor.utils import get_num_bytes | |
| from triton.testing import do_bench | |
| args = get_args() | |
| ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
| num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9 | |
| gb_per_s = num_gb / (ms / 1e3) | |
| print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
| ''') | |
| # kernel path: /tmp/torchinductor_shunting/jk/cjk2vm3446xrk7rth7hr6pun7xxo3dnzubwcn6ydrpifal4eykrz.py | |
| # Original ATen: aten._native_batch_norm_legit_functional | |
| # aten._native_batch_norm_legit_functional => convert_element_type_12, var_mean_3 | |
| # NOTE(review): auto-generated TorchInductor kernel; string kept byte-identical. | |
| # Stage 1 of the batch-norm VARIANCE reduction, mirroring kernel _14's 192x13 | |
| # chunking of the fp16 (128, 192, 112, 112) activation: each program | |
| # accumulates a partial sum of squared deviations (x - mean)^2, with the | |
| # per-channel batch mean broadcast from in_ptr1 (the buffer kernel _15 wrote). | |
| # Partials go to a (1, 192, 1, 1, 13) fp32 buffer consumed by kernel _17. | |
| triton_red_fused__native_batch_norm_legit_functional_16 = async_compile.triton(''' | |
| import triton | |
| import triton.language as tl | |
| from torch._inductor.ir import ReductionHint | |
| from torch._inductor.ir import TileHint | |
| from torch._inductor.triton_heuristics import reduction | |
| from torch._inductor.utils import instance_descriptor | |
| from torch._dynamo.testing import rand_strided | |
| from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
| import torch | |
| from torch._inductor.triton_heuristics import grid | |
| @reduction( | |
| size_hints=[4096, 131072], | |
| reduction_hint=ReductionHint.INNER, | |
| filename=__file__, | |
| meta={'signature': {0: '*fp16', 1: '*fp32', 2: '*fp32', 3: 'i32', 4: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3), equal_to_1=())]} | |
| ) | |
| @triton.jit | |
| def triton_red_fused__native_batch_norm_legit_functional_16(in_ptr0, in_ptr1, out_ptr0, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr): | |
| xnumel = 2496 | |
| rnumel = 123511 | |
| xoffset = tl.program_id(0) * XBLOCK | |
| xindex = xoffset + tl.arange(0, XBLOCK)[:, None] | |
| xmask = xindex < xnumel | |
| rbase = tl.arange(0, RBLOCK)[None, :] | |
| x1 = (xindex // 192) | |
| x0 = xindex % 192 | |
| _tmp9 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0 | |
| x3 = xindex | |
| for roffset in range(0, rnumel, RBLOCK): | |
| rindex = roffset + rbase | |
| rmask = rindex < rnumel | |
| r2 = rindex | |
| tmp0 = r2 + (123511*x1) | |
| tmp1 = 1605632 | |
| tmp2 = tmp0 < tmp1 | |
| tmp3 = tl.load(in_ptr0 + ((12544*x0) + (2408448*(((r2 + (123511*x1)) // 12544) % 128)) + ((r2 + (123511*x1)) % 12544) + tl.zeros([XBLOCK, RBLOCK], tl.int32)), rmask & tmp2 & xmask, eviction_policy='evict_last', other=0).to(tl.float32) | |
| tmp4 = tmp3.to(tl.float32) | |
| tmp5 = tl.load(in_ptr1 + (x0 + tl.zeros([XBLOCK, RBLOCK], tl.int32)), rmask & tmp2 & xmask, eviction_policy='evict_last', other=0) | |
| tmp6 = tmp4 - tmp5 | |
| tmp7 = tmp6 * tmp6 | |
| tmp8 = tl.where(tmp2, tmp7, 0) | |
| _tmp9 = tl.where(rmask & xmask, _tmp9 + tmp8, _tmp9) | |
| tmp9 = tl.sum(_tmp9, 1)[:, None] | |
| tl.store(out_ptr0 + x3, tmp9, xmask) | |
| def get_args(): | |
| arg_0 = rand_strided((128, 192, 112, 112), (2408448, 12544, 112, 1), device='cuda:0', dtype=torch.float16) | |
| arg_1 = rand_strided((1, 192, 1, 1), (192, 1, 192, 192), device='cuda:0', dtype=torch.float32) | |
| arg_2 = rand_strided((1, 192, 1, 1, 13), (2496, 1, 2496, 2496, 192), device='cuda:0', dtype=torch.float32) | |
| return arg_0, arg_1, arg_2, | |
| def call(args): | |
| with torch.cuda._DeviceGuard(0): | |
| torch.cuda.set_device(0) | |
| stream0 = get_cuda_stream(0) | |
| triton_red_fused__native_batch_norm_legit_functional_16.run(*args, 2496, 123511, grid=grid(2496), stream=stream0) | |
| def benchmark_all_configs(args): | |
| with torch.cuda._DeviceGuard(0): | |
| torch.cuda.set_device(0) | |
| return triton_red_fused__native_batch_norm_legit_functional_16.benchmark_all_configs(*args, 2496, 123511, grid=grid(2496)) | |
| if __name__ == '__main__': | |
| from torch._inductor.utils import get_num_bytes | |
| from triton.testing import do_bench | |
| args = get_args() | |
| ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
| num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9 | |
| gb_per_s = num_gb / (ms / 1e3) | |
| print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
| ''') | |
| # kernel path: /tmp/torchinductor_shunting/r2/cr2l4iahxzbm4xda53lhc7nels7ttppa4wfso67ohv6lwu4ut2ei.py | |
| # Original ATen: aten._native_batch_norm_legit_functional | |
| # aten._native_batch_norm_legit_functional => add_17, add_19, convert_element_type_12, mul_24, mul_25, mul_26, rsqrt_3, squeeze_10, var_mean_3 | |
| # NOTE(review): auto-generated TorchInductor kernel; string kept byte-identical. | |
| # Stage 2 of the batch-norm VARIANCE reduction (per channel, 192 total): | |
| #   - sums the 13 squared-deviation partials from kernel _16 (-> out_ptr0); | |
| #   - biased var = sum/1605632; writes rsqrt(var + 1e-05) to out_ptr1; | |
| #   - running_var EMA to out_ptr2: 0.1*(var * 1.0000006228081046) + 0.9*old, | |
| #     where 1.0000006228081046 = 1605632/1605631, i.e. the N/(N-1) Bessel | |
| #     correction applied to the biased variance before the momentum update. | |
| triton_per_fused__native_batch_norm_legit_functional_17 = async_compile.triton(''' | |
| import triton | |
| import triton.language as tl | |
| from torch._inductor.ir import ReductionHint | |
| from torch._inductor.ir import TileHint | |
| from torch._inductor.triton_heuristics import persistent_reduction | |
| from torch._inductor.utils import instance_descriptor | |
| from torch._dynamo.testing import rand_strided | |
| from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
| import torch | |
| from torch._inductor.triton_heuristics import grid | |
| @persistent_reduction( | |
| size_hints=[256, 16], | |
| reduction_hint=ReductionHint.INNER, | |
| filename=__file__, | |
| meta={'signature': {0: '*fp32', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: 'i32', 6: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5), equal_to_1=())]} | |
| ) | |
| @triton.jit | |
| def triton_per_fused__native_batch_norm_legit_functional_17(in_ptr0, in_ptr1, out_ptr0, out_ptr1, out_ptr2, xnumel, rnumel, XBLOCK : tl.constexpr): | |
| xnumel = 192 | |
| rnumel = 13 | |
| RBLOCK: tl.constexpr = 16 | |
| xoffset = tl.program_id(0) * XBLOCK | |
| xindex = xoffset + tl.arange(0, XBLOCK)[:, None] | |
| xmask = xindex < xnumel | |
| rindex = tl.arange(0, RBLOCK)[None, :] | |
| rmask = rindex < rnumel | |
| r1 = rindex | |
| x0 = xindex | |
| tmp0 = tl.load(in_ptr0 + (x0 + (192*r1)), rmask & xmask, other=0) | |
| tmp13 = tl.load(in_ptr1 + (x0), xmask) | |
| tmp2 = tl.where(rmask & xmask, tmp0, 0) | |
| tmp3 = tl.sum(tmp2, 1)[:, None] | |
| tmp4 = 1605632.0 | |
| tmp5 = tmp3 / tmp4 | |
| tmp6 = 1e-05 | |
| tmp7 = tmp5 + tmp6 | |
| tmp8 = tl.math.rsqrt(tmp7) | |
| tmp9 = 1.0000006228081046 | |
| tmp10 = tmp5 * tmp9 | |
| tmp11 = 0.1 | |
| tmp12 = tmp10 * tmp11 | |
| tmp14 = 0.9 | |
| tmp15 = tmp13 * tmp14 | |
| tmp16 = tmp12 + tmp15 | |
| tl.store(out_ptr1 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp8, xmask) | |
| tl.store(out_ptr2 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp16, xmask) | |
| tl.store(out_ptr0 + x0, tmp3, xmask) | |
| def get_args(): | |
| arg_0 = rand_strided((1, 192, 1, 1, 13), (2496, 1, 2496, 2496, 192), device='cuda:0', dtype=torch.float32) | |
| arg_1 = rand_strided((192,), (1,), device='cuda:0', dtype=torch.float32) | |
| arg_2 = rand_strided((1, 192, 1, 1), (192, 1, 192, 192), device='cuda:0', dtype=torch.float32) | |
| arg_3 = rand_strided((192,), (1,), device='cuda:0', dtype=torch.float32) | |
| arg_4 = rand_strided((192,), (1,), device='cuda:0', dtype=torch.float32) | |
| return arg_0, arg_1, arg_2, arg_3, arg_4, | |
| def call(args): | |
| with torch.cuda._DeviceGuard(0): | |
| torch.cuda.set_device(0) | |
| stream0 = get_cuda_stream(0) | |
| triton_per_fused__native_batch_norm_legit_functional_17.run(*args, 192, 13, grid=grid(192), stream=stream0) | |
| def benchmark_all_configs(args): | |
| with torch.cuda._DeviceGuard(0): | |
| torch.cuda.set_device(0) | |
| return triton_per_fused__native_batch_norm_legit_functional_17.benchmark_all_configs(*args, 192, 13, grid=grid(192)) | |
| if __name__ == '__main__': | |
| from torch._inductor.utils import get_num_bytes | |
| from triton.testing import do_bench | |
| args = get_args() | |
| ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
| num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9 | |
| gb_per_s = num_gb / (ms / 1e3) | |
| print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
| ''') | |
| # kernel path: /tmp/torchinductor_shunting/7k/c7k35ubu25j3ryiaxnuo2fjojvhqvy67tox3wtaa4knc5l4vzqby.py | |
| # Original ATen: aten._native_batch_norm_legit_functional, aten.relu, aten.threshold_backward | |
| # aten._native_batch_norm_legit_functional => add_17, add_20, convert_element_type_12, convert_element_type_13, mul_21, mul_27, rsqrt_3, sub_3, var_mean_3 | |
| # aten.relu => relu_2 | |
| # aten.threshold_backward => le_4 | |
| # NOTE(review): auto-generated TorchInductor kernel; string kept byte-identical. | |
| # Pointwise apply step of the batch norm: normalizes the fp16 | |
| # (128, 192, 112, 112) activation with the per-channel mean (in_ptr1) and | |
| # squared-deviation sum (in_ptr2, divided by 1605632 then rsqrt(.+1e-05)), | |
| # applies the affine weight/bias (in_ptr3/in_ptr4), casts back to fp16 and | |
| # takes ReLU (the nested tl.where with a constant-false first branch is | |
| # inductor's generated relu form). Also emits the i1 mask (out <= 0) that | |
| # aten.threshold_backward reuses in the backward pass. | |
| triton_poi_fused__native_batch_norm_legit_functional_relu_threshold_backward_18 = async_compile.triton(''' | |
| import triton | |
| import triton.language as tl | |
| from torch._inductor.ir import ReductionHint | |
| from torch._inductor.ir import TileHint | |
| from torch._inductor.triton_heuristics import pointwise | |
| from torch._inductor.utils import instance_descriptor | |
| from torch._dynamo.testing import rand_strided | |
| from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
| import torch | |
| from torch._inductor.triton_heuristics import grid | |
| @pointwise(size_hints=[536870912], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp16', 6: '*i1', 7: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7), equal_to_1=())]}) | |
| @triton.jit | |
| def triton_poi_fused__native_batch_norm_legit_functional_relu_threshold_backward_18(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, out_ptr0, out_ptr1, xnumel, XBLOCK : tl.constexpr): | |
| xnumel = 308281344 | |
| xoffset = tl.program_id(0) * XBLOCK | |
| xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
| xmask = xindex < xnumel | |
| x3 = xindex | |
| x1 = (xindex // 12544) % 192 | |
| tmp0 = tl.load(in_ptr0 + (x3), None).to(tl.float32) | |
| tmp2 = tl.load(in_ptr1 + (x1), None) | |
| tmp4 = tl.load(in_ptr2 + (x1), None) | |
| tmp11 = tl.load(in_ptr3 + (x1), None) | |
| tmp13 = tl.load(in_ptr4 + (x1), None) | |
| tmp1 = tmp0.to(tl.float32) | |
| tmp3 = tmp1 - tmp2 | |
| tmp5 = 1605632.0 | |
| tmp6 = tmp4 / tmp5 | |
| tmp7 = 1e-05 | |
| tmp8 = tmp6 + tmp7 | |
| tmp9 = tl.math.rsqrt(tmp8) | |
| tmp10 = tmp3 * tmp9 | |
| tmp12 = tmp10 * tmp11 | |
| tmp14 = tmp12 + tmp13 | |
| tmp15 = tmp14.to(tl.float32) | |
| tmp16 = tl.where(0 != 0, 0, tl.where(0 > tmp15, 0, tmp15)) | |
| tmp17 = 0.0 | |
| tmp18 = tmp16 <= tmp17 | |
| tl.store(out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp16, None) | |
| tl.store(out_ptr1 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp18, None) | |
| def get_args(): | |
| arg_0 = rand_strided((128, 192, 112, 112), (2408448, 12544, 112, 1), device='cuda:0', dtype=torch.float16) | |
| arg_1 = rand_strided((1, 192, 1, 1), (192, 1, 192, 192), device='cuda:0', dtype=torch.float32) | |
| arg_2 = rand_strided((1, 192, 1, 1), (192, 1, 192, 192), device='cuda:0', dtype=torch.float32) | |
| arg_3 = rand_strided((192,), (1,), device='cuda:0', dtype=torch.float32) | |
| arg_4 = rand_strided((192,), (1,), device='cuda:0', dtype=torch.float32) | |
| arg_5 = rand_strided((128, 192, 112, 112), (2408448, 12544, 112, 1), device='cuda:0', dtype=torch.float16) | |
| arg_6 = rand_strided((128, 192, 112, 112), (2408448, 12544, 112, 1), device='cuda:0', dtype=torch.bool) | |
| return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, | |
| def call(args): | |
| with torch.cuda._DeviceGuard(0): | |
| torch.cuda.set_device(0) | |
| stream0 = get_cuda_stream(0) | |
| triton_poi_fused__native_batch_norm_legit_functional_relu_threshold_backward_18.run(*args, 308281344, grid=grid(308281344), stream=stream0) | |
| def benchmark_all_configs(args): | |
| with torch.cuda._DeviceGuard(0): | |
| torch.cuda.set_device(0) | |
| return triton_poi_fused__native_batch_norm_legit_functional_relu_threshold_backward_18.benchmark_all_configs(*args, 308281344, grid=grid(308281344)) | |
| if __name__ == '__main__': | |
| from torch._inductor.utils import get_num_bytes | |
| from triton.testing import do_bench | |
| args = get_args() | |
| ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
| num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9 | |
| gb_per_s = num_gb / (ms / 1e3) | |
| print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
| ''') | |
| # kernel path: /tmp/torchinductor_shunting/db/cdbk64vwqs4pzrtrb762fxgfklb7q573zm3neisxuhpcneczshvk.py | |
| # Original ATen: aten._to_copy | |
| # aten._to_copy => convert_element_type_14 | |
| # NOTE(review): auto-generated TorchInductor kernel; string kept byte-identical. | |
| # Elementwise fp32 -> fp16 downcast of a 576-element tensor; per get_args it is | |
| # a (64, 1, 3, 3) weight -- the (C, 1, kH, kW) shape suggests a depthwise-conv | |
| # filter, TODO confirm against the caller graph. Same cast template as | |
| # triton_poi_fused__to_copy_12 with a different element count. | |
| triton_poi_fused__to_copy_19 = async_compile.triton(''' | |
| import triton | |
| import triton.language as tl | |
| from torch._inductor.ir import ReductionHint | |
| from torch._inductor.ir import TileHint | |
| from torch._inductor.triton_heuristics import pointwise | |
| from torch._inductor.utils import instance_descriptor | |
| from torch._dynamo.testing import rand_strided | |
| from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
| import torch | |
| from torch._inductor.triton_heuristics import grid | |
| @pointwise(size_hints=[1024], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]}) | |
| @triton.jit | |
| def triton_poi_fused__to_copy_19(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr): | |
| xnumel = 576 | |
| xoffset = tl.program_id(0) * XBLOCK | |
| xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
| xmask = xindex < xnumel | |
| x0 = xindex | |
| tmp0 = tl.load(in_ptr0 + (x0), xmask) | |
| tmp1 = tmp0.to(tl.float32) | |
| tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask) | |
| def get_args(): | |
| arg_0 = rand_strided((64, 1, 3, 3), (9, 9, 3, 1), device='cuda:0', dtype=torch.float32) | |
| arg_1 = rand_strided((64, 1, 3, 3), (9, 9, 3, 1), device='cuda:0', dtype=torch.float16) | |
| return arg_0, arg_1, | |
| def call(args): | |
| with torch.cuda._DeviceGuard(0): | |
| torch.cuda.set_device(0) | |
| stream0 = get_cuda_stream(0) | |
| triton_poi_fused__to_copy_19.run(*args, 576, grid=grid(576), stream=stream0) | |
| def benchmark_all_configs(args): | |
| with torch.cuda._DeviceGuard(0): | |
| torch.cuda.set_device(0) | |
| return triton_poi_fused__to_copy_19.benchmark_all_configs(*args, 576, grid=grid(576)) | |
| if __name__ == '__main__': | |
| from torch._inductor.utils import get_num_bytes | |
| from triton.testing import do_bench | |
| args = get_args() | |
| ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
| num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9 | |
| gb_per_s = num_gb / (ms / 1e3) | |
| print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
| ''') | |
| # kernel path: /tmp/torchinductor_shunting/fq/cfq5oxsb4jr7dbzocvxxr77nzkg2jk4k3rsmhwdpzkoisxiavav5.py | |
| # Original ATen: aten._to_copy | |
| # aten._to_copy => convert_element_type_15 | |
# Inductor-generated Triton kernel (compiled lazily by AsyncCompile).
# Element-wise fp32 -> fp16 cast of 1600 values; the get_args fixture shows the
# intended operand is a (64, 1, 5, 5) contiguous weight tensor.
# NOTE(review): the triple-quoted text below is executable kernel source passed
# to async_compile.triton -- do not hand-edit it; regenerate with Inductor.
triton_poi_fused__to_copy_20 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[2048], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_20(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 1600
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
    arg_0 = rand_strided((64, 1, 5, 5), (25, 25, 5, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((64, 1, 5, 5), (25, 25, 5, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_20.run(*args, 1600, grid=grid(1600), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_20.benchmark_all_configs(*args, 1600, grid=grid(1600))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
| # kernel path: /tmp/torchinductor_shunting/fo/cfoj5fwz53k7xkfcu6z676snb5mum4wtda3diyiuxkkhrnlissnf.py | |
| # Original ATen: aten._to_copy | |
| # aten._to_copy => convert_element_type_16 | |
# Inductor-generated Triton kernel (compiled lazily by AsyncCompile).
# Element-wise fp32 -> fp16 cast of 3136 values; the get_args fixture shows the
# intended operand is a (64, 1, 7, 7) contiguous weight tensor.
# NOTE(review): the triple-quoted text below is executable kernel source passed
# to async_compile.triton -- do not hand-edit it; regenerate with Inductor.
triton_poi_fused__to_copy_21 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4096], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_21(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 3136
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
    arg_0 = rand_strided((64, 1, 7, 7), (49, 49, 7, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((64, 1, 7, 7), (49, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_21.run(*args, 3136, grid=grid(3136), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_21.benchmark_all_configs(*args, 3136, grid=grid(3136))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
| # kernel path: /tmp/torchinductor_shunting/ye/cyesscl4aqq7rzyjhll5ppohr4h424gcitducugsmauai5wnvgy5.py | |
| # Original ATen: aten.cat | |
| # aten.cat => cat_1 | |
# Inductor-generated Triton kernel implementing one branch of aten.cat.
# Copies 25,690,112 fp16 values from a contiguous input into a larger output
# whose per-batch stride is 602112 (store offset x0 + 602112*x1), i.e. writes
# this input into its channel slice of the concatenated buffer.  Per get_args,
# input is (128, 64, 56, 56) contiguous and the output view is the same shape
# with batch stride 602112 (a 192-channel destination; 192*3136 == 602112).
# NOTE(review): the triple-quoted text below is executable kernel source passed
# to async_compile.triton -- do not hand-edit it; regenerate with Inductor.
triton_poi_fused_cat_22 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[33554432], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_cat_22(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 25690112
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x2 = xindex
    x0 = xindex % 200704
    x1 = (xindex // 200704)
    tmp0 = tl.load(in_ptr0 + (x2), None).to(tl.float32)
    tl.store(out_ptr0 + (x0 + (602112*x1) + tl.zeros([XBLOCK], tl.int32)), tmp0, None)
def get_args():
    arg_0 = rand_strided((128, 64, 56, 56), (200704, 3136, 56, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 64, 56, 56), (602112, 3136, 56, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused_cat_22.run(*args, 25690112, grid=grid(25690112), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused_cat_22.benchmark_all_configs(*args, 25690112, grid=grid(25690112))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
| # kernel path: /tmp/torchinductor_shunting/zm/czmjlzsknic276jj2ulhzekdx7suavkcesp75xqpnfbntqprzsrx.py | |
| # Original ATen: aten._native_batch_norm_legit_functional | |
| # aten._native_batch_norm_legit_functional => convert_element_type_17, var_mean_4 | |
# Inductor-generated Triton reduction kernel: first stage of the batch-norm
# mean.  xnumel = 768 = 192 channels x 4 reduction splits (x0 = channel,
# x1 = split); each program sums rnumel = 100352 fp16 activations in fp32 and
# writes one partial sum per (channel, split) into a (1, 192, 1, 1, 4) buffer.
# A later kernel (…_24 below) combines the 4 partials per channel.
# NOTE(review): the triple-quoted text below is executable kernel source passed
# to async_compile.triton -- do not hand-edit it; regenerate with Inductor.
triton_red_fused__native_batch_norm_legit_functional_23 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@reduction(
    size_hints=[1024, 131072],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32', 3: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_23(in_ptr0, out_ptr0, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
    xnumel = 768
    rnumel = 100352
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rbase = tl.arange(0, RBLOCK)[None, :]
    x0 = xindex % 192
    x1 = (xindex // 192)
    _tmp2 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
    x3 = xindex
    for roffset in range(0, rnumel, RBLOCK):
        rindex = roffset + rbase
        rmask = rindex < rnumel
        r2 = rindex
        tmp0 = tl.load(in_ptr0 + ((3136*x0) + (602112*(r2 // 3136)) + (19267584*x1) + (r2 % 3136)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp1 = tmp0.to(tl.float32)
        _tmp2 = tl.where(rmask & xmask, _tmp2 + tmp1, _tmp2)
    tmp2 = tl.sum(_tmp2, 1)[:, None]
    tl.store(out_ptr0 + x3, tmp2, xmask)
def get_args():
    arg_0 = rand_strided((128, 192, 56, 56), (602112, 3136, 56, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((1, 192, 1, 1, 4), (768, 1, 768, 768, 192), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_red_fused__native_batch_norm_legit_functional_23.run(*args, 768, 100352, grid=grid(768), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_red_fused__native_batch_norm_legit_functional_23.benchmark_all_configs(*args, 768, 100352, grid=grid(768))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
| # kernel path: /tmp/torchinductor_shunting/zp/czpkjhwvqvlhuxsldhiwsf77gmyfxqbtysyyhp7lberdnzehm42l.py | |
| # Original ATen: aten._native_batch_norm_legit_functional | |
| # aten._native_batch_norm_legit_functional => add_23, convert_element_type_17, mul_29, mul_30, var_mean_4 | |
# Inductor-generated Triton persistent-reduction kernel: second stage of the
# batch-norm mean.  For each of the 192 channels it sums the 4 partial sums
# from the previous kernel, divides by 401408 (= 128*56*56 elements/channel)
# to get the batch mean (stored in-place via in_out_ptr0), and updates the
# running mean as 0.1*batch_mean + 0.9*old_running_mean (momentum 0.1).
# NOTE(review): the triple-quoted text below is executable kernel source passed
# to async_compile.triton -- do not hand-edit it; regenerate with Inductor.
triton_per_fused__native_batch_norm_legit_functional_24 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import persistent_reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@persistent_reduction(
    size_hints=[256, 4],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*fp32', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: 'i32', 5: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4), equal_to_1=())]}
)
@triton.jit
def triton_per_fused__native_batch_norm_legit_functional_24(in_out_ptr0, in_ptr0, in_ptr1, out_ptr0, xnumel, rnumel, XBLOCK : tl.constexpr):
    xnumel = 192
    rnumel = 4
    RBLOCK: tl.constexpr = 4
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rindex = tl.arange(0, RBLOCK)[None, :]
    rmask = rindex < rnumel
    r1 = rindex
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0 + (192*r1)), rmask & xmask, other=0)
    tmp8 = tl.load(in_ptr1 + (x0), xmask)
    tmp2 = tl.where(rmask & xmask, tmp0, 0)
    tmp3 = tl.sum(tmp2, 1)[:, None]
    tmp4 = 401408.0
    tmp5 = tmp3 / tmp4
    tmp6 = 0.1
    tmp7 = tmp5 * tmp6
    tmp9 = 0.9
    tmp10 = tmp8 * tmp9
    tmp11 = tmp7 + tmp10
    tl.store(in_out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp5, xmask)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp11, xmask)
def get_args():
    arg_0 = rand_strided((1, 192, 1, 1), (192, 1, 192, 192), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((1, 192, 1, 1, 4), (768, 1, 768, 768, 192), device='cuda:0', dtype=torch.float32)
    arg_2 = rand_strided((192,), (1,), device='cuda:0', dtype=torch.float32)
    arg_3 = rand_strided((192,), (1,), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1, arg_2, arg_3,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_per_fused__native_batch_norm_legit_functional_24.run(*args, 192, 4, grid=grid(192), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_per_fused__native_batch_norm_legit_functional_24.benchmark_all_configs(*args, 192, 4, grid=grid(192))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
| # kernel path: /tmp/torchinductor_shunting/4o/c4ohubk3q7bkigrpxxijrzzlhs5bqi6qcn3fxomltgipjmowqgd2.py | |
| # Original ATen: aten._native_batch_norm_legit_functional | |
| # aten._native_batch_norm_legit_functional => convert_element_type_17, var_mean_4 | |
# Inductor-generated Triton reduction kernel: first stage of the batch-norm
# variance.  Same 192-channel x 4-split layout as kernel …_23, but each
# program accumulates sum((x - mean)^2) in fp32, reading the per-channel mean
# from in_ptr1 and the fp16 activations from in_ptr0.  Partial sums land in a
# (1, 192, 1, 1, 4) buffer combined by kernel …_26 below.
# NOTE(review): the triple-quoted text below is executable kernel source passed
# to async_compile.triton -- do not hand-edit it; regenerate with Inductor.
triton_red_fused__native_batch_norm_legit_functional_25 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@reduction(
    size_hints=[1024, 131072],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*fp16', 1: '*fp32', 2: '*fp32', 3: 'i32', 4: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_25(in_ptr0, in_ptr1, out_ptr0, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
    xnumel = 768
    rnumel = 100352
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rbase = tl.arange(0, RBLOCK)[None, :]
    x0 = xindex % 192
    x1 = (xindex // 192)
    tmp2 = tl.load(in_ptr1 + (x0), xmask)
    _tmp5 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
    x3 = xindex
    for roffset in range(0, rnumel, RBLOCK):
        rindex = roffset + rbase
        rmask = rindex < rnumel
        r2 = rindex
        tmp0 = tl.load(in_ptr0 + ((3136*x0) + (602112*(r2 // 3136)) + (19267584*x1) + (r2 % 3136)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp1 = tmp0.to(tl.float32)
        tmp3 = tmp1 - tmp2
        tmp4 = tmp3 * tmp3
        _tmp5 = tl.where(rmask & xmask, _tmp5 + tmp4, _tmp5)
    tmp5 = tl.sum(_tmp5, 1)[:, None]
    tl.store(out_ptr0 + x3, tmp5, xmask)
def get_args():
    arg_0 = rand_strided((128, 192, 56, 56), (602112, 3136, 56, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((1, 192, 1, 1), (192, 1, 192, 192), device='cuda:0', dtype=torch.float32)
    arg_2 = rand_strided((1, 192, 1, 1, 4), (768, 1, 768, 768, 192), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1, arg_2,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_red_fused__native_batch_norm_legit_functional_25.run(*args, 768, 100352, grid=grid(768), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_red_fused__native_batch_norm_legit_functional_25.benchmark_all_configs(*args, 768, 100352, grid=grid(768))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
| # kernel path: /tmp/torchinductor_shunting/kw/ckworjll2g6m5sbalty5hh27jhwuiyj3iqglt67oatdxz54d3igy.py | |
| # Original ATen: aten._native_batch_norm_legit_functional | |
| # aten._native_batch_norm_legit_functional => add_22, add_24, convert_element_type_17, mul_31, mul_32, mul_33, rsqrt_4, squeeze_13, var_mean_4 | |
# Inductor-generated Triton persistent-reduction kernel: second stage of the
# batch-norm variance.  Per channel it sums the 4 squared-deviation partials,
# computes var = sum/401408, then rsqrt(var + 1e-5) (out_ptr1) for the
# normalize step, applies the unbiased correction factor 1.0000024912370735
# (presumably n/(n-1) for n=401408 -- matches numerically) and folds it into
# the running variance as 0.1*unbiased_var + 0.9*old (out_ptr2); the raw sum
# is also stored to out_ptr0.
# NOTE(review): the triple-quoted text below is executable kernel source passed
# to async_compile.triton -- do not hand-edit it; regenerate with Inductor.
triton_per_fused__native_batch_norm_legit_functional_26 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import persistent_reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@persistent_reduction(
    size_hints=[256, 4],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*fp32', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: 'i32', 6: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5), equal_to_1=())]}
)
@triton.jit
def triton_per_fused__native_batch_norm_legit_functional_26(in_ptr0, in_ptr1, out_ptr0, out_ptr1, out_ptr2, xnumel, rnumel, XBLOCK : tl.constexpr):
    xnumel = 192
    rnumel = 4
    RBLOCK: tl.constexpr = 4
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rindex = tl.arange(0, RBLOCK)[None, :]
    rmask = rindex < rnumel
    r1 = rindex
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0 + (192*r1)), rmask & xmask, other=0)
    tmp13 = tl.load(in_ptr1 + (x0), xmask)
    tmp2 = tl.where(rmask & xmask, tmp0, 0)
    tmp3 = tl.sum(tmp2, 1)[:, None]
    tmp4 = 401408.0
    tmp5 = tmp3 / tmp4
    tmp6 = 1e-05
    tmp7 = tmp5 + tmp6
    tmp8 = tl.math.rsqrt(tmp7)
    tmp9 = 1.0000024912370735
    tmp10 = tmp5 * tmp9
    tmp11 = 0.1
    tmp12 = tmp10 * tmp11
    tmp14 = 0.9
    tmp15 = tmp13 * tmp14
    tmp16 = tmp12 + tmp15
    tl.store(out_ptr1 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp8, xmask)
    tl.store(out_ptr2 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp16, xmask)
    tl.store(out_ptr0 + x0, tmp3, xmask)
def get_args():
    arg_0 = rand_strided((1, 192, 1, 1, 4), (768, 1, 768, 768, 192), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((192,), (1,), device='cuda:0', dtype=torch.float32)
    arg_2 = rand_strided((1, 192, 1, 1), (192, 1, 192, 192), device='cuda:0', dtype=torch.float32)
    arg_3 = rand_strided((192,), (1,), device='cuda:0', dtype=torch.float32)
    arg_4 = rand_strided((192,), (1,), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1, arg_2, arg_3, arg_4,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_per_fused__native_batch_norm_legit_functional_26.run(*args, 192, 4, grid=grid(192), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_per_fused__native_batch_norm_legit_functional_26.benchmark_all_configs(*args, 192, 4, grid=grid(192))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
| # kernel path: /tmp/torchinductor_shunting/rb/crbx6irdjcatfhmyhdhxdmxoun65x4lirw3usotlxxknzb3rnzo3.py | |
| # Original ATen: aten._native_batch_norm_legit_functional, aten.relu, aten.threshold_backward | |
| # aten._native_batch_norm_legit_functional => add_22, add_25, convert_element_type_17, convert_element_type_18, mul_28, mul_34, rsqrt_4, sub_4, var_mean_4 | |
| # aten.relu => relu_3 | |
| # aten.threshold_backward => le_3 | |
# Inductor-generated Triton pointwise kernel fusing batch-norm application,
# ReLU, and the threshold_backward mask.  For each of 77,070,336 elements it
# normalizes ((x - mean) * rsqrt(var/401408 + 1e-5)), applies weight/bias,
# casts back to fp16, clamps at zero (ReLU), and stores both the activation
# (out_ptr0) and a boolean `output <= 0` mask (out_ptr1) for the backward pass.
# NOTE(review): the triple-quoted text below is executable kernel source passed
# to async_compile.triton -- do not hand-edit it; regenerate with Inductor.
triton_poi_fused__native_batch_norm_legit_functional_relu_threshold_backward_27 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[134217728], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp16', 6: '*i1', 7: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7), equal_to_1=())]})
@triton.jit
def triton_poi_fused__native_batch_norm_legit_functional_relu_threshold_backward_27(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, out_ptr0, out_ptr1, xnumel, XBLOCK : tl.constexpr):
    xnumel = 77070336
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x3 = xindex
    x1 = (xindex // 3136) % 192
    tmp0 = tl.load(in_ptr0 + (x3), None).to(tl.float32)
    tmp2 = tl.load(in_ptr1 + (x1), None)
    tmp4 = tl.load(in_ptr2 + (x1), None)
    tmp11 = tl.load(in_ptr3 + (x1), None)
    tmp13 = tl.load(in_ptr4 + (x1), None)
    tmp1 = tmp0.to(tl.float32)
    tmp3 = tmp1 - tmp2
    tmp5 = 401408.0
    tmp6 = tmp4 / tmp5
    tmp7 = 1e-05
    tmp8 = tmp6 + tmp7
    tmp9 = tl.math.rsqrt(tmp8)
    tmp10 = tmp3 * tmp9
    tmp12 = tmp10 * tmp11
    tmp14 = tmp12 + tmp13
    tmp15 = tmp14.to(tl.float32)
    tmp16 = tl.where(0 != 0, 0, tl.where(0 > tmp15, 0, tmp15))
    tmp17 = 0.0
    tmp18 = tmp16 <= tmp17
    tl.store(out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp16, None)
    tl.store(out_ptr1 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp18, None)
def get_args():
    arg_0 = rand_strided((128, 192, 56, 56), (602112, 3136, 56, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((1, 192, 1, 1), (192, 1, 192, 192), device='cuda:0', dtype=torch.float32)
    arg_2 = rand_strided((1, 192, 1, 1), (192, 1, 192, 192), device='cuda:0', dtype=torch.float32)
    arg_3 = rand_strided((192,), (1,), device='cuda:0', dtype=torch.float32)
    arg_4 = rand_strided((192,), (1,), device='cuda:0', dtype=torch.float32)
    arg_5 = rand_strided((128, 192, 56, 56), (602112, 3136, 56, 1), device='cuda:0', dtype=torch.float16)
    arg_6 = rand_strided((128, 192, 56, 56), (602112, 3136, 56, 1), device='cuda:0', dtype=torch.bool)
    return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__native_batch_norm_legit_functional_relu_threshold_backward_27.run(*args, 77070336, grid=grid(77070336), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__native_batch_norm_legit_functional_relu_threshold_backward_27.benchmark_all_configs(*args, 77070336, grid=grid(77070336))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
| # kernel path: /tmp/torchinductor_shunting/yr/cyrrsvkikbwrkovp7p3kp54xrx3kv5k3cuej452rtgigaetgnkzf.py | |
| # Original ATen: aten._to_copy | |
| # aten._to_copy => convert_element_type_19 | |
# Inductor-generated Triton kernel (compiled lazily by AsyncCompile).
# Element-wise fp32 -> fp16 cast of 1920 values; the get_args fixture shows the
# intended operand is a (20, 96, 1, 1) pointwise-conv weight tensor.
# NOTE(review): the triple-quoted text below is executable kernel source passed
# to async_compile.triton -- do not hand-edit it; regenerate with Inductor.
triton_poi_fused__to_copy_28 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[2048], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_28(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 1920
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
    arg_0 = rand_strided((20, 96, 1, 1), (96, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((20, 96, 1, 1), (96, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_28.run(*args, 1920, grid=grid(1920), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_28.benchmark_all_configs(*args, 1920, grid=grid(1920))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
| # kernel path: /tmp/torchinductor_shunting/lx/clxgqhjnos6ucboracnh3kmrmn5mwuc3sgkffihrftfhuwl64wg2.py | |
| # Original ATen: aten.cat | |
| # aten.cat => cat_2 | |
# Inductor-generated Triton kernel implementing one branch of aten.cat.
# Copies 8,028,160 fp16 values from a contiguous (128, 20, 56, 56) input into
# its channel slice of a destination with per-batch stride 125440 (store
# offset x0 + 125440*x1), i.e. a 40-channel concatenated buffer
# (40*3136 == 125440), per the get_args fixture.
# NOTE(review): the triple-quoted text below is executable kernel source passed
# to async_compile.triton -- do not hand-edit it; regenerate with Inductor.
triton_poi_fused_cat_29 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[8388608], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_cat_29(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 8028160
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x2 = xindex
    x0 = xindex % 62720
    x1 = (xindex // 62720)
    tmp0 = tl.load(in_ptr0 + (x2), None).to(tl.float32)
    tl.store(out_ptr0 + (x0 + (125440*x1) + tl.zeros([XBLOCK], tl.int32)), tmp0, None)
def get_args():
    arg_0 = rand_strided((128, 20, 56, 56), (62720, 3136, 56, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 20, 56, 56), (125440, 3136, 56, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused_cat_29.run(*args, 8028160, grid=grid(8028160), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused_cat_29.benchmark_all_configs(*args, 8028160, grid=grid(8028160))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/e3/ce3edr4giy7ygyubb445dsh6wavenlh7yhdxr5odchwzb5blm45m.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => convert_element_type_21, var_mean_5
#
# Stage 1 of a split batch-norm mean reduction over a (128, 40, 56, 56) fp16
# input.  The per-channel reduction domain (128*56*56 = 401408 elements) is
# split into 13 chunks of rnumel = 30878; xnumel = 520 = 40 channels * 13
# chunks.  Since 13 * 30878 = 401414 > 401408, the `tmp0 < 401408` predicate
# masks out the overhang in the last chunk.  Each program accumulates its
# chunk in fp32 (`_tmp6`) and stores one partial sum into the fp32 buffer of
# shape (1, 40, 1, 1, 13); a later kernel reduces the 13 partials per channel.
triton_red_fused__native_batch_norm_legit_functional_30 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@reduction(
    size_hints=[1024, 32768],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32', 3: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_30(in_ptr0, out_ptr0, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
    xnumel = 520
    rnumel = 30878
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rbase = tl.arange(0, RBLOCK)[None, :]
    x1 = (xindex // 40)
    x0 = xindex % 40
    _tmp6 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
    x3 = xindex
    for roffset in range(0, rnumel, RBLOCK):
        rindex = roffset + rbase
        rmask = rindex < rnumel
        r2 = rindex
        tmp0 = r2 + (30878*x1)
        tmp1 = 401408
        tmp2 = tmp0 < tmp1
        tmp3 = tl.load(in_ptr0 + ((3136*x0) + (125440*(((r2 + (30878*x1)) // 3136) % 128)) + ((r2 + (30878*x1)) % 3136) + tl.zeros([XBLOCK, RBLOCK], tl.int32)), rmask & tmp2 & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp4 = tmp3.to(tl.float32)
        tmp5 = tl.where(tmp2, tmp4, 0)
        _tmp6 = tl.where(rmask & xmask, _tmp6 + tmp5, _tmp6)
    tmp6 = tl.sum(_tmp6, 1)[:, None]
    tl.store(out_ptr0 + x3, tmp6, xmask)

def get_args():
    arg_0 = rand_strided((128, 40, 56, 56), (125440, 3136, 56, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((1, 40, 1, 1, 13), (520, 1, 520, 520, 40), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1,

def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_red_fused__native_batch_norm_legit_functional_30.run(*args, 520, 30878, grid=grid(520), stream=stream0)

def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_red_fused__native_batch_norm_legit_functional_30.benchmark_all_configs(*args, 520, 30878, grid=grid(520))

if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/5l/c5ldagjb25qlt4w6hnm2s5f7x7ddcw2evdosdg3pvhtpoduxelhs.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => add_28, convert_element_type_21, mul_36, mul_37, var_mean_5
#
# Stage 2 of the split mean reduction: for each of the 40 channels, sums the
# 13 partial sums produced by kernel _30 (persistent reduction, RBLOCK = 16
# covers rnumel = 13) and divides by 401408 to get the batch mean, written
# in place to in_out_ptr0.  It also folds in the running-mean update
# new_running = 0.1 * batch_mean + 0.9 * old_running (momentum-style blend;
# old value read from in_ptr1, result written to out_ptr0).
triton_per_fused__native_batch_norm_legit_functional_31 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import persistent_reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@persistent_reduction(
    size_hints=[64, 16],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*fp32', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: 'i32', 5: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3), equal_to_1=())]}
)
@triton.jit
def triton_per_fused__native_batch_norm_legit_functional_31(in_out_ptr0, in_ptr0, in_ptr1, out_ptr0, xnumel, rnumel, XBLOCK : tl.constexpr):
    xnumel = 40
    rnumel = 13
    RBLOCK: tl.constexpr = 16
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rindex = tl.arange(0, RBLOCK)[None, :]
    rmask = rindex < rnumel
    r1 = rindex
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0 + (40*r1)), rmask & xmask, other=0)
    tmp8 = tl.load(in_ptr1 + (x0), xmask)
    tmp2 = tl.where(rmask & xmask, tmp0, 0)
    tmp3 = tl.sum(tmp2, 1)[:, None]
    tmp4 = 401408.0
    tmp5 = tmp3 / tmp4
    tmp6 = 0.1
    tmp7 = tmp5 * tmp6
    tmp9 = 0.9
    tmp10 = tmp8 * tmp9
    tmp11 = tmp7 + tmp10
    tl.store(in_out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp5, xmask)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp11, xmask)

def get_args():
    arg_0 = rand_strided((1, 40, 1, 1), (40, 1, 40, 40), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((1, 40, 1, 1, 13), (520, 1, 520, 520, 40), device='cuda:0', dtype=torch.float32)
    arg_2 = rand_strided((40,), (1,), device='cuda:0', dtype=torch.float32)
    arg_3 = rand_strided((40,), (1,), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1, arg_2, arg_3,

def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_per_fused__native_batch_norm_legit_functional_31.run(*args, 40, 13, grid=grid(40), stream=stream0)

def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_per_fused__native_batch_norm_legit_functional_31.benchmark_all_configs(*args, 40, 13, grid=grid(40))

if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/ko/ckomyuegfrpkii72qwrquhfy6eh53az7jeqam3ghl4ajcmcw2h3l.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => convert_element_type_21, var_mean_5
#
# Stage 1 of the split batch-norm variance reduction, mirroring kernel _30's
# layout (40 channels x 13 chunks of 30878, `tmp0 < 401408` overhang guard).
# Loads the fp16 input and the per-channel batch mean (in_ptr1, broadcast
# over the chunk), accumulates (x - mean)^2 in fp32, and stores one partial
# sum per (channel, chunk) into the (1, 40, 1, 1, 13) fp32 buffer.
triton_red_fused__native_batch_norm_legit_functional_32 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@reduction(
    size_hints=[1024, 32768],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*fp16', 1: '*fp32', 2: '*fp32', 3: 'i32', 4: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_32(in_ptr0, in_ptr1, out_ptr0, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
    xnumel = 520
    rnumel = 30878
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rbase = tl.arange(0, RBLOCK)[None, :]
    x1 = (xindex // 40)
    x0 = xindex % 40
    _tmp9 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
    x3 = xindex
    for roffset in range(0, rnumel, RBLOCK):
        rindex = roffset + rbase
        rmask = rindex < rnumel
        r2 = rindex
        tmp0 = r2 + (30878*x1)
        tmp1 = 401408
        tmp2 = tmp0 < tmp1
        tmp3 = tl.load(in_ptr0 + ((3136*x0) + (125440*(((r2 + (30878*x1)) // 3136) % 128)) + ((r2 + (30878*x1)) % 3136) + tl.zeros([XBLOCK, RBLOCK], tl.int32)), rmask & tmp2 & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp4 = tmp3.to(tl.float32)
        tmp5 = tl.load(in_ptr1 + (x0 + tl.zeros([XBLOCK, RBLOCK], tl.int32)), rmask & tmp2 & xmask, eviction_policy='evict_last', other=0)
        tmp6 = tmp4 - tmp5
        tmp7 = tmp6 * tmp6
        tmp8 = tl.where(tmp2, tmp7, 0)
        _tmp9 = tl.where(rmask & xmask, _tmp9 + tmp8, _tmp9)
    tmp9 = tl.sum(_tmp9, 1)[:, None]
    tl.store(out_ptr0 + x3, tmp9, xmask)

def get_args():
    arg_0 = rand_strided((128, 40, 56, 56), (125440, 3136, 56, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((1, 40, 1, 1), (40, 1, 40, 40), device='cuda:0', dtype=torch.float32)
    arg_2 = rand_strided((1, 40, 1, 1, 13), (520, 1, 520, 520, 40), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1, arg_2,

def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_red_fused__native_batch_norm_legit_functional_32.run(*args, 520, 30878, grid=grid(520), stream=stream0)

def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_red_fused__native_batch_norm_legit_functional_32.benchmark_all_configs(*args, 520, 30878, grid=grid(520))

if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/5p/c5pcmnlklk6a6ee2vm3ta2iounjyiftuynwfzbts7sayeuto4jvn.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => add_27, add_29, convert_element_type_21, mul_38, mul_39, mul_40, rsqrt_5, squeeze_16, var_mean_5
#
# Stage 2 of the split variance reduction: per channel, sums the 13 partial
# squared-difference sums from kernel _32 and derives three outputs:
#   out_ptr0 - the raw summed value (tmp3), kept for downstream use;
#   out_ptr1 - rsqrt(var + 1e-05) with biased var = sum / 401408;
#   out_ptr2 - running-var update 0.1 * (var * 1.0000024912370735) + 0.9 * old,
#              where the constant is the bias correction n/(n-1) for
#              n = 401408 (matches 401408/401407), read old from in_ptr1.
triton_per_fused__native_batch_norm_legit_functional_33 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import persistent_reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@persistent_reduction(
    size_hints=[64, 16],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*fp32', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: 'i32', 6: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4), equal_to_1=())]}
)
@triton.jit
def triton_per_fused__native_batch_norm_legit_functional_33(in_ptr0, in_ptr1, out_ptr0, out_ptr1, out_ptr2, xnumel, rnumel, XBLOCK : tl.constexpr):
    xnumel = 40
    rnumel = 13
    RBLOCK: tl.constexpr = 16
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rindex = tl.arange(0, RBLOCK)[None, :]
    rmask = rindex < rnumel
    r1 = rindex
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0 + (40*r1)), rmask & xmask, other=0)
    tmp13 = tl.load(in_ptr1 + (x0), xmask)
    tmp2 = tl.where(rmask & xmask, tmp0, 0)
    tmp3 = tl.sum(tmp2, 1)[:, None]
    tmp4 = 401408.0
    tmp5 = tmp3 / tmp4
    tmp6 = 1e-05
    tmp7 = tmp5 + tmp6
    tmp8 = tl.math.rsqrt(tmp7)
    tmp9 = 1.0000024912370735
    tmp10 = tmp5 * tmp9
    tmp11 = 0.1
    tmp12 = tmp10 * tmp11
    tmp14 = 0.9
    tmp15 = tmp13 * tmp14
    tmp16 = tmp12 + tmp15
    tl.store(out_ptr1 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp8, xmask)
    tl.store(out_ptr2 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp16, xmask)
    tl.store(out_ptr0 + x0, tmp3, xmask)

def get_args():
    arg_0 = rand_strided((1, 40, 1, 1, 13), (520, 1, 520, 520, 40), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((40,), (1,), device='cuda:0', dtype=torch.float32)
    arg_2 = rand_strided((1, 40, 1, 1), (40, 1, 40, 40), device='cuda:0', dtype=torch.float32)
    arg_3 = rand_strided((40,), (1,), device='cuda:0', dtype=torch.float32)
    arg_4 = rand_strided((40,), (1,), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1, arg_2, arg_3, arg_4,

def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_per_fused__native_batch_norm_legit_functional_33.run(*args, 40, 13, grid=grid(40), stream=stream0)

def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_per_fused__native_batch_norm_legit_functional_33.benchmark_all_configs(*args, 40, 13, grid=grid(40))

if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/pf/cpfbdly4ax5zztymk42pnezqf7ys2im4sb437kjrqjpf4x2jbotn.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => add_27, add_30, convert_element_type_21, convert_element_type_22, mul_35, mul_41, rsqrt_5, sub_5, var_mean_5
#
# Pointwise batch-norm application over the full (128, 40, 56, 56) tensor
# (xnumel = 16056320): out = ((x - mean) * rsqrt(sum_sq/401408 + 1e-05))
# * weight + bias, computed in fp32 and cast back to fp16.  in_ptr2 holds the
# raw per-channel squared-difference sum (out_ptr0 of kernel _33), so the
# rsqrt is recomputed here.  Loads and stores pass mask None (xmask is
# computed but unused) — presumably the launch divides xnumel evenly; the
# divisible_by_16 descriptor covers all args.
triton_poi_fused__native_batch_norm_legit_functional_34 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[16777216], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp16', 6: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6), equal_to_1=())]})
@triton.jit
def triton_poi_fused__native_batch_norm_legit_functional_34(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 16056320
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x3 = xindex
    x1 = (xindex // 3136) % 40
    tmp0 = tl.load(in_ptr0 + (x3), None).to(tl.float32)
    tmp2 = tl.load(in_ptr1 + (x1), None)
    tmp4 = tl.load(in_ptr2 + (x1), None)
    tmp11 = tl.load(in_ptr3 + (x1), None)
    tmp13 = tl.load(in_ptr4 + (x1), None)
    tmp1 = tmp0.to(tl.float32)
    tmp3 = tmp1 - tmp2
    tmp5 = 401408.0
    tmp6 = tmp4 / tmp5
    tmp7 = 1e-05
    tmp8 = tmp6 + tmp7
    tmp9 = tl.math.rsqrt(tmp8)
    tmp10 = tmp3 * tmp9
    tmp12 = tmp10 * tmp11
    tmp14 = tmp12 + tmp13
    tmp15 = tmp14.to(tl.float32)
    tl.store(out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp15, None)

def get_args():
    arg_0 = rand_strided((128, 40, 56, 56), (125440, 3136, 56, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((1, 40, 1, 1), (40, 1, 40, 40), device='cuda:0', dtype=torch.float32)
    arg_2 = rand_strided((1, 40, 1, 1), (40, 1, 40, 40), device='cuda:0', dtype=torch.float32)
    arg_3 = rand_strided((40,), (1,), device='cuda:0', dtype=torch.float32)
    arg_4 = rand_strided((40,), (1,), device='cuda:0', dtype=torch.float32)
    arg_5 = rand_strided((128, 40, 56, 56), (125440, 3136, 56, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5,

def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__native_batch_norm_legit_functional_34.run(*args, 16056320, grid=grid(16056320), stream=stream0)

def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__native_batch_norm_legit_functional_34.benchmark_all_configs(*args, 16056320, grid=grid(16056320))

if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/jt/cjtdu5nq2b37l5fyi5c7vdk3m4aqhqeqbsw3e5ifo4u74lo5bgaq.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_23
#
# Simple dtype cast: copies a 1200-element fp32 buffer (a (60, 20, 1, 1)
# tensor per get_args — presumably a 1x1 conv weight; confirm against the
# caller) to fp16, element for element.
triton_poi_fused__to_copy_35 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[2048], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_35(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 1200
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)

def get_args():
    arg_0 = rand_strided((60, 20, 1, 1), (20, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((60, 20, 1, 1), (20, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,

def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_35.run(*args, 1200, grid=grid(1200), stream=stream0)

def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_35.benchmark_all_configs(*args, 1200, grid=grid(1200))

if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/wj/cwjg3r37ehsowju7qerrsqlothtrltxyyuq5mhnulbp7zor3ocw5.py
# Original ATen: aten.cat
# aten.cat => cat_3
#
# One branch of a channel concatenation: copies a (128, 60, 56, 56) fp16
# tensor (188160 contiguous elements per batch item) into a (128, 120, 56, 56)
# fp16 output whose batch stride is 376320, i.e. into the first 60 of 120
# channels of each batch item.  The other half of the output is presumably
# filled by a sibling cat kernel with a 188160 write offset — not visible here.
triton_poi_fused_cat_36 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[33554432], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_cat_36(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 24084480
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x2 = xindex
    x0 = xindex % 188160
    x1 = (xindex // 188160)
    tmp0 = tl.load(in_ptr0 + (x2), None).to(tl.float32)
    tl.store(out_ptr0 + (x0 + (376320*x1) + tl.zeros([XBLOCK], tl.int32)), tmp0, None)

def get_args():
    arg_0 = rand_strided((128, 60, 56, 56), (188160, 3136, 56, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 60, 56, 56), (376320, 3136, 56, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,

def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused_cat_36.run(*args, 24084480, grid=grid(24084480), stream=stream0)

def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused_cat_36.benchmark_all_configs(*args, 24084480, grid=grid(24084480))

if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/2c/c2cmgfhoarotsw2klaplncsqdxdxrbc3je4bm4idg4672cm2hbzh.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => convert_element_type_25, var_mean_6
#
# Stage 1 of a split batch-norm mean reduction for the 120-channel
# (128, 120, 56, 56) fp16 tensor.  The reduction is split 4 ways across the
# batch dimension: x1 selects a group of 32 batch items (12042240 = 32 *
# 376320) and r spans 100352 = 32 * 3136 elements, so 4 * 100352 covers the
# full 401408-element domain exactly — no overhang guard is needed, unlike
# kernels _30/_32.  Writes fp32 partial sums of shape (1, 120, 1, 1, 4);
# xnumel = 480 = 120 channels * 4 splits.
triton_red_fused__native_batch_norm_legit_functional_37 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@reduction(
    size_hints=[512, 131072],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32', 3: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_37(in_ptr0, out_ptr0, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
    xnumel = 480
    rnumel = 100352
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rbase = tl.arange(0, RBLOCK)[None, :]
    x0 = xindex % 120
    x1 = (xindex // 120)
    _tmp2 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
    x3 = xindex
    for roffset in range(0, rnumel, RBLOCK):
        rindex = roffset + rbase
        rmask = rindex < rnumel
        r2 = rindex
        tmp0 = tl.load(in_ptr0 + ((3136*x0) + (376320*(r2 // 3136)) + (12042240*x1) + (r2 % 3136)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp1 = tmp0.to(tl.float32)
        _tmp2 = tl.where(rmask & xmask, _tmp2 + tmp1, _tmp2)
    tmp2 = tl.sum(_tmp2, 1)[:, None]
    tl.store(out_ptr0 + x3, tmp2, xmask)

def get_args():
    arg_0 = rand_strided((128, 120, 56, 56), (376320, 3136, 56, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((1, 120, 1, 1, 4), (480, 1, 480, 480, 120), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1,

def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_red_fused__native_batch_norm_legit_functional_37.run(*args, 480, 100352, grid=grid(480), stream=stream0)

def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_red_fused__native_batch_norm_legit_functional_37.benchmark_all_configs(*args, 480, 100352, grid=grid(480))

if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/xq/cxq23fscob3jbjdcsao7sicyvado5nsvykscbw7syq3dh73swjgu.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => add_33, convert_element_type_25, mul_43, mul_44, var_mean_6
#
# Stage 2 of the 120-channel split mean reduction: per channel, sums the 4
# partial sums from kernel _37 and divides by 401408 to get the batch mean
# (written in place to in_out_ptr0), then computes the running-mean update
# 0.1 * batch_mean + 0.9 * old (old from in_ptr1, result to out_ptr0).
# Structurally identical to kernel _31 but for 120 channels / 4 partials.
triton_per_fused__native_batch_norm_legit_functional_38 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import persistent_reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@persistent_reduction(
    size_hints=[128, 4],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*fp32', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: 'i32', 5: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3), equal_to_1=())]}
)
@triton.jit
def triton_per_fused__native_batch_norm_legit_functional_38(in_out_ptr0, in_ptr0, in_ptr1, out_ptr0, xnumel, rnumel, XBLOCK : tl.constexpr):
    xnumel = 120
    rnumel = 4
    RBLOCK: tl.constexpr = 4
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rindex = tl.arange(0, RBLOCK)[None, :]
    rmask = rindex < rnumel
    r1 = rindex
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0 + (120*r1)), rmask & xmask, other=0)
    tmp8 = tl.load(in_ptr1 + (x0), xmask)
    tmp2 = tl.where(rmask & xmask, tmp0, 0)
    tmp3 = tl.sum(tmp2, 1)[:, None]
    tmp4 = 401408.0
    tmp5 = tmp3 / tmp4
    tmp6 = 0.1
    tmp7 = tmp5 * tmp6
    tmp9 = 0.9
    tmp10 = tmp8 * tmp9
    tmp11 = tmp7 + tmp10
    tl.store(in_out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp5, xmask)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp11, xmask)

def get_args():
    arg_0 = rand_strided((1, 120, 1, 1), (120, 1, 120, 120), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((1, 120, 1, 1, 4), (480, 1, 480, 480, 120), device='cuda:0', dtype=torch.float32)
    arg_2 = rand_strided((120,), (1,), device='cuda:0', dtype=torch.float32)
    arg_3 = rand_strided((120,), (1,), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1, arg_2, arg_3,

def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_per_fused__native_batch_norm_legit_functional_38.run(*args, 120, 4, grid=grid(120), stream=stream0)

def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_per_fused__native_batch_norm_legit_functional_38.benchmark_all_configs(*args, 120, 4, grid=grid(120))

if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/wg/cwgir4s67j2t35f66rtqguw4skzrnzaazh3rvukzvjln6bwfquwz.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => convert_element_type_25, var_mean_6
#
# Stage 1 of the 120-channel split variance reduction, mirroring kernel _37's
# 4-way batch split (x1 picks 32 batch items, r spans 32 * 3136 = 100352
# elements; the domain divides evenly, so no overhang guard).  Loads the fp16
# input and the per-channel batch mean (in_ptr1, hoisted out of the loop),
# accumulates (x - mean)^2 in fp32, and stores one partial sum per
# (channel, split) into the (1, 120, 1, 1, 4) fp32 buffer.
triton_red_fused__native_batch_norm_legit_functional_39 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@reduction(
    size_hints=[512, 131072],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*fp16', 1: '*fp32', 2: '*fp32', 3: 'i32', 4: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_39(in_ptr0, in_ptr1, out_ptr0, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
    xnumel = 480
    rnumel = 100352
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rbase = tl.arange(0, RBLOCK)[None, :]
    x0 = xindex % 120
    x1 = (xindex // 120)
    tmp2 = tl.load(in_ptr1 + (x0), xmask)
    _tmp5 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
    x3 = xindex
    for roffset in range(0, rnumel, RBLOCK):
        rindex = roffset + rbase
        rmask = rindex < rnumel
        r2 = rindex
        tmp0 = tl.load(in_ptr0 + ((3136*x0) + (376320*(r2 // 3136)) + (12042240*x1) + (r2 % 3136)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp1 = tmp0.to(tl.float32)
        tmp3 = tmp1 - tmp2
        tmp4 = tmp3 * tmp3
        _tmp5 = tl.where(rmask & xmask, _tmp5 + tmp4, _tmp5)
    tmp5 = tl.sum(_tmp5, 1)[:, None]
    tl.store(out_ptr0 + x3, tmp5, xmask)

def get_args():
    arg_0 = rand_strided((128, 120, 56, 56), (376320, 3136, 56, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((1, 120, 1, 1), (120, 1, 120, 120), device='cuda:0', dtype=torch.float32)
    arg_2 = rand_strided((1, 120, 1, 1, 4), (480, 1, 480, 480, 120), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1, arg_2,

def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_red_fused__native_batch_norm_legit_functional_39.run(*args, 480, 100352, grid=grid(480), stream=stream0)

def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_red_fused__native_batch_norm_legit_functional_39.benchmark_all_configs(*args, 480, 100352, grid=grid(480))

if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/bd/cbdjjrt2eltvde4ay5q7ftrif7dzdrcgxsonoef3wv6a6n446fg5.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => add_32, add_34, convert_element_type_25, mul_45, mul_46, mul_47, rsqrt_6, squeeze_19, var_mean_6
#
# NOTE(review): TorchInductor-generated code. The triple-quoted payload below
# is Triton source compiled at runtime by AsyncCompile — it is a runtime
# string, so its text (including the get_args/call/benchmark_all_configs
# helpers and the hard-coded launch sizes 120, 4) must not be hand-edited.
# What the kernel does (per the quoted source): stage 2 of a split batch-norm
# reduction. For each of 120 channels it sums 4 partial values (in_ptr0),
# divides the sum by 401408.0, and emits: the raw sum (out_ptr0),
# rsqrt(stat + 1e-05) (out_ptr1), and a running-stat update
# 0.1 * (stat * 1.0000024912370735) + 0.9 * in_ptr1 (out_ptr2).
# The 1.0000024912370735 factor is presumably the N/(N-1) unbiasing
# correction for N = 401408 — TODO confirm against the generating graph.
triton_per_fused__native_batch_norm_legit_functional_40 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import persistent_reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@persistent_reduction(
    size_hints=[128, 4],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*fp32', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: 'i32', 6: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4), equal_to_1=())]}
)
@triton.jit
def triton_per_fused__native_batch_norm_legit_functional_40(in_ptr0, in_ptr1, out_ptr0, out_ptr1, out_ptr2, xnumel, rnumel, XBLOCK : tl.constexpr):
    xnumel = 120
    rnumel = 4
    RBLOCK: tl.constexpr = 4
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rindex = tl.arange(0, RBLOCK)[None, :]
    rmask = rindex < rnumel
    r1 = rindex
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0 + (120*r1)), rmask & xmask, other=0)
    tmp13 = tl.load(in_ptr1 + (x0), xmask)
    tmp2 = tl.where(rmask & xmask, tmp0, 0)
    tmp3 = tl.sum(tmp2, 1)[:, None]
    tmp4 = 401408.0
    tmp5 = tmp3 / tmp4
    tmp6 = 1e-05
    tmp7 = tmp5 + tmp6
    tmp8 = tl.math.rsqrt(tmp7)
    tmp9 = 1.0000024912370735
    tmp10 = tmp5 * tmp9
    tmp11 = 0.1
    tmp12 = tmp10 * tmp11
    tmp14 = 0.9
    tmp15 = tmp13 * tmp14
    tmp16 = tmp12 + tmp15
    tl.store(out_ptr1 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp8, xmask)
    tl.store(out_ptr2 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp16, xmask)
    tl.store(out_ptr0 + x0, tmp3, xmask)
def get_args():
    arg_0 = rand_strided((1, 120, 1, 1, 4), (480, 1, 480, 480, 120), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((120,), (1,), device='cuda:0', dtype=torch.float32)
    arg_2 = rand_strided((1, 120, 1, 1), (120, 1, 120, 120), device='cuda:0', dtype=torch.float32)
    arg_3 = rand_strided((120,), (1,), device='cuda:0', dtype=torch.float32)
    arg_4 = rand_strided((120,), (1,), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1, arg_2, arg_3, arg_4,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_per_fused__native_batch_norm_legit_functional_40.run(*args, 120, 4, grid=grid(120), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_per_fused__native_batch_norm_legit_functional_40.benchmark_all_configs(*args, 120, 4, grid=grid(120))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/ff/cffbiktpj5b36houxciovjfe3g7ji5w5ibbvgjpjdr6m6dxnqiod.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.relu
# aten._native_batch_norm_legit_functional => add_32, add_35, convert_element_type_25, convert_element_type_26, mul_42, mul_48, rsqrt_6, sub_6, var_mean_6
# aten.relu => relu_4
#
# NOTE(review): TorchInductor-generated code; the quoted payload is a runtime
# string compiled by AsyncCompile — do not hand-edit its text.
# What the kernel does (per the quoted source): elementwise batch-norm apply
# fused with ReLU over 48168960 = 128*120*56*56 fp16 elements:
#   y = relu(((x - mean[c]) * rsqrt(var_sum[c] / 401408 + 1e-05)) * scale[c] + shift[c])
# where c = (index // 3136) % 120 is the channel. The `tl.where(0 != 0, ...)`
# form is the generator's ReLU idiom (the first branch is dead by construction).
triton_poi_fused__native_batch_norm_legit_functional_relu_41 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[67108864], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp16', 6: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6), equal_to_1=())]})
@triton.jit
def triton_poi_fused__native_batch_norm_legit_functional_relu_41(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 48168960
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x3 = xindex
    x1 = (xindex // 3136) % 120
    tmp0 = tl.load(in_ptr0 + (x3), None).to(tl.float32)
    tmp2 = tl.load(in_ptr1 + (x1), None)
    tmp4 = tl.load(in_ptr2 + (x1), None)
    tmp11 = tl.load(in_ptr3 + (x1), None)
    tmp13 = tl.load(in_ptr4 + (x1), None)
    tmp1 = tmp0.to(tl.float32)
    tmp3 = tmp1 - tmp2
    tmp5 = 401408.0
    tmp6 = tmp4 / tmp5
    tmp7 = 1e-05
    tmp8 = tmp6 + tmp7
    tmp9 = tl.math.rsqrt(tmp8)
    tmp10 = tmp3 * tmp9
    tmp12 = tmp10 * tmp11
    tmp14 = tmp12 + tmp13
    tmp15 = tmp14.to(tl.float32)
    tmp16 = tl.where(0 != 0, 0, tl.where(0 > tmp15, 0, tmp15))
    tl.store(out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp16, None)
def get_args():
    arg_0 = rand_strided((128, 120, 56, 56), (376320, 3136, 56, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((1, 120, 1, 1), (120, 1, 120, 120), device='cuda:0', dtype=torch.float32)
    arg_2 = rand_strided((1, 120, 1, 1), (120, 1, 120, 120), device='cuda:0', dtype=torch.float32)
    arg_3 = rand_strided((120,), (1,), device='cuda:0', dtype=torch.float32)
    arg_4 = rand_strided((120,), (1,), device='cuda:0', dtype=torch.float32)
    arg_5 = rand_strided((128, 120, 56, 56), (376320, 3136, 56, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__native_batch_norm_legit_functional_relu_41.run(*args, 48168960, grid=grid(48168960), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__native_batch_norm_legit_functional_relu_41.benchmark_all_configs(*args, 48168960, grid=grid(48168960))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/6y/c6ynio5pu6llppeohanybeuevixt5rgpdgxw24ylgggmbgzjacrt.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_27
#
# NOTE(review): TorchInductor-generated code; the quoted payload is a runtime
# string compiled by AsyncCompile — do not hand-edit its text.
# What the kernel does (per the quoted source): elementwise fp32 -> fp16 cast
# of 1080 contiguous elements (a (120, 1, 3, 3) depthwise-conv-shaped weight
# per get_args — shape is from the benchmark fixture; verify against caller).
triton_poi_fused__to_copy_42 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[2048], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_42(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 1080
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
    arg_0 = rand_strided((120, 1, 3, 3), (9, 9, 3, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((120, 1, 3, 3), (9, 9, 3, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_42.run(*args, 1080, grid=grid(1080), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_42.benchmark_all_configs(*args, 1080, grid=grid(1080))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/p3/cp3ysrijswutvs72wkv4ypuaradvydgclkx6swvsdiolmqtydjck.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.relu, aten.threshold_backward
# aten._native_batch_norm_legit_functional => add_37, add_40, convert_element_type_28, convert_element_type_29, mul_49, mul_55, rsqrt_7, sub_7, var_mean_7
# aten.relu => relu_5
# aten.threshold_backward => le_1
#
# NOTE(review): TorchInductor-generated code; the quoted payload is a runtime
# string compiled by AsyncCompile — do not hand-edit its text.
# What the kernel does (per the quoted source): elementwise batch-norm apply
# fused with ReLU over 48168960 = 128*120*56*56 fp16 elements, writing the
# activation to out_ptr0 AND the boolean mask (activation <= 0.0) to out_ptr1
# for the fused aten.threshold_backward of the ReLU.
triton_poi_fused__native_batch_norm_legit_functional_relu_threshold_backward_43 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[67108864], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp16', 6: '*i1', 7: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7), equal_to_1=())]})
@triton.jit
def triton_poi_fused__native_batch_norm_legit_functional_relu_threshold_backward_43(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, out_ptr0, out_ptr1, xnumel, XBLOCK : tl.constexpr):
    xnumel = 48168960
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x3 = xindex
    x1 = (xindex // 3136) % 120
    tmp0 = tl.load(in_ptr0 + (x3), None).to(tl.float32)
    tmp2 = tl.load(in_ptr1 + (x1), None)
    tmp4 = tl.load(in_ptr2 + (x1), None)
    tmp11 = tl.load(in_ptr3 + (x1), None)
    tmp13 = tl.load(in_ptr4 + (x1), None)
    tmp1 = tmp0.to(tl.float32)
    tmp3 = tmp1 - tmp2
    tmp5 = 401408.0
    tmp6 = tmp4 / tmp5
    tmp7 = 1e-05
    tmp8 = tmp6 + tmp7
    tmp9 = tl.math.rsqrt(tmp8)
    tmp10 = tmp3 * tmp9
    tmp12 = tmp10 * tmp11
    tmp14 = tmp12 + tmp13
    tmp15 = tmp14.to(tl.float32)
    tmp16 = tl.where(0 != 0, 0, tl.where(0 > tmp15, 0, tmp15))
    tmp17 = 0.0
    tmp18 = tmp16 <= tmp17
    tl.store(out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp16, None)
    tl.store(out_ptr1 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp18, None)
def get_args():
    arg_0 = rand_strided((128, 120, 56, 56), (376320, 3136, 56, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((1, 120, 1, 1), (120, 1, 120, 120), device='cuda:0', dtype=torch.float32)
    arg_2 = rand_strided((1, 120, 1, 1), (120, 1, 120, 120), device='cuda:0', dtype=torch.float32)
    arg_3 = rand_strided((120,), (1,), device='cuda:0', dtype=torch.float32)
    arg_4 = rand_strided((120,), (1,), device='cuda:0', dtype=torch.float32)
    arg_5 = rand_strided((128, 120, 56, 56), (376320, 3136, 56, 1), device='cuda:0', dtype=torch.float16)
    arg_6 = rand_strided((128, 120, 56, 56), (376320, 3136, 56, 1), device='cuda:0', dtype=torch.bool)
    return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__native_batch_norm_legit_functional_relu_threshold_backward_43.run(*args, 48168960, grid=grid(48168960), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__native_batch_norm_legit_functional_relu_threshold_backward_43.benchmark_all_configs(*args, 48168960, grid=grid(48168960))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/pe/cpea2gtzlc2kbhqh3fj4j3lqcnuxq36vpr46nhhxpvtecppidlkn.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_30
#
# NOTE(review): TorchInductor-generated code; the quoted payload is a runtime
# string compiled by AsyncCompile — do not hand-edit its text.
# What the kernel does (per the quoted source): elementwise fp32 -> fp16 cast
# of 1200 contiguous elements (a (20, 60, 1, 1) 1x1-conv-shaped weight per
# get_args — shape is from the benchmark fixture; verify against caller).
triton_poi_fused__to_copy_44 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[2048], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_44(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 1200
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
    arg_0 = rand_strided((20, 60, 1, 1), (60, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((20, 60, 1, 1), (60, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_44.run(*args, 1200, grid=grid(1200), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_44.benchmark_all_configs(*args, 1200, grid=grid(1200))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/nm/cnmhyqb22hbicwkw553bytawf6yat3hnc5u573hgyyuw5vi2ohc2.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.add
# aten._native_batch_norm_legit_functional => add_42, add_45, convert_element_type_32, convert_element_type_33, mul_56, mul_62, rsqrt_8, sub_8, var_mean_8
# aten.add => add_46
#
# NOTE(review): TorchInductor-generated code; the quoted payload is a runtime
# string compiled by AsyncCompile — do not hand-edit its text.
# What the kernel does (per the quoted source): elementwise batch-norm apply
# fused with a residual add over 16056320 = 128*40*56*56 fp16 elements:
#   y = bn(x)[fp16] + in_ptr5, with channel c = (index // 3136) % 40.
triton_poi_fused__native_batch_norm_legit_functional_add_45 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[16777216], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp16', 6: '*fp16', 7: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7), equal_to_1=())]})
@triton.jit
def triton_poi_fused__native_batch_norm_legit_functional_add_45(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 16056320
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x3 = xindex
    x1 = (xindex // 3136) % 40
    tmp0 = tl.load(in_ptr0 + (x3), None).to(tl.float32)
    tmp2 = tl.load(in_ptr1 + (x1), None)
    tmp4 = tl.load(in_ptr2 + (x1), None)
    tmp11 = tl.load(in_ptr3 + (x1), None)
    tmp13 = tl.load(in_ptr4 + (x1), None)
    tmp16 = tl.load(in_ptr5 + (x3), None).to(tl.float32)
    tmp1 = tmp0.to(tl.float32)
    tmp3 = tmp1 - tmp2
    tmp5 = 401408.0
    tmp6 = tmp4 / tmp5
    tmp7 = 1e-05
    tmp8 = tmp6 + tmp7
    tmp9 = tl.math.rsqrt(tmp8)
    tmp10 = tmp3 * tmp9
    tmp12 = tmp10 * tmp11
    tmp14 = tmp12 + tmp13
    tmp15 = tmp14.to(tl.float32)
    tmp17 = tmp15 + tmp16
    tl.store(out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp17, None)
def get_args():
    arg_0 = rand_strided((128, 40, 56, 56), (125440, 3136, 56, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((1, 40, 1, 1), (40, 1, 40, 40), device='cuda:0', dtype=torch.float32)
    arg_2 = rand_strided((1, 40, 1, 1), (40, 1, 40, 40), device='cuda:0', dtype=torch.float32)
    arg_3 = rand_strided((40,), (1,), device='cuda:0', dtype=torch.float32)
    arg_4 = rand_strided((40,), (1,), device='cuda:0', dtype=torch.float32)
    arg_5 = rand_strided((128, 40, 56, 56), (125440, 3136, 56, 1), device='cuda:0', dtype=torch.float16)
    arg_6 = rand_strided((128, 40, 56, 56), (125440, 3136, 56, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__native_batch_norm_legit_functional_add_45.run(*args, 16056320, grid=grid(16056320), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__native_batch_norm_legit_functional_add_45.benchmark_all_configs(*args, 16056320, grid=grid(16056320))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/d6/cd6uk6iyi5u7h7amewzycxkyxriny4cknptmukvtlpxi4jwlerzf.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_34
#
# NOTE(review): TorchInductor-generated code; the quoted payload is a runtime
# string compiled by AsyncCompile — do not hand-edit its text.
# What the kernel does (per the quoted source): elementwise fp32 -> fp16 cast
# of 9600 contiguous elements (a (240, 40, 1, 1) 1x1-conv-shaped weight per
# get_args — shape is from the benchmark fixture; verify against caller).
triton_poi_fused__to_copy_46 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[16384], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_46(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 9600
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
    arg_0 = rand_strided((240, 40, 1, 1), (40, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((240, 40, 1, 1), (40, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_46.run(*args, 9600, grid=grid(9600), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_46.benchmark_all_configs(*args, 9600, grid=grid(9600))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/p3/cp3dwfqdvlpfdvfvxrfm2lik34mpytuijuqsswm7efzikdczro7d.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => add_48, add_49, add_50, convert_element_type_35, mul_64, mul_65, mul_66, mul_67, mul_68, rsqrt_9, squeeze_28, var_mean_9
#
# NOTE(review): TorchInductor-generated code; the quoted payload is a runtime
# string compiled by AsyncCompile — do not hand-edit its text.
# What the kernel does (per the quoted source): full two-pass batch-norm
# statistics reduction over 401408 = 128*56*56 elements for each of 240
# channels. Pass 1 sums the fp16 input and writes mean = sum/401408 in place
# (in_out_ptr0, the only mutated arg) plus the 0.1/0.9-momentum running-mean
# update (out_ptr0). Pass 2 sums squared deviations from that mean, writing
# the raw sum-of-squares (out_ptr1), rsqrt(var + 1e-05) (out_ptr2), and the
# running-var update scaled by 1.0000024912370735 (out_ptr3) — presumably the
# N/(N-1) unbiasing factor for N = 401408; TODO confirm.
# The benchmark passes num_in_out_args=1 to account for in_out_ptr0.
triton_red_fused__native_batch_norm_legit_functional_47 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@reduction(
    size_hints=[256, 524288],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*fp32', 1: '*fp16', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: 'i32', 9: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_47(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, out_ptr0, out_ptr1, out_ptr2, out_ptr3, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
    xnumel = 240
    rnumel = 401408
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rbase = tl.arange(0, RBLOCK)[None, :]
    x0 = xindex
    _tmp2 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
    for roffset in range(0, rnumel, RBLOCK):
        rindex = roffset + rbase
        rmask = rindex < rnumel
        r1 = rindex % 3136
        r2 = (rindex // 3136)
        tmp0 = tl.load(in_ptr0 + (r1 + (3136*x0) + (752640*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp1 = tmp0.to(tl.float32)
        _tmp2 = tl.where(rmask & xmask, _tmp2 + tmp1, _tmp2)
    tmp2 = tl.sum(_tmp2, 1)[:, None]
    tmp7 = tl.load(in_ptr1 + (x0), xmask)
    tmp3 = 401408.0
    tmp4 = tmp2 / tmp3
    tmp5 = 0.1
    tmp6 = tmp4 * tmp5
    tmp8 = 0.9
    tmp9 = tmp7 * tmp8
    tmp10 = tmp6 + tmp9
    tl.store(in_out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp4, xmask)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp10, xmask)
    _tmp15 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
    for roffset in range(0, rnumel, RBLOCK):
        rindex = roffset + rbase
        rmask = rindex < rnumel
        r1 = rindex % 3136
        r2 = (rindex // 3136)
        tmp11 = tl.load(in_ptr0 + (r1 + (3136*x0) + (752640*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp12 = tmp11.to(tl.float32)
        tmp13 = tmp12 - tmp4
        tmp14 = tmp13 * tmp13
        _tmp15 = tl.where(rmask & xmask, _tmp15 + tmp14, _tmp15)
    tmp15 = tl.sum(_tmp15, 1)[:, None]
    tl.store(out_ptr1 + x0, tmp15, xmask)
    tmp25 = tl.load(in_ptr2 + (x0), xmask)
    tmp16 = 401408.0
    tmp17 = tmp15 / tmp16
    tmp18 = 1e-05
    tmp19 = tmp17 + tmp18
    tmp20 = tl.math.rsqrt(tmp19)
    tmp21 = 1.0000024912370735
    tmp22 = tmp17 * tmp21
    tmp23 = 0.1
    tmp24 = tmp22 * tmp23
    tmp26 = 0.9
    tmp27 = tmp25 * tmp26
    tmp28 = tmp24 + tmp27
    tl.store(out_ptr2 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp20, xmask)
    tl.store(out_ptr3 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp28, xmask)
def get_args():
    arg_0 = rand_strided((1, 240, 1, 1), (240, 1, 240, 240), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((128, 240, 56, 56), (752640, 3136, 56, 1), device='cuda:0', dtype=torch.float16)
    arg_2 = rand_strided((240,), (1,), device='cuda:0', dtype=torch.float32)
    arg_3 = rand_strided((240,), (1,), device='cuda:0', dtype=torch.float32)
    arg_4 = rand_strided((240,), (1,), device='cuda:0', dtype=torch.float32)
    arg_5 = rand_strided((1, 240, 1, 1), (240, 1, 240, 240), device='cuda:0', dtype=torch.float32)
    arg_6 = rand_strided((240,), (1,), device='cuda:0', dtype=torch.float32)
    arg_7 = rand_strided((240,), (1,), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_red_fused__native_batch_norm_legit_functional_47.run(*args, 240, 401408, grid=grid(240), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_red_fused__native_batch_norm_legit_functional_47.benchmark_all_configs(*args, 240, 401408, grid=grid(240))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/bn/cbnwrcxmnryz6qd2vzr5qnxb2q3vepfvhupqz2awsu4zkworteie.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.add, aten.clone, aten.fill, aten.mul, aten.sigmoid, aten.sub
# aten._native_batch_norm_legit_functional => add_48, add_51, convert_element_type_35, convert_element_type_36, mul_63, mul_69, rsqrt_9, sub_9, var_mean_9
# aten.add => add_379
# aten.clone => clone
# aten.fill => full_like_47
# aten.mul => mul_1107, mul_1108
# aten.sigmoid => sigmoid_111
# aten.sub => sub_313
#
# NOTE(review): TorchInductor-generated code; the quoted payload is a runtime
# string compiled by AsyncCompile — do not hand-edit its text.
# What the kernel does (per the quoted source): elementwise batch-norm apply
# over 96337920 = 128*240*56*56 fp16 elements, writing the normalized
# activation y to out_ptr0 and sigmoid(y) * (y * (1 - sigmoid(y)) + 1) to
# out_ptr1 — the latter is exactly the derivative of y * sigmoid(y), i.e. a
# precomputed SiLU/Swish gradient factor for the backward pass.
triton_poi_fused__native_batch_norm_legit_functional_add_clone_fill_mul_sigmoid_sub_48 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[134217728], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp16', 6: '*fp16', 7: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7), equal_to_1=())]})
@triton.jit
def triton_poi_fused__native_batch_norm_legit_functional_add_clone_fill_mul_sigmoid_sub_48(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, out_ptr0, out_ptr1, xnumel, XBLOCK : tl.constexpr):
    xnumel = 96337920
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x3 = xindex
    x1 = (xindex // 3136) % 240
    tmp0 = tl.load(in_ptr0 + (x3), None).to(tl.float32)
    tmp2 = tl.load(in_ptr1 + (x1), None)
    tmp4 = tl.load(in_ptr2 + (x1), None)
    tmp11 = tl.load(in_ptr3 + (x1), None)
    tmp13 = tl.load(in_ptr4 + (x1), None)
    tmp1 = tmp0.to(tl.float32)
    tmp3 = tmp1 - tmp2
    tmp5 = 401408.0
    tmp6 = tmp4 / tmp5
    tmp7 = 1e-05
    tmp8 = tmp6 + tmp7
    tmp9 = tl.math.rsqrt(tmp8)
    tmp10 = tmp3 * tmp9
    tmp12 = tmp10 * tmp11
    tmp14 = tmp12 + tmp13
    tmp15 = tmp14.to(tl.float32)
    tmp16 = tl.sigmoid(tmp15)
    tmp17 = 1.0
    tmp18 = tmp17 - tmp16
    tmp19 = tmp15 * tmp18
    tmp20 = tmp19 + tmp17
    tmp21 = tmp16 * tmp20
    tl.store(out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp15, None)
    tl.store(out_ptr1 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp21, None)
def get_args():
    arg_0 = rand_strided((128, 240, 56, 56), (752640, 3136, 56, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((1, 240, 1, 1), (240, 1, 240, 240), device='cuda:0', dtype=torch.float32)
    arg_2 = rand_strided((1, 240, 1, 1), (240, 1, 240, 240), device='cuda:0', dtype=torch.float32)
    arg_3 = rand_strided((240,), (1,), device='cuda:0', dtype=torch.float32)
    arg_4 = rand_strided((240,), (1,), device='cuda:0', dtype=torch.float32)
    arg_5 = rand_strided((128, 240, 56, 56), (752640, 3136, 56, 1), device='cuda:0', dtype=torch.float16)
    arg_6 = rand_strided((128, 240, 56, 56), (752640, 3136, 56, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__native_batch_norm_legit_functional_add_clone_fill_mul_sigmoid_sub_48.run(*args, 96337920, grid=grid(96337920), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__native_batch_norm_legit_functional_add_clone_fill_mul_sigmoid_sub_48.benchmark_all_configs(*args, 96337920, grid=grid(96337920))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/tr/ctrlm7wxwgyqu6igydzh2qy3dgfyvkbxyf6ed5dmhs5ac43nkywg.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_39
#
# Generated Triton source, compiled lazily via AsyncCompile.  Pointwise
# fp32 -> fp16 cast of a 540-element tensor of shape (60, 1, 3, 3) (see
# get_args inside the string; presumably a 3x3 depthwise conv weight --
# TODO confirm against the caller).  Note the `.to(tl.float32)` on the loaded
# value is a no-op; the actual down-cast to fp16 happens implicitly on store,
# since the output pointer is declared '*fp16' in the @pointwise signature.
# The embedded module also carries get_args/call/benchmark_all_configs and a
# __main__ harness so the kernel file can be benchmarked standalone.
triton_poi_fused__to_copy_49 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[1024], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_49(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 540
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
    arg_0 = rand_strided((60, 1, 3, 3), (9, 9, 3, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((60, 1, 3, 3), (9, 9, 3, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_49.run(*args, 540, grid=grid(540), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_49.benchmark_all_configs(*args, 540, grid=grid(540))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/62/c62vp5bgz7fqdu2x3oxuvrnwwvzztezlji4zzzxqegy77wuqckqo.py
# Original ATen: aten.split_with_sizes
# aten.split_with_sizes => getitem_52
#
# Generated Triton source (AsyncCompile).  Fused split + activation: reads the
# FIRST 60-channel slice of a contiguous (128, 240, 56, 56) fp16 input
# (per-batch offset x0 + 752640*x1, with x0 < 188160 = 60*56*56), applies
# x * sigmoid(x) (SiLU) in fp32, and writes a contiguous (128, 60, 56, 56)
# fp16 output.  `xmask` is computed but unused -- the loads/stores pass
# mask=None, presumably because xnumel (24084480) is divisible by every
# candidate XBLOCK; TODO confirm.  Sibling kernels _52/_54/_56 below handle
# the remaining three 60-channel slices of the same split.
triton_poi_fused_split_with_sizes_50 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[33554432], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_split_with_sizes_50(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 24084480
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex % 188160
    x1 = (xindex // 188160)
    x2 = xindex
    tmp0 = tl.load(in_ptr0 + (x0 + (752640*x1)), None).to(tl.float32)
    tmp1 = tmp0.to(tl.float32)
    tmp2 = tl.sigmoid(tmp1)
    tmp3 = tmp1 * tmp2
    tmp4 = tmp3.to(tl.float32)
    tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp4, None)
def get_args():
    arg_0 = rand_strided((128, 240, 56, 56), (752640, 3136, 56, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 60, 56, 56), (188160, 3136, 56, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused_split_with_sizes_50.run(*args, 24084480, grid=grid(24084480), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused_split_with_sizes_50.benchmark_all_configs(*args, 24084480, grid=grid(24084480))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/5k/c5k2puyq6twbirobmys3yfadgomf23ufkkjsshzn37upfh5g7erl.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_40
#
# Generated Triton source (AsyncCompile).  Pointwise fp32 -> fp16 cast of a
# 1500-element tensor of shape (60, 1, 5, 5) -- same pattern as _to_copy_49
# above but for the 5x5 weight.  The fp16 conversion happens implicitly on
# store to the '*fp16' output pointer.
triton_poi_fused__to_copy_51 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[2048], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_51(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 1500
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
    arg_0 = rand_strided((60, 1, 5, 5), (25, 25, 5, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((60, 1, 5, 5), (25, 25, 5, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_51.run(*args, 1500, grid=grid(1500), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_51.benchmark_all_configs(*args, 1500, grid=grid(1500))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/qq/cqqmootvvozbchj4wyxl2cmg4iebq6hs2v2baytcez4tdix7pywu.py
# Original ATen: aten.split_with_sizes
# aten.split_with_sizes => getitem_57
#
# Generated Triton source (AsyncCompile).  Identical to
# triton_poi_fused_split_with_sizes_50 above except for the input base offset
# of 188160: this instance reads channels 60-119 (the SECOND 60-channel
# slice) of the (128, 240, 56, 56) fp16 input, applies x * sigmoid(x) (SiLU)
# in fp32, and writes a contiguous (128, 60, 56, 56) fp16 output.
triton_poi_fused_split_with_sizes_52 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[33554432], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_split_with_sizes_52(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 24084480
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex % 188160
    x1 = (xindex // 188160)
    x2 = xindex
    tmp0 = tl.load(in_ptr0 + (188160 + x0 + (752640*x1)), None).to(tl.float32)
    tmp1 = tmp0.to(tl.float32)
    tmp2 = tl.sigmoid(tmp1)
    tmp3 = tmp1 * tmp2
    tmp4 = tmp3.to(tl.float32)
    tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp4, None)
def get_args():
    arg_0 = rand_strided((128, 240, 56, 56), (752640, 3136, 56, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 60, 56, 56), (188160, 3136, 56, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused_split_with_sizes_52.run(*args, 24084480, grid=grid(24084480), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused_split_with_sizes_52.benchmark_all_configs(*args, 24084480, grid=grid(24084480))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/hn/chn7fi5d6pb7lt74jnnjikqswlxeu3nbv3mpu3aidnxkwdaw4fmq.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_41
#
# Generated Triton source (AsyncCompile).  Pointwise fp32 -> fp16 cast of a
# 2940-element tensor of shape (60, 1, 7, 7) -- same pattern as _to_copy_49
# above but for the 7x7 weight.
triton_poi_fused__to_copy_53 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4096], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_53(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 2940
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
    arg_0 = rand_strided((60, 1, 7, 7), (49, 49, 7, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((60, 1, 7, 7), (49, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_53.run(*args, 2940, grid=grid(2940), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_53.benchmark_all_configs(*args, 2940, grid=grid(2940))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/7d/c7d6lruapdg7jujcavuay77hawcbzn2rk3uqz6opu54tj6rxd3dc.py
# Original ATen: aten.split_with_sizes
# aten.split_with_sizes => getitem_62
#
# Generated Triton source (AsyncCompile).  Identical to
# triton_poi_fused_split_with_sizes_50 above except for the input base offset
# of 376320 (= 2 * 188160): this instance reads channels 120-179 (the THIRD
# 60-channel slice) of the (128, 240, 56, 56) fp16 input, applies
# x * sigmoid(x) (SiLU) in fp32, and writes a contiguous (128, 60, 56, 56)
# fp16 output.
triton_poi_fused_split_with_sizes_54 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[33554432], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_split_with_sizes_54(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 24084480
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex % 188160
    x1 = (xindex // 188160)
    x2 = xindex
    tmp0 = tl.load(in_ptr0 + (376320 + x0 + (752640*x1)), None).to(tl.float32)
    tmp1 = tmp0.to(tl.float32)
    tmp2 = tl.sigmoid(tmp1)
    tmp3 = tmp1 * tmp2
    tmp4 = tmp3.to(tl.float32)
    tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp4, None)
def get_args():
    arg_0 = rand_strided((128, 240, 56, 56), (752640, 3136, 56, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 60, 56, 56), (188160, 3136, 56, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused_split_with_sizes_54.run(*args, 24084480, grid=grid(24084480), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused_split_with_sizes_54.benchmark_all_configs(*args, 24084480, grid=grid(24084480))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/ih/cihdtphelycdnabls3bvxcjqdwyigw5444zlobgvmbwdwzxq7zdy.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_42
#
# Generated Triton source (AsyncCompile).  Pointwise fp32 -> fp16 cast of a
# 4860-element tensor of shape (60, 1, 9, 9) -- same pattern as _to_copy_49
# above but for the 9x9 weight.
triton_poi_fused__to_copy_55 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[8192], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_55(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 4860
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
    arg_0 = rand_strided((60, 1, 9, 9), (81, 81, 9, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((60, 1, 9, 9), (81, 81, 9, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_55.run(*args, 4860, grid=grid(4860), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_55.benchmark_all_configs(*args, 4860, grid=grid(4860))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/oq/coqx7tfdhgzsd3kzw3ueczpktm7gxo5z67ciilgh24ffay7herao.py
# Original ATen: aten.split_with_sizes
# aten.split_with_sizes => getitem_67
#
# Generated Triton source (AsyncCompile).  Identical to
# triton_poi_fused_split_with_sizes_50 above except for the input base offset
# of 564480 (= 3 * 188160): this instance reads channels 180-239 (the FOURTH
# and last 60-channel slice) of the (128, 240, 56, 56) fp16 input, applies
# x * sigmoid(x) (SiLU) in fp32, and writes a contiguous (128, 60, 56, 56)
# fp16 output.
triton_poi_fused_split_with_sizes_56 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[33554432], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_split_with_sizes_56(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 24084480
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex % 188160
    x1 = (xindex // 188160)
    x2 = xindex
    tmp0 = tl.load(in_ptr0 + (564480 + x0 + (752640*x1)), None).to(tl.float32)
    tmp1 = tmp0.to(tl.float32)
    tmp2 = tl.sigmoid(tmp1)
    tmp3 = tmp1 * tmp2
    tmp4 = tmp3.to(tl.float32)
    tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp4, None)
def get_args():
    arg_0 = rand_strided((128, 240, 56, 56), (752640, 3136, 56, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 60, 56, 56), (188160, 3136, 56, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused_split_with_sizes_56.run(*args, 24084480, grid=grid(24084480), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused_split_with_sizes_56.benchmark_all_configs(*args, 24084480, grid=grid(24084480))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/n6/cn6lwm2ninzuud2fuu3omrklbeewaxqtkv3v6gdaxmikjvyoi525.py
# Original ATen: aten.cat
# aten.cat => cat_5
#
# Generated Triton source (AsyncCompile).  One branch of a channel
# concatenation: copies a contiguous (128, 60, 28, 28) fp16 tensor
# (47040 = 60*28*28 elements per batch) into the first 60-channel slice of a
# (128, 240, 28, 28) fp16 destination (per-batch output offset
# x0 + 188160*x1, where 188160 = 240*28*28).  Pure strided copy -- no
# arithmetic beyond index remapping.
triton_poi_fused_cat_57 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[8388608], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_cat_57(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 6021120
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x2 = xindex
    x0 = xindex % 47040
    x1 = (xindex // 47040)
    tmp0 = tl.load(in_ptr0 + (x2), None).to(tl.float32)
    tl.store(out_ptr0 + (x0 + (188160*x1) + tl.zeros([XBLOCK], tl.int32)), tmp0, None)
def get_args():
    arg_0 = rand_strided((128, 60, 28, 28), (47040, 784, 28, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 60, 28, 28), (188160, 784, 28, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused_cat_57.run(*args, 6021120, grid=grid(6021120), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused_cat_57.benchmark_all_configs(*args, 6021120, grid=grid(6021120))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/tm/ctmfxwnpwpks3pe3wok54jftmfykafzupq5buoissf32gfcrrq6d.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => add_53, add_54, add_55, convert_element_type_43, mul_72, mul_73, mul_74, mul_75, mul_76, rsqrt_10, squeeze_31, var_mean_10
#
# Generated Triton source (AsyncCompile).  Two-pass batch-norm statistics
# reduction over a (128, 240, 28, 28) fp16 input: one program per channel
# (xnumel = 240), reducing over N*H*W = 128*784 = 100352 elements.
#   Pass 1: sums the channel, divides by 100352.0 to get the batch mean
#           (written in place to in_out_ptr0 -- the only mutated arg), and
#           folds it into the running mean as 0.1*mean + 0.9*old (out_ptr0).
#   Pass 2: re-reads the input to accumulate squared deviations from the
#           mean; the raw sum of squares goes to out_ptr1, rsqrt(var + 1e-05)
#           to out_ptr2, and the running-variance update
#           0.1*(var * 1.00000996502277) + 0.9*old to out_ptr3.  The constant
#           1.00000996502277 matches n/(n-1) for n = 100352 -- presumably the
#           unbiased-variance correction; TODO confirm against Inductor's
#           batch-norm decomposition.
# All reduction arithmetic is in fp32 (the fp16 input is upcast on load).
triton_red_fused__native_batch_norm_legit_functional_58 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@reduction(
    size_hints=[256, 131072],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*fp32', 1: '*fp16', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: 'i32', 9: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_58(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, out_ptr0, out_ptr1, out_ptr2, out_ptr3, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
    xnumel = 240
    rnumel = 100352
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rbase = tl.arange(0, RBLOCK)[None, :]
    x0 = xindex
    _tmp2 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
    for roffset in range(0, rnumel, RBLOCK):
        rindex = roffset + rbase
        rmask = rindex < rnumel
        r1 = rindex % 784
        r2 = (rindex // 784)
        tmp0 = tl.load(in_ptr0 + (r1 + (784*x0) + (188160*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp1 = tmp0.to(tl.float32)
        _tmp2 = tl.where(rmask & xmask, _tmp2 + tmp1, _tmp2)
    tmp2 = tl.sum(_tmp2, 1)[:, None]
    tmp7 = tl.load(in_ptr1 + (x0), xmask)
    tmp3 = 100352.0
    tmp4 = tmp2 / tmp3
    tmp5 = 0.1
    tmp6 = tmp4 * tmp5
    tmp8 = 0.9
    tmp9 = tmp7 * tmp8
    tmp10 = tmp6 + tmp9
    tl.store(in_out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp4, xmask)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp10, xmask)
    _tmp15 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
    for roffset in range(0, rnumel, RBLOCK):
        rindex = roffset + rbase
        rmask = rindex < rnumel
        r1 = rindex % 784
        r2 = (rindex // 784)
        tmp11 = tl.load(in_ptr0 + (r1 + (784*x0) + (188160*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp12 = tmp11.to(tl.float32)
        tmp13 = tmp12 - tmp4
        tmp14 = tmp13 * tmp13
        _tmp15 = tl.where(rmask & xmask, _tmp15 + tmp14, _tmp15)
    tmp15 = tl.sum(_tmp15, 1)[:, None]
    tl.store(out_ptr1 + x0, tmp15, xmask)
    tmp25 = tl.load(in_ptr2 + (x0), xmask)
    tmp16 = 100352.0
    tmp17 = tmp15 / tmp16
    tmp18 = 1e-05
    tmp19 = tmp17 + tmp18
    tmp20 = tl.math.rsqrt(tmp19)
    tmp21 = 1.00000996502277
    tmp22 = tmp17 * tmp21
    tmp23 = 0.1
    tmp24 = tmp22 * tmp23
    tmp26 = 0.9
    tmp27 = tmp25 * tmp26
    tmp28 = tmp24 + tmp27
    tl.store(out_ptr2 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp20, xmask)
    tl.store(out_ptr3 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp28, xmask)
def get_args():
    arg_0 = rand_strided((1, 240, 1, 1), (240, 1, 240, 240), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((128, 240, 28, 28), (188160, 784, 28, 1), device='cuda:0', dtype=torch.float16)
    arg_2 = rand_strided((240,), (1,), device='cuda:0', dtype=torch.float32)
    arg_3 = rand_strided((240,), (1,), device='cuda:0', dtype=torch.float32)
    arg_4 = rand_strided((240,), (1,), device='cuda:0', dtype=torch.float32)
    arg_5 = rand_strided((1, 240, 1, 1), (240, 1, 240, 240), device='cuda:0', dtype=torch.float32)
    arg_6 = rand_strided((240,), (1,), device='cuda:0', dtype=torch.float32)
    arg_7 = rand_strided((240,), (1,), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_red_fused__native_batch_norm_legit_functional_58.run(*args, 240, 100352, grid=grid(240), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_red_fused__native_batch_norm_legit_functional_58.benchmark_all_configs(*args, 240, 100352, grid=grid(240))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
| # kernel path: /tmp/torchinductor_shunting/tf/ctf52h6qjyfnbwd7tzysnt6o3dvwaedcis63siipfnbkcfkukqa7.py | |
| # Original ATen: aten._native_batch_norm_legit_functional, aten.mean, aten.silu | |
| # aten._native_batch_norm_legit_functional => add_53, add_56, convert_element_type_43, convert_element_type_44, mul_71, mul_77, rsqrt_10, sub_10, var_mean_10 | |
| # aten.mean => mean | |
# aten.silu => convert_element_type_45, convert_element_type_46, mul_78, sigmoid_1
# Persistent reduction over the 784 (= 28*28) spatial positions of each of the
# 30720 (= 128 batch * 240 channel) rows.  Per element it normalizes the fp16
# input with per-channel batch-norm statistics (in_ptr1 = mean, in_ptr2 =
# variance sum scaled by 1/100352, eps 1e-5) and affine parameters (in_ptr3 =
# weight, in_ptr4 = bias), applies SiLU (x * sigmoid(x)), and stores the
# activated fp16 map to out_ptr0.  It also reduces each row's 784 activated
# values to their mean in out_ptr2 (a (128, 240, 1, 1) fp16 tensor per
# get_args).  The kernel source below is compiled verbatim by Triton.
triton_per_fused__native_batch_norm_legit_functional_mean_silu_59 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import persistent_reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@persistent_reduction(
    size_hints=[32768, 1024],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*fp16', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp16', 6: '*fp16', 7: 'i32', 8: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8), equal_to_1=())]}
)
@triton.jit
def triton_per_fused__native_batch_norm_legit_functional_mean_silu_59(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, out_ptr0, out_ptr2, xnumel, rnumel, XBLOCK : tl.constexpr):
    xnumel = 30720
    rnumel = 784
    RBLOCK: tl.constexpr = 1024
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rindex = tl.arange(0, RBLOCK)[None, :]
    rmask = rindex < rnumel
    r2 = rindex
    x3 = xindex
    x0 = xindex % 240
    tmp0 = tl.load(in_ptr0 + (r2 + (784*x3)), rmask, other=0).to(tl.float32)
    tmp2 = tl.load(in_ptr1 + (x0), None)
    tmp4 = tl.load(in_ptr2 + (x0), None)
    tmp11 = tl.load(in_ptr3 + (x0), None)
    tmp13 = tl.load(in_ptr4 + (x0), None)
    tmp1 = tmp0.to(tl.float32)
    tmp3 = tmp1 - tmp2
    tmp5 = 100352.0
    tmp6 = tmp4 / tmp5
    tmp7 = 1e-05
    tmp8 = tmp6 + tmp7
    tmp9 = tl.math.rsqrt(tmp8)
    tmp10 = tmp3 * tmp9
    tmp12 = tmp10 * tmp11
    tmp14 = tmp12 + tmp13
    tmp15 = tmp14.to(tl.float32)
    tmp16 = tmp15.to(tl.float32)
    tmp17 = tl.sigmoid(tmp16)
    tmp18 = tmp16 * tmp17
    tmp19 = tmp18.to(tl.float32)
    tmp20 = tmp19.to(tl.float32)
    tmp22 = tl.where(rmask, tmp20, 0)
    tmp23 = tl.sum(tmp22, 1)[:, None]
    tmp24 = 784.0
    tmp25 = tmp23 / tmp24
    tmp26 = tmp25.to(tl.float32)
    tl.store(out_ptr0 + (r2 + (784*x3) + tl.zeros([XBLOCK, RBLOCK], tl.int32)), tmp15, rmask)
    tl.store(out_ptr2 + (x3 + tl.zeros([XBLOCK, 1], tl.int32)), tmp26, None)
def get_args():
    arg_0 = rand_strided((128, 240, 28, 28), (188160, 784, 28, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((1, 240, 1, 1), (240, 1, 240, 240), device='cuda:0', dtype=torch.float32)
    arg_2 = rand_strided((1, 240, 1, 1), (240, 1, 240, 240), device='cuda:0', dtype=torch.float32)
    arg_3 = rand_strided((240,), (1,), device='cuda:0', dtype=torch.float32)
    arg_4 = rand_strided((240,), (1,), device='cuda:0', dtype=torch.float32)
    arg_5 = rand_strided((128, 240, 28, 28), (188160, 784, 28, 1), device='cuda:0', dtype=torch.float16)
    arg_6 = rand_strided((128, 240, 1, 1), (240, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_per_fused__native_batch_norm_legit_functional_mean_silu_59.run(*args, 30720, 784, grid=grid(30720), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_per_fused__native_batch_norm_legit_functional_mean_silu_59.benchmark_all_configs(*args, 30720, 784, grid=grid(30720))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/l4/cl43m5yr7mr6rofdxkde5hoinowoes7zmspib6uw4wfhtoztyd2j.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_48
# Pointwise dtype cast: copies 4800 fp32 values to fp16 (per get_args, a
# (20, 240, 1, 1) tensor), masking the tail of the final block with xmask.
triton_poi_fused__to_copy_60 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[8192], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_60(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 4800
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
    arg_0 = rand_strided((20, 240, 1, 1), (240, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((20, 240, 1, 1), (240, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_60.run(*args, 4800, grid=grid(4800), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_60.benchmark_all_configs(*args, 4800, grid=grid(4800))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/n3/cn3yrrvjz2rkdvv5cbgdyj5mk5wgjpljfxyga75evi5tqahwagdd.py
# Original ATen: aten._to_copy, aten.convolution
# aten._to_copy => convert_element_type_47
# aten.convolution => convolution_20
# Pointwise dtype cast of a 20-element fp32 vector to fp16 (per get_args, a
# (20,) tensor prepared as an input of convolution_20).
triton_poi_fused__to_copy_convolution_61 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[32], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_convolution_61(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 20
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
    arg_0 = rand_strided((20,), (1,), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((20,), (1,), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_convolution_61.run(*args, 20, grid=grid(20), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_convolution_61.benchmark_all_configs(*args, 20, grid=grid(20))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/nz/cnz5cjjblhni4cibgggtjl3kn4ksihmqz6znyn5belgi4q43cncw.py
# Original ATen: aten._to_copy, aten.convolution, aten.silu
# aten._to_copy => convert_element_type_47
# aten.convolution => convolution_20
# aten.silu => convert_element_type_49, convert_element_type_50, mul_79, sigmoid_2
# Pointwise epilogue over 2560 (= 128 * 20) elements: adds the 20-element bias
# (in_ptr0, broadcast via xindex % 20) to the conv output held in in_out_ptr0
# (mutated in place with the biased sum), and writes SiLU of that sum
# (computed in fp32, stored as fp16) to out_ptr0.
triton_poi_fused__to_copy_convolution_silu_62 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4096], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_convolution_silu_62(in_out_ptr0, in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 2560
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x2 = xindex
    x0 = xindex % 20
    tmp0 = tl.load(in_out_ptr0 + (x2), xmask).to(tl.float32)
    tmp1 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
    tmp2 = tmp0 + tmp1
    tmp3 = tmp2.to(tl.float32)
    tmp4 = tl.sigmoid(tmp3)
    tmp5 = tmp3 * tmp4
    tmp6 = tmp5.to(tl.float32)
    tl.store(in_out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp2, xmask)
    tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp6, xmask)
def get_args():
    arg_0 = rand_strided((128, 20, 1, 1), (20, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((20,), (1,), device='cuda:0', dtype=torch.float16)
    arg_2 = rand_strided((128, 20, 1, 1), (20, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1, arg_2,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_convolution_silu_62.run(*args, 2560, grid=grid(2560), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_convolution_silu_62.benchmark_all_configs(*args, 2560, grid=grid(2560))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/bl/cblr4zz2jdn5lx7fyq6hto375tcressl73jxrxvozhqbubkoilua.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_52
# Pointwise dtype cast: copies 4800 fp32 values to fp16 (per get_args, a
# (240, 20, 1, 1) tensor).
triton_poi_fused__to_copy_63 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[8192], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_63(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 4800
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
    arg_0 = rand_strided((240, 20, 1, 1), (20, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((240, 20, 1, 1), (20, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_63.run(*args, 4800, grid=grid(4800), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_63.benchmark_all_configs(*args, 4800, grid=grid(4800))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/gq/cgqatezdistnqv4yyabd7t76wkyduditccoaqjd3ef4npoxk6vrv.py
# Original ATen: aten._to_copy, aten.convolution
# aten._to_copy => convert_element_type_51
# aten.convolution => convolution_21
# Pointwise dtype cast of a 240-element fp32 vector to fp16 (per get_args, a
# (240,) tensor prepared as an input of convolution_21).
triton_poi_fused__to_copy_convolution_64 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[256], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_convolution_64(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 240
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
    arg_0 = rand_strided((240,), (1,), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((240,), (1,), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_convolution_64.run(*args, 240, grid=grid(240), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_convolution_64.benchmark_all_configs(*args, 240, grid=grid(240))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/qr/cqrlwq46vnra4itz3o4m3cvwrnyl7srsa7xlwywcqzni2rafclhf.py
# Original ATen: aten._to_copy, aten.convolution
# aten._to_copy => convert_element_type_51
# aten.convolution => convolution_21
# Pointwise in-place bias add over 30720 (= 128 * 240) fp16 elements:
# in_out_ptr0 += in_ptr0 broadcast via xindex % 240.  Note the loads/stores
# pass mask=None (xmask is computed but unused) — the generated code relies
# on every block being fully in range for this size.
triton_poi_fused__to_copy_convolution_65 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[32768], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_convolution_65(in_out_ptr0, in_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 30720
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x2 = xindex
    x0 = xindex % 240
    tmp0 = tl.load(in_out_ptr0 + (x2), None).to(tl.float32)
    tmp1 = tl.load(in_ptr0 + (x0), None).to(tl.float32)
    tmp2 = tmp0 + tmp1
    tl.store(in_out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp2, None)
def get_args():
    arg_0 = rand_strided((128, 240, 1, 1), (240, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((240,), (1,), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_convolution_65.run(*args, 30720, grid=grid(30720), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_convolution_65.benchmark_all_configs(*args, 30720, grid=grid(30720))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/ru/crubekjujrhilkb2spkvdoiiitif5a5cisikyju6vjbn7j3itjum.py
# Original ATen: aten.mul, aten.sigmoid, aten.silu
# aten.mul => mul_80
# aten.sigmoid => sigmoid_3
# aten.silu => convert_element_type_45, convert_element_type_46, mul_78, sigmoid_1
# Pointwise gating over 24084480 (= 128 * 240 * 784) fp16 elements:
# out = silu(in_ptr0) * sigmoid(in_ptr1), where in_ptr1 holds one gate value
# per (batch, channel) row (indexed by xindex // 784) broadcast across the
# 784 spatial positions.  Loads/stores are unmasked (mask=None).
triton_poi_fused_mul_sigmoid_silu_66 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[33554432], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3), equal_to_1=())]})
@triton.jit
def triton_poi_fused_mul_sigmoid_silu_66(in_ptr0, in_ptr1, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 24084480
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x2 = xindex
    x1 = (xindex // 784)
    tmp0 = tl.load(in_ptr0 + (x2), None).to(tl.float32)
    tmp5 = tl.load(in_ptr1 + (x1), None).to(tl.float32)
    tmp1 = tmp0.to(tl.float32)
    tmp2 = tl.sigmoid(tmp1)
    tmp3 = tmp1 * tmp2
    tmp4 = tmp3.to(tl.float32)
    tmp6 = tl.sigmoid(tmp5)
    tmp7 = tmp4 * tmp6
    tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp7, None)
def get_args():
    arg_0 = rand_strided((128, 240, 28, 28), (188160, 784, 28, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 240, 1, 1), (240, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    arg_2 = rand_strided((128, 240, 28, 28), (188160, 784, 28, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1, arg_2,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused_mul_sigmoid_silu_66.run(*args, 24084480, grid=grid(24084480), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused_mul_sigmoid_silu_66.benchmark_all_configs(*args, 24084480, grid=grid(24084480))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/va/cvacncosevkh7a4e4mjytw5gvejhihlmvkwma42dwu2nfhypnbzr.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_53
# Pointwise dtype cast: copies 13440 fp32 values to fp16 (per get_args, a
# (56, 240, 1, 1) tensor).
triton_poi_fused__to_copy_67 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[16384], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_67(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 13440
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
    arg_0 = rand_strided((56, 240, 1, 1), (240, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((56, 240, 1, 1), (240, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_67.run(*args, 13440, grid=grid(13440), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_67.benchmark_all_configs(*args, 13440, grid=grid(13440))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/hv/chvkvxc6xaoyzdfyhbbbolrujmg4wuzt7gkh6x4thkny6u2hr4gg.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => convert_element_type_54, var_mean_11
# Stage 1 of a split batch-norm mean reduction: the 100352 (= 128 * 784)
# values of each of the 56 channels are divided into 13 chunks of up to 7720
# elements (728 = 56 * 13 programs total); each program sums its chunk in
# fp32 into a (1, 56, 1, 1, 13) partial-sum buffer.  The tmp2 guard masks
# chunk positions past the 100352-element channel boundary.
triton_red_fused__native_batch_norm_legit_functional_68 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@reduction(
    size_hints=[1024, 8192],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32', 3: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_68(in_ptr0, out_ptr0, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
    xnumel = 728
    rnumel = 7720
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rbase = tl.arange(0, RBLOCK)[None, :]
    x1 = (xindex // 56)
    x0 = xindex % 56
    _tmp6 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
    x3 = xindex
    for roffset in range(0, rnumel, RBLOCK):
        rindex = roffset + rbase
        rmask = rindex < rnumel
        r2 = rindex
        tmp0 = r2 + (7720*x1)
        tmp1 = 100352
        tmp2 = tmp0 < tmp1
        tmp3 = tl.load(in_ptr0 + ((784*x0) + (43904*(((r2 + (7720*x1)) // 784) % 128)) + ((r2 + (7720*x1)) % 784) + tl.zeros([XBLOCK, RBLOCK], tl.int32)), rmask & tmp2 & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp4 = tmp3.to(tl.float32)
        tmp5 = tl.where(tmp2, tmp4, 0)
        _tmp6 = tl.where(rmask & xmask, _tmp6 + tmp5, _tmp6)
    tmp6 = tl.sum(_tmp6, 1)[:, None]
    tl.store(out_ptr0 + x3, tmp6, xmask)
def get_args():
    arg_0 = rand_strided((128, 56, 28, 28), (43904, 784, 28, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((1, 56, 1, 1, 13), (728, 1, 728, 728, 56), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_red_fused__native_batch_norm_legit_functional_68.run(*args, 728, 7720, grid=grid(728), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_red_fused__native_batch_norm_legit_functional_68.benchmark_all_configs(*args, 728, 7720, grid=grid(728))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/ih/cihgiw3fxfncw6wmlie2fnq4egrnxl6aq2aj37mt4r5kpoemk2yw.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => add_59, convert_element_type_54, mul_82, mul_83, var_mean_11
# Stage 2 of the split batch-norm mean reduction for 56 channels: sums the 13
# per-channel partial sums (in_ptr0), divides by 100352 to get the batch mean
# (written in place to in_out_ptr0), and writes the running-stat update
# 0.1 * mean + 0.9 * in_ptr1 to out_ptr0.
triton_per_fused__native_batch_norm_legit_functional_69 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import persistent_reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@persistent_reduction(
    size_hints=[64, 16],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*fp32', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: 'i32', 5: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3), equal_to_1=())]}
)
@triton.jit
def triton_per_fused__native_batch_norm_legit_functional_69(in_out_ptr0, in_ptr0, in_ptr1, out_ptr0, xnumel, rnumel, XBLOCK : tl.constexpr):
    xnumel = 56
    rnumel = 13
    RBLOCK: tl.constexpr = 16
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rindex = tl.arange(0, RBLOCK)[None, :]
    rmask = rindex < rnumel
    r1 = rindex
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0 + (56*r1)), rmask & xmask, other=0)
    tmp8 = tl.load(in_ptr1 + (x0), xmask)
    tmp2 = tl.where(rmask & xmask, tmp0, 0)
    tmp3 = tl.sum(tmp2, 1)[:, None]
    tmp4 = 100352.0
    tmp5 = tmp3 / tmp4
    tmp6 = 0.1
    tmp7 = tmp5 * tmp6
    tmp9 = 0.9
    tmp10 = tmp8 * tmp9
    tmp11 = tmp7 + tmp10
    tl.store(in_out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp5, xmask)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp11, xmask)
def get_args():
    arg_0 = rand_strided((1, 56, 1, 1), (56, 1, 56, 56), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((1, 56, 1, 1, 13), (728, 1, 728, 728, 56), device='cuda:0', dtype=torch.float32)
    arg_2 = rand_strided((56,), (1,), device='cuda:0', dtype=torch.float32)
    arg_3 = rand_strided((56,), (1,), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1, arg_2, arg_3,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_per_fused__native_batch_norm_legit_functional_69.run(*args, 56, 13, grid=grid(56), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_per_fused__native_batch_norm_legit_functional_69.benchmark_all_configs(*args, 56, 13, grid=grid(56))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/n3/cn3m3v5dwzlicmheht7t2sqzrzgtbufmioizlp6yemcka6kewuok.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => convert_element_type_54, var_mean_11
# Stage 1 of the split batch-norm variance reduction (companion of kernel 68):
# same 56-channel x 13-chunk layout, but accumulates sum((x - mean)^2) per
# chunk, reading the per-channel mean from in_ptr1 and writing the (1, 56,
# 1, 1, 13) fp32 partial sums to out_ptr0.
triton_red_fused__native_batch_norm_legit_functional_70 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@reduction(
    size_hints=[1024, 8192],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*fp16', 1: '*fp32', 2: '*fp32', 3: 'i32', 4: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_70(in_ptr0, in_ptr1, out_ptr0, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
    xnumel = 728
    rnumel = 7720
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rbase = tl.arange(0, RBLOCK)[None, :]
    x1 = (xindex // 56)
    x0 = xindex % 56
    _tmp9 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
    x3 = xindex
    for roffset in range(0, rnumel, RBLOCK):
        rindex = roffset + rbase
        rmask = rindex < rnumel
        r2 = rindex
        tmp0 = r2 + (7720*x1)
        tmp1 = 100352
        tmp2 = tmp0 < tmp1
        tmp3 = tl.load(in_ptr0 + ((784*x0) + (43904*(((r2 + (7720*x1)) // 784) % 128)) + ((r2 + (7720*x1)) % 784) + tl.zeros([XBLOCK, RBLOCK], tl.int32)), rmask & tmp2 & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp4 = tmp3.to(tl.float32)
        tmp5 = tl.load(in_ptr1 + (x0 + tl.zeros([XBLOCK, RBLOCK], tl.int32)), rmask & tmp2 & xmask, eviction_policy='evict_last', other=0)
        tmp6 = tmp4 - tmp5
        tmp7 = tmp6 * tmp6
        tmp8 = tl.where(tmp2, tmp7, 0)
        _tmp9 = tl.where(rmask & xmask, _tmp9 + tmp8, _tmp9)
    tmp9 = tl.sum(_tmp9, 1)[:, None]
    tl.store(out_ptr0 + x3, tmp9, xmask)
def get_args():
    arg_0 = rand_strided((128, 56, 28, 28), (43904, 784, 28, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((1, 56, 1, 1), (56, 1, 56, 56), device='cuda:0', dtype=torch.float32)
    arg_2 = rand_strided((1, 56, 1, 1, 13), (728, 1, 728, 728, 56), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1, arg_2,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_red_fused__native_batch_norm_legit_functional_70.run(*args, 728, 7720, grid=grid(728), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_red_fused__native_batch_norm_legit_functional_70.benchmark_all_configs(*args, 728, 7720, grid=grid(728))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/xv/cxvq5i34k2bihobv4ebll2eevqva3mot7bow45ecoiyqxocbuesz.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => add_58, add_60, convert_element_type_54, mul_84, mul_85, mul_86, rsqrt_11, squeeze_34, var_mean_11
#
# NOTE(review): auto-generated TorchInductor kernel + benchmark harness.  The
# Triton source is a runtime-compiled string literal and is left untouched;
# notes are kept out here.
#
# Stage 2 of the split variance reduction started by the `_70` kernel above.
# For each of the 56 channels it is a persistent reduction over the 13
# partial sums (rnumel = 13, padded to RBLOCK = 16 and masked by rmask):
#   * out_ptr0 = raw sum of squared deviations per channel
#   * out_ptr1 = rsqrt(sum/100352 + 1e-05)  (inverse std with eps)
#   * out_ptr2 = 0.1 * (var * 1.00000996502277) + 0.9 * in_ptr1
# The constant 1.00000996502277 appears to be the unbiased-variance
# correction n/(n-1) = 100352/100351 for the running-stat update — TODO
# confirm against the original batch-norm lowering.
triton_per_fused__native_batch_norm_legit_functional_71 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import persistent_reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@persistent_reduction(
    size_hints=[64, 16],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*fp32', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: 'i32', 6: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4), equal_to_1=())]}
)
@triton.jit
def triton_per_fused__native_batch_norm_legit_functional_71(in_ptr0, in_ptr1, out_ptr0, out_ptr1, out_ptr2, xnumel, rnumel, XBLOCK : tl.constexpr):
    xnumel = 56
    rnumel = 13
    RBLOCK: tl.constexpr = 16
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rindex = tl.arange(0, RBLOCK)[None, :]
    rmask = rindex < rnumel
    r1 = rindex
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0 + (56*r1)), rmask & xmask, other=0)
    tmp13 = tl.load(in_ptr1 + (x0), xmask)
    tmp2 = tl.where(rmask & xmask, tmp0, 0)
    tmp3 = tl.sum(tmp2, 1)[:, None]
    tmp4 = 100352.0
    tmp5 = tmp3 / tmp4
    tmp6 = 1e-05
    tmp7 = tmp5 + tmp6
    tmp8 = tl.math.rsqrt(tmp7)
    tmp9 = 1.00000996502277
    tmp10 = tmp5 * tmp9
    tmp11 = 0.1
    tmp12 = tmp10 * tmp11
    tmp14 = 0.9
    tmp15 = tmp13 * tmp14
    tmp16 = tmp12 + tmp15
    tl.store(out_ptr1 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp8, xmask)
    tl.store(out_ptr2 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp16, xmask)
    tl.store(out_ptr0 + x0, tmp3, xmask)
def get_args():
    arg_0 = rand_strided((1, 56, 1, 1, 13), (728, 1, 728, 728, 56), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((56,), (1,), device='cuda:0', dtype=torch.float32)
    arg_2 = rand_strided((1, 56, 1, 1), (56, 1, 56, 56), device='cuda:0', dtype=torch.float32)
    arg_3 = rand_strided((56,), (1,), device='cuda:0', dtype=torch.float32)
    arg_4 = rand_strided((56,), (1,), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1, arg_2, arg_3, arg_4,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_per_fused__native_batch_norm_legit_functional_71.run(*args, 56, 13, grid=grid(56), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_per_fused__native_batch_norm_legit_functional_71.benchmark_all_configs(*args, 56, 13, grid=grid(56))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/jw/cjwfycxx2mrgx2fsw6tfls7yyigmo7hmavgpkr2ob23aopbzsnjc.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => add_58, add_61, convert_element_type_54, convert_element_type_55, mul_81, mul_87, rsqrt_11, sub_11, var_mean_11
#
# NOTE(review): auto-generated TorchInductor kernel + benchmark harness; the
# runtime-compiled Triton string is left byte-identical.
#
# Pointwise batch-norm apply over 5619712 = 128*56*28*28 elements:
#   y = (x - mean) * rsqrt(sum_sq/100352 + 1e-05) * weight + bias
# Input/output are fp16 (in_ptr0 / out_ptr0); the arithmetic is done in fp32.
# in_ptr1 is the per-channel mean, in_ptr2 the per-channel sum of squared
# deviations (divided by 100352 here rather than reusing the precomputed
# rsqrt), in_ptr3/in_ptr4 the affine weight/bias.  `xmask` is computed but all
# loads/stores pass mask None — presumably safe because xnumel is covered
# exactly by the launch grid; TODO confirm this is the intended codegen.
triton_poi_fused__native_batch_norm_legit_functional_72 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[8388608], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp16', 6: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6), equal_to_1=())]})
@triton.jit
def triton_poi_fused__native_batch_norm_legit_functional_72(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 5619712
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x3 = xindex
    x1 = (xindex // 784) % 56
    tmp0 = tl.load(in_ptr0 + (x3), None).to(tl.float32)
    tmp2 = tl.load(in_ptr1 + (x1), None)
    tmp4 = tl.load(in_ptr2 + (x1), None)
    tmp11 = tl.load(in_ptr3 + (x1), None)
    tmp13 = tl.load(in_ptr4 + (x1), None)
    tmp1 = tmp0.to(tl.float32)
    tmp3 = tmp1 - tmp2
    tmp5 = 100352.0
    tmp6 = tmp4 / tmp5
    tmp7 = 1e-05
    tmp8 = tmp6 + tmp7
    tmp9 = tl.math.rsqrt(tmp8)
    tmp10 = tmp3 * tmp9
    tmp12 = tmp10 * tmp11
    tmp14 = tmp12 + tmp13
    tmp15 = tmp14.to(tl.float32)
    tl.store(out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp15, None)
def get_args():
    arg_0 = rand_strided((128, 56, 28, 28), (43904, 784, 28, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((1, 56, 1, 1), (56, 1, 56, 56), device='cuda:0', dtype=torch.float32)
    arg_2 = rand_strided((1, 56, 1, 1), (56, 1, 56, 56), device='cuda:0', dtype=torch.float32)
    arg_3 = rand_strided((56,), (1,), device='cuda:0', dtype=torch.float32)
    arg_4 = rand_strided((56,), (1,), device='cuda:0', dtype=torch.float32)
    arg_5 = rand_strided((128, 56, 28, 28), (43904, 784, 28, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__native_batch_norm_legit_functional_72.run(*args, 5619712, grid=grid(5619712), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__native_batch_norm_legit_functional_72.benchmark_all_configs(*args, 5619712, grid=grid(5619712))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/7e/c7eyndtglwpd6eg2g74cubgx2m45rn7sikd7mb4t4ucitntos24j.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_56
#
# NOTE(review): auto-generated TorchInductor kernel + benchmark harness; the
# runtime-compiled Triton string is left byte-identical.
#
# Elementwise fp32 -> fp16 cast of a contiguous (168, 28, 1, 1) tensor
# (4704 elements) — presumably a 1x1 convolution weight being downcast for
# fp16 compute; TODO confirm against the calling graph.
triton_poi_fused__to_copy_73 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[8192], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_73(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 4704
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
    arg_0 = rand_strided((168, 28, 1, 1), (28, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((168, 28, 1, 1), (28, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_73.run(*args, 4704, grid=grid(4704), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_73.benchmark_all_configs(*args, 4704, grid=grid(4704))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/g7/cg7zj3gxcfnefxp4hvh24f2ahem4qdf4cwmaspxgokny23kr7rdc.py
# Original ATen: aten.cat
# aten.cat => cat_6
#
# NOTE(review): auto-generated TorchInductor kernel + benchmark harness; the
# runtime-compiled Triton string is left byte-identical.
#
# One half of a channel-dim concat: copies a (128, 168, 28, 28) fp16 tensor
# into the first 168 channels of a (128, 336, 28, 28) fp16 output.  Per
# sample, source offset x0 (0..131711) maps to destination offset
# x0 + 263424*x1, i.e. the low half of each 336-channel slab; the other half
# is expected to be written by a sibling kernel.
triton_poi_fused_cat_74 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[33554432], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_cat_74(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 16859136
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x2 = xindex
    x0 = xindex % 131712
    x1 = (xindex // 131712)
    tmp0 = tl.load(in_ptr0 + (x2), None).to(tl.float32)
    tl.store(out_ptr0 + (x0 + (263424*x1) + tl.zeros([XBLOCK], tl.int32)), tmp0, None)
def get_args():
    arg_0 = rand_strided((128, 168, 28, 28), (131712, 784, 28, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 168, 28, 28), (263424, 784, 28, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused_cat_74.run(*args, 16859136, grid=grid(16859136), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused_cat_74.benchmark_all_configs(*args, 16859136, grid=grid(16859136))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/ea/cea2p3bterhsxcizlwjuqgqdcdagptpjutjfoisqqtcxx4bo46r2.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => add_63, add_64, add_65, convert_element_type_58, mul_89, mul_90, mul_91, mul_92, mul_93, rsqrt_12, squeeze_37, var_mean_12
#
# NOTE(review): auto-generated TorchInductor kernel + benchmark harness; the
# runtime-compiled Triton string is left byte-identical.
#
# Single-kernel batch-norm statistics for 336 channels over
# rnumel = 100352 = 128*28*28 elements each (input re-read in two passes):
#   pass 1: sum(x)          -> mean (stored in in_out_ptr0) and running-mean
#                              update 0.1*mean + 0.9*in_ptr1 (out_ptr0)
#   pass 2: sum((x-mean)^2) -> out_ptr1, then rsqrt(var + 1e-05) (out_ptr2)
#                              and running-var update using the
#                              1.00000996502277 factor — appears to be the
#                              unbiased correction 100352/100351; TODO confirm.
triton_red_fused__native_batch_norm_legit_functional_75 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@reduction(
    size_hints=[512, 131072],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*fp32', 1: '*fp16', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: 'i32', 9: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_75(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, out_ptr0, out_ptr1, out_ptr2, out_ptr3, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
    xnumel = 336
    rnumel = 100352
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rbase = tl.arange(0, RBLOCK)[None, :]
    x0 = xindex
    _tmp2 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
    for roffset in range(0, rnumel, RBLOCK):
        rindex = roffset + rbase
        rmask = rindex < rnumel
        r1 = rindex % 784
        r2 = (rindex // 784)
        tmp0 = tl.load(in_ptr0 + (r1 + (784*x0) + (263424*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp1 = tmp0.to(tl.float32)
        _tmp2 = tl.where(rmask & xmask, _tmp2 + tmp1, _tmp2)
    tmp2 = tl.sum(_tmp2, 1)[:, None]
    tmp7 = tl.load(in_ptr1 + (x0), xmask)
    tmp3 = 100352.0
    tmp4 = tmp2 / tmp3
    tmp5 = 0.1
    tmp6 = tmp4 * tmp5
    tmp8 = 0.9
    tmp9 = tmp7 * tmp8
    tmp10 = tmp6 + tmp9
    tl.store(in_out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp4, xmask)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp10, xmask)
    _tmp15 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
    for roffset in range(0, rnumel, RBLOCK):
        rindex = roffset + rbase
        rmask = rindex < rnumel
        r1 = rindex % 784
        r2 = (rindex // 784)
        tmp11 = tl.load(in_ptr0 + (r1 + (784*x0) + (263424*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp12 = tmp11.to(tl.float32)
        tmp13 = tmp12 - tmp4
        tmp14 = tmp13 * tmp13
        _tmp15 = tl.where(rmask & xmask, _tmp15 + tmp14, _tmp15)
    tmp15 = tl.sum(_tmp15, 1)[:, None]
    tl.store(out_ptr1 + x0, tmp15, xmask)
    tmp25 = tl.load(in_ptr2 + (x0), xmask)
    tmp16 = 100352.0
    tmp17 = tmp15 / tmp16
    tmp18 = 1e-05
    tmp19 = tmp17 + tmp18
    tmp20 = tl.math.rsqrt(tmp19)
    tmp21 = 1.00000996502277
    tmp22 = tmp17 * tmp21
    tmp23 = 0.1
    tmp24 = tmp22 * tmp23
    tmp26 = 0.9
    tmp27 = tmp25 * tmp26
    tmp28 = tmp24 + tmp27
    tl.store(out_ptr2 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp20, xmask)
    tl.store(out_ptr3 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp28, xmask)
def get_args():
    arg_0 = rand_strided((1, 336, 1, 1), (336, 1, 336, 336), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((128, 336, 28, 28), (263424, 784, 28, 1), device='cuda:0', dtype=torch.float16)
    arg_2 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32)
    arg_3 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32)
    arg_4 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32)
    arg_5 = rand_strided((1, 336, 1, 1), (336, 1, 336, 336), device='cuda:0', dtype=torch.float32)
    arg_6 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32)
    arg_7 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_red_fused__native_batch_norm_legit_functional_75.run(*args, 336, 100352, grid=grid(336), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_red_fused__native_batch_norm_legit_functional_75.benchmark_all_configs(*args, 336, 100352, grid=grid(336))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/3o/c3odnwpkqtnjm6osw5x7eu5soiuoiv27n2m7cqobgwum2f4c5rgk.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.add, aten.clone, aten.fill, aten.mul, aten.sigmoid, aten.sub
# aten._native_batch_norm_legit_functional => add_63, add_66, convert_element_type_58, convert_element_type_59, mul_88, mul_94, rsqrt_12, sub_12, var_mean_12
# aten.add => add_374
# aten.clone => clone_3
# aten.fill => full_like_44
# aten.mul => mul_1067, mul_1068
# aten.sigmoid => sigmoid_108
# aten.sub => sub_297
#
# NOTE(review): auto-generated TorchInductor kernel + benchmark harness; the
# runtime-compiled Triton string is left byte-identical.
#
# Fused pointwise pass over 33718272 = 128*336*28*28 elements:
#   y  = (x - mean) * rsqrt(sum_sq/100352 + 1e-05) * weight + bias   (fp16,
#        stored to out_ptr0)
#   d  = sigmoid(y) * (y * (1 - sigmoid(y)) + 1)                      (fp16,
#        stored to out_ptr1)
# The second expression is the analytic derivative of SiLU
# (d/dy of y*sigmoid(y)), consistent with the fill/mul/sigmoid/sub ops in the
# original graph — presumably saved for the backward pass.
triton_poi_fused__native_batch_norm_legit_functional_add_clone_fill_mul_sigmoid_sub_76 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[67108864], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp16', 6: '*fp16', 7: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7), equal_to_1=())]})
@triton.jit
def triton_poi_fused__native_batch_norm_legit_functional_add_clone_fill_mul_sigmoid_sub_76(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, out_ptr0, out_ptr1, xnumel, XBLOCK : tl.constexpr):
    xnumel = 33718272
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x3 = xindex
    x1 = (xindex // 784) % 336
    tmp0 = tl.load(in_ptr0 + (x3), None).to(tl.float32)
    tmp2 = tl.load(in_ptr1 + (x1), None)
    tmp4 = tl.load(in_ptr2 + (x1), None)
    tmp11 = tl.load(in_ptr3 + (x1), None)
    tmp13 = tl.load(in_ptr4 + (x1), None)
    tmp1 = tmp0.to(tl.float32)
    tmp3 = tmp1 - tmp2
    tmp5 = 100352.0
    tmp6 = tmp4 / tmp5
    tmp7 = 1e-05
    tmp8 = tmp6 + tmp7
    tmp9 = tl.math.rsqrt(tmp8)
    tmp10 = tmp3 * tmp9
    tmp12 = tmp10 * tmp11
    tmp14 = tmp12 + tmp13
    tmp15 = tmp14.to(tl.float32)
    tmp16 = tl.sigmoid(tmp15)
    tmp17 = 1.0
    tmp18 = tmp17 - tmp16
    tmp19 = tmp15 * tmp18
    tmp20 = tmp19 + tmp17
    tmp21 = tmp16 * tmp20
    tl.store(out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp15, None)
    tl.store(out_ptr1 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp21, None)
def get_args():
    arg_0 = rand_strided((128, 336, 28, 28), (263424, 784, 28, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((1, 336, 1, 1), (336, 1, 336, 336), device='cuda:0', dtype=torch.float32)
    arg_2 = rand_strided((1, 336, 1, 1), (336, 1, 336, 336), device='cuda:0', dtype=torch.float32)
    arg_3 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32)
    arg_4 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32)
    arg_5 = rand_strided((128, 336, 28, 28), (263424, 784, 28, 1), device='cuda:0', dtype=torch.float16)
    arg_6 = rand_strided((128, 336, 28, 28), (263424, 784, 28, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__native_batch_norm_legit_functional_add_clone_fill_mul_sigmoid_sub_76.run(*args, 33718272, grid=grid(33718272), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__native_batch_norm_legit_functional_add_clone_fill_mul_sigmoid_sub_76.benchmark_all_configs(*args, 33718272, grid=grid(33718272))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/nw/cnwbbjysyivhwwej3cvtzavdj6jvinxuti5ndhcg3qzuf77glcrt.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_62
#
# NOTE(review): auto-generated TorchInductor kernel + benchmark harness; the
# runtime-compiled Triton string is left byte-identical.
#
# Elementwise fp32 -> fp16 cast of a contiguous (168, 1, 3, 3) tensor
# (1512 elements) — presumably a depthwise 3x3 convolution weight; TODO
# confirm against the calling graph.
triton_poi_fused__to_copy_77 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[2048], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_77(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 1512
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
    arg_0 = rand_strided((168, 1, 3, 3), (9, 9, 3, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((168, 1, 3, 3), (9, 9, 3, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_77.run(*args, 1512, grid=grid(1512), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_77.benchmark_all_configs(*args, 1512, grid=grid(1512))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/fz/cfzmnx3o626lf5s63xg32x6is4tmygw4nxdhdigpofuif52vxrzm.py
# Original ATen: aten.split_with_sizes
# aten.split_with_sizes => getitem_78
#
# NOTE(review): auto-generated TorchInductor kernel + benchmark harness; the
# runtime-compiled Triton string is left byte-identical.
#
# Fused split + activation: reads the FIRST 168-channel half of each sample
# in a (128, 336, 28, 28) fp16 tensor (source offset x0 + 263424*x1, with
# x0 < 131712) and writes x * sigmoid(x) — SiLU, computed in fp32 — to a
# contiguous (128, 168, 28, 28) fp16 output.  A sibling kernel with source
# offset 131712 handles the second half of the split.
triton_poi_fused_split_with_sizes_78 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[33554432], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_split_with_sizes_78(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 16859136
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex % 131712
    x1 = (xindex // 131712)
    x2 = xindex
    tmp0 = tl.load(in_ptr0 + (x0 + (263424*x1)), None).to(tl.float32)
    tmp1 = tmp0.to(tl.float32)
    tmp2 = tl.sigmoid(tmp1)
    tmp3 = tmp1 * tmp2
    tmp4 = tmp3.to(tl.float32)
    tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp4, None)
def get_args():
    arg_0 = rand_strided((128, 336, 28, 28), (263424, 784, 28, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 168, 28, 28), (131712, 784, 28, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused_split_with_sizes_78.run(*args, 16859136, grid=grid(16859136), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused_split_with_sizes_78.benchmark_all_configs(*args, 16859136, grid=grid(16859136))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/2r/c2rv7mzu7my2opxvcm4z3utkh7llovacnk6pwuzns4ehc3sgey2e.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_63
#
# NOTE(review): auto-generated TorchInductor kernel + benchmark harness; the
# runtime-compiled Triton string is left byte-identical.
#
# Elementwise fp32 -> fp16 cast of a contiguous (168, 1, 5, 5) tensor
# (4200 elements) — presumably a depthwise 5x5 convolution weight; TODO
# confirm against the calling graph.
triton_poi_fused__to_copy_79 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[8192], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_79(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 4200
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
    arg_0 = rand_strided((168, 1, 5, 5), (25, 25, 5, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((168, 1, 5, 5), (25, 25, 5, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_79.run(*args, 4200, grid=grid(4200), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_79.benchmark_all_configs(*args, 4200, grid=grid(4200))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/pt/cptqmg7k6n2us7i5wywz2bwg6fjwqyaknishdkved5rkdpml5nw5.py
# Original ATen: aten.split_with_sizes
# aten.split_with_sizes => getitem_81
# NOTE(review): Inductor-generated kernel. The triple-quoted string below is
# standalone Triton source handed verbatim to the async compiler at import time;
# it is runtime data and must not be edited or reformatted by hand.
# What it does (per the loads/stores inside): takes the second 168-channel half
# of a (128, 336, 28, 28) fp16 tensor (load offset 131712 = 168*28*28 within
# each batch stride of 263424) and applies SiLU (x * sigmoid(x)) elementwise,
# writing a contiguous (128, 168, 28, 28) fp16 output.
triton_poi_fused_split_with_sizes_80 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[33554432], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_split_with_sizes_80(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 16859136
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex % 131712
    x1 = (xindex // 131712)
    x2 = xindex
    tmp0 = tl.load(in_ptr0 + (131712 + x0 + (263424*x1)), None).to(tl.float32)
    tmp1 = tmp0.to(tl.float32)
    tmp2 = tl.sigmoid(tmp1)
    tmp3 = tmp1 * tmp2
    tmp4 = tmp3.to(tl.float32)
    tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp4, None)
def get_args():
    arg_0 = rand_strided((128, 336, 28, 28), (263424, 784, 28, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 168, 28, 28), (131712, 784, 28, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused_split_with_sizes_80.run(*args, 16859136, grid=grid(16859136), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused_split_with_sizes_80.benchmark_all_configs(*args, 16859136, grid=grid(16859136))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/jj/cjj6wuzjfwt7xigucunb4hdetjt4mr4ldxzq5ab4kxddbetbjdu4.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.mean, aten.silu
# aten._native_batch_norm_legit_functional => add_68, add_71, convert_element_type_64, convert_element_type_65, mul_102, mul_96, rsqrt_13, sub_13, var_mean_13
# aten.mean => mean_1
# aten.silu => convert_element_type_66, convert_element_type_67, mul_103, sigmoid_5
# NOTE(review): Inductor-generated kernel. The string below is standalone Triton
# source compiled asynchronously at import time; do not edit it by hand.
# Persistent reduction over rnumel=784 (the 28x28 spatial plane) for each of the
# xnumel=43008 (= 128 batches * 336 channels) rows: applies the batch-norm affine
# transform (x - mean) * rsqrt(var/100352 + 1e-5) * weight + bias with
# per-channel (x0 = xindex % 336) statistics, stores the normalized fp16 tensor
# to out_ptr0, then computes SiLU of it and writes its spatial mean (/784.0)
# to out_ptr2 — the (128, 336, 1, 1) squeeze input.
triton_per_fused__native_batch_norm_legit_functional_mean_silu_81 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import persistent_reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@persistent_reduction(
    size_hints=[65536, 1024],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*fp16', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp16', 6: '*fp16', 7: 'i32', 8: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8), equal_to_1=())]}
)
@triton.jit
def triton_per_fused__native_batch_norm_legit_functional_mean_silu_81(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, out_ptr0, out_ptr2, xnumel, rnumel, XBLOCK : tl.constexpr):
    xnumel = 43008
    rnumel = 784
    RBLOCK: tl.constexpr = 1024
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rindex = tl.arange(0, RBLOCK)[None, :]
    rmask = rindex < rnumel
    r2 = rindex
    x3 = xindex
    x0 = xindex % 336
    tmp0 = tl.load(in_ptr0 + (r2 + (784*x3)), rmask, other=0).to(tl.float32)
    tmp2 = tl.load(in_ptr1 + (x0), None)
    tmp4 = tl.load(in_ptr2 + (x0), None)
    tmp11 = tl.load(in_ptr3 + (x0), None)
    tmp13 = tl.load(in_ptr4 + (x0), None)
    tmp1 = tmp0.to(tl.float32)
    tmp3 = tmp1 - tmp2
    tmp5 = 100352.0
    tmp6 = tmp4 / tmp5
    tmp7 = 1e-05
    tmp8 = tmp6 + tmp7
    tmp9 = tl.math.rsqrt(tmp8)
    tmp10 = tmp3 * tmp9
    tmp12 = tmp10 * tmp11
    tmp14 = tmp12 + tmp13
    tmp15 = tmp14.to(tl.float32)
    tmp16 = tmp15.to(tl.float32)
    tmp17 = tl.sigmoid(tmp16)
    tmp18 = tmp16 * tmp17
    tmp19 = tmp18.to(tl.float32)
    tmp20 = tmp19.to(tl.float32)
    tmp22 = tl.where(rmask, tmp20, 0)
    tmp23 = tl.sum(tmp22, 1)[:, None]
    tmp24 = 784.0
    tmp25 = tmp23 / tmp24
    tmp26 = tmp25.to(tl.float32)
    tl.store(out_ptr0 + (r2 + (784*x3) + tl.zeros([XBLOCK, RBLOCK], tl.int32)), tmp15, rmask)
    tl.store(out_ptr2 + (x3 + tl.zeros([XBLOCK, 1], tl.int32)), tmp26, None)
def get_args():
    arg_0 = rand_strided((128, 336, 28, 28), (263424, 784, 28, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((1, 336, 1, 1), (336, 1, 336, 336), device='cuda:0', dtype=torch.float32)
    arg_2 = rand_strided((1, 336, 1, 1), (336, 1, 336, 336), device='cuda:0', dtype=torch.float32)
    arg_3 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32)
    arg_4 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32)
    arg_5 = rand_strided((128, 336, 28, 28), (263424, 784, 28, 1), device='cuda:0', dtype=torch.float16)
    arg_6 = rand_strided((128, 336, 1, 1), (336, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_per_fused__native_batch_norm_legit_functional_mean_silu_81.run(*args, 43008, 784, grid=grid(43008), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_per_fused__native_batch_norm_legit_functional_mean_silu_81.benchmark_all_configs(*args, 43008, 784, grid=grid(43008))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/qi/cqigsniw4k2557tsg6szy4sxc76yhclqkbczo3sje6z3gkbkxim2.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_69
# NOTE(review): Inductor-generated kernel; the string is standalone Triton source
# compiled asynchronously at import time — do not edit it by hand.
# Elementwise fp32 -> fp16 cast of 9408 values (a (28, 336, 1, 1) tensor per
# get_args; presumably squeeze-excite reduce-conv weights — confirm in caller).
triton_poi_fused__to_copy_82 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[16384], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_82(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 9408
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
    arg_0 = rand_strided((28, 336, 1, 1), (336, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((28, 336, 1, 1), (336, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_82.run(*args, 9408, grid=grid(9408), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_82.benchmark_all_configs(*args, 9408, grid=grid(9408))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/qr/cqrzuzcrjaui22653nognmjx7khvfhrb54plybvtrgjuggwhd7ca.py
# Original ATen: aten._to_copy, aten.convolution
# aten._to_copy => convert_element_type_68
# aten.convolution => convolution_27
# NOTE(review): Inductor-generated kernel; the string is standalone Triton source
# compiled asynchronously at import time — do not edit it by hand.
# Elementwise fp32 -> fp16 cast of a 28-element vector (per the fused-op names,
# a bias prepared for convolution_27).
triton_poi_fused__to_copy_convolution_83 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[32], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_convolution_83(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 28
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
    arg_0 = rand_strided((28,), (1,), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((28,), (1,), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_convolution_83.run(*args, 28, grid=grid(28), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_convolution_83.benchmark_all_configs(*args, 28, grid=grid(28))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/5j/c5jenet4bcjhmp6nmplvispvr7rwhrt4fonprb4atbj2oqpb6yhc.py
# Original ATen: aten._to_copy, aten.convolution, aten.silu
# aten._to_copy => convert_element_type_68
# aten.convolution => convolution_27
# aten.silu => convert_element_type_70, convert_element_type_71, mul_104, sigmoid_6
# NOTE(review): Inductor-generated kernel; the string is standalone Triton source
# compiled asynchronously at import time — do not edit it by hand.
# Adds a per-channel 28-element bias (x0 = xindex % 28) to the (128, 28, 1, 1)
# conv output IN PLACE (in_out_ptr0 is in 'mutated_arg_names'), and also writes
# SiLU(biased value) to a separate output buffer.
triton_poi_fused__to_copy_convolution_silu_84 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4096], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_convolution_silu_84(in_out_ptr0, in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 3584
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x2 = xindex
    x0 = xindex % 28
    tmp0 = tl.load(in_out_ptr0 + (x2), xmask).to(tl.float32)
    tmp1 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
    tmp2 = tmp0 + tmp1
    tmp3 = tmp2.to(tl.float32)
    tmp4 = tl.sigmoid(tmp3)
    tmp5 = tmp3 * tmp4
    tmp6 = tmp5.to(tl.float32)
    tl.store(in_out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp2, xmask)
    tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp6, xmask)
def get_args():
    arg_0 = rand_strided((128, 28, 1, 1), (28, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((28,), (1,), device='cuda:0', dtype=torch.float16)
    arg_2 = rand_strided((128, 28, 1, 1), (28, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1, arg_2,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_convolution_silu_84.run(*args, 3584, grid=grid(3584), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_convolution_silu_84.benchmark_all_configs(*args, 3584, grid=grid(3584))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/6q/c6qndeyorjgk72ipb7s7xexoyu3w64lakoglh4m45doekpaj4mk5.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_73
# NOTE(review): Inductor-generated kernel; the string is standalone Triton source
# compiled asynchronously at import time — do not edit it by hand.
# Elementwise fp32 -> fp16 cast of 9408 values (a (336, 28, 1, 1) tensor per
# get_args; presumably squeeze-excite expand-conv weights — confirm in caller).
triton_poi_fused__to_copy_85 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[16384], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_85(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 9408
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
    arg_0 = rand_strided((336, 28, 1, 1), (28, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((336, 28, 1, 1), (28, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_85.run(*args, 9408, grid=grid(9408), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_85.benchmark_all_configs(*args, 9408, grid=grid(9408))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/e4/ce4gppgjp566e4o7bpfsqwx6ehxfdqd5kcqjzpnjkl7lictvy34j.py
# Original ATen: aten._to_copy, aten.convolution
# aten._to_copy => convert_element_type_72
# aten.convolution => convolution_28
# NOTE(review): Inductor-generated kernel; the string is standalone Triton source
# compiled asynchronously at import time — do not edit it by hand.
# Elementwise fp32 -> fp16 cast of a 336-element vector (per the fused-op names,
# a bias prepared for convolution_28).
triton_poi_fused__to_copy_convolution_86 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[512], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_convolution_86(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 336
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
    arg_0 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_convolution_86.run(*args, 336, grid=grid(336), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_convolution_86.benchmark_all_configs(*args, 336, grid=grid(336))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/6n/c6nqtrrohcuy7rc3a6jzkibcrswmbwqy32tjb2t44xhm5lv3c7ca.py
# Original ATen: aten._to_copy, aten.convolution
# aten._to_copy => convert_element_type_72
# aten.convolution => convolution_28
# NOTE(review): Inductor-generated kernel; the string is standalone Triton source
# compiled asynchronously at import time — do not edit it by hand.
# Adds a per-channel 336-element bias (x0 = xindex % 336) to a (128, 336, 1, 1)
# fp16 tensor IN PLACE (in_out_ptr0 is in 'mutated_arg_names').
triton_poi_fused__to_copy_convolution_87 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[65536], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_convolution_87(in_out_ptr0, in_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 43008
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x2 = xindex
    x0 = xindex % 336
    tmp0 = tl.load(in_out_ptr0 + (x2), None).to(tl.float32)
    tmp1 = tl.load(in_ptr0 + (x0), None).to(tl.float32)
    tmp2 = tmp0 + tmp1
    tl.store(in_out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp2, None)
def get_args():
    arg_0 = rand_strided((128, 336, 1, 1), (336, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_convolution_87.run(*args, 43008, grid=grid(43008), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_convolution_87.benchmark_all_configs(*args, 43008, grid=grid(43008))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/tg/ctg6frnekhgugkbeugokpbetskj6nj7dyov4tmhfv6akngrmr3zz.py
# Original ATen: aten.mul, aten.sigmoid, aten.silu
# aten.mul => mul_105
# aten.sigmoid => sigmoid_7
# aten.silu => convert_element_type_66, convert_element_type_67, mul_103, sigmoid_5
# NOTE(review): Inductor-generated kernel; the string is standalone Triton source
# compiled asynchronously at import time — do not edit it by hand.
# Computes out = SiLU(in0) * sigmoid(in1), where in1 has one value per
# (batch, channel) pair (x1 = xindex // 784 collapses the 28*28 spatial plane)
# broadcast over the (128, 336, 28, 28) input — the squeeze-excite style gate
# suggested by the (128, 336, 1, 1) shape of in_ptr1 in get_args.
triton_poi_fused_mul_sigmoid_silu_88 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[67108864], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3), equal_to_1=())]})
@triton.jit
def triton_poi_fused_mul_sigmoid_silu_88(in_ptr0, in_ptr1, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 33718272
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x2 = xindex
    x1 = (xindex // 784)
    tmp0 = tl.load(in_ptr0 + (x2), None).to(tl.float32)
    tmp5 = tl.load(in_ptr1 + (x1), None).to(tl.float32)
    tmp1 = tmp0.to(tl.float32)
    tmp2 = tl.sigmoid(tmp1)
    tmp3 = tmp1 * tmp2
    tmp4 = tmp3.to(tl.float32)
    tmp6 = tl.sigmoid(tmp5)
    tmp7 = tmp4 * tmp6
    tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp7, None)
def get_args():
    arg_0 = rand_strided((128, 336, 28, 28), (263424, 784, 28, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 336, 1, 1), (336, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    arg_2 = rand_strided((128, 336, 28, 28), (263424, 784, 28, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1, arg_2,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused_mul_sigmoid_silu_88.run(*args, 33718272, grid=grid(33718272), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused_mul_sigmoid_silu_88.benchmark_all_configs(*args, 33718272, grid=grid(33718272))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/uj/cujd4wpttqmud7zdg64vlcs4ef2kzahlsvrue6dsbdgqyeobae26.py
# Original ATen: aten.split_with_sizes
# aten.split_with_sizes => getitem_84
# NOTE(review): Inductor-generated kernel; the string is standalone Triton source
# compiled asynchronously at import time — do not edit it by hand.
# Plain strided copy of the FIRST 168-channel half of a (128, 336, 28, 28) fp16
# tensor (offset 0 within each batch stride of 263424) into a contiguous
# (128, 168, 28, 28) output — one half of a channel split.
triton_poi_fused_split_with_sizes_89 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[33554432], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_split_with_sizes_89(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 16859136
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex % 131712
    x1 = (xindex // 131712)
    x2 = xindex
    tmp0 = tl.load(in_ptr0 + (x0 + (263424*x1)), None).to(tl.float32)
    tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp0, None)
def get_args():
    arg_0 = rand_strided((128, 336, 28, 28), (263424, 784, 28, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 168, 28, 28), (131712, 784, 28, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused_split_with_sizes_89.run(*args, 16859136, grid=grid(16859136), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused_split_with_sizes_89.benchmark_all_configs(*args, 16859136, grid=grid(16859136))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/f3/cf3fhfcwg6xxca6ynhdijefpawpp34qudotfwxc7l4ytcq3zjpuo.py
# Original ATen: aten.split_with_sizes
# aten.split_with_sizes => getitem_85
# NOTE(review): Inductor-generated kernel; the string is standalone Triton source
# compiled asynchronously at import time — do not edit it by hand.
# Plain strided copy of the SECOND 168-channel half of a (128, 336, 28, 28)
# fp16 tensor (load offset 131712 = 168*28*28 within each batch stride of
# 263424) into a contiguous (128, 168, 28, 28) output — the sibling of
# triton_poi_fused_split_with_sizes_89 but without any activation applied.
triton_poi_fused_split_with_sizes_90 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[33554432], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_split_with_sizes_90(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 16859136
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex % 131712
    x1 = (xindex // 131712)
    x2 = xindex
    tmp0 = tl.load(in_ptr0 + (131712 + x0 + (263424*x1)), None).to(tl.float32)
    tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp0, None)
def get_args():
    arg_0 = rand_strided((128, 336, 28, 28), (263424, 784, 28, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 168, 28, 28), (131712, 784, 28, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused_split_with_sizes_90.run(*args, 16859136, grid=grid(16859136), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused_split_with_sizes_90.benchmark_all_configs(*args, 16859136, grid=grid(16859136))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/mu/cmu55cboklppuar2bfoeffop2b46e55c3whe4q3ppvxcrz2secws.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_74
# NOTE(review): Inductor-generated kernel; the string is standalone Triton source
# compiled asynchronously at import time — do not edit it by hand.
# Elementwise fp32 -> fp16 cast of 4704 values (a (28, 168, 1, 1) tensor per
# get_args; presumably 1x1 pointwise conv weights — confirm in caller).
triton_poi_fused__to_copy_91 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[8192], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_91(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 4704
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
    arg_0 = rand_strided((28, 168, 1, 1), (168, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((28, 168, 1, 1), (168, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_91.run(*args, 4704, grid=grid(4704), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_91.benchmark_all_configs(*args, 4704, grid=grid(4704))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/p7/cp7uk2n6iplc4okgdyrcwjh6klja4hwa7hip53lwkxv2iafhns22.py
# Original ATen: aten.cat
# aten.cat => cat_8
#
# Concat writer: copies a contiguous fp16 tensor (batch stride 21952) into a
# strided view of a larger buffer (batch stride 43904, per get_args), i.e. it
# fills one half of a channel-concatenated output in place. The string below
# is lazily-compiled Triton source plus a self-contained benchmark harness;
# its contents must stay byte-identical.
triton_poi_fused_cat_92 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_cat_92(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 2809856
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x2 = xindex
    x0 = xindex % 21952
    x1 = (xindex // 21952)
    tmp0 = tl.load(in_ptr0 + (x2), None).to(tl.float32)
    tl.store(out_ptr0 + (x0 + (43904*x1) + tl.zeros([XBLOCK], tl.int32)), tmp0, None)
def get_args():
    arg_0 = rand_strided((128, 28, 28, 28), (21952, 784, 28, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 28, 28, 28), (43904, 784, 28, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused_cat_92.run(*args, 2809856, grid=grid(2809856), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused_cat_92.benchmark_all_configs(*args, 2809856, grid=grid(2809856))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/go/cgo47t53ri3q3acehvndov22nzujsisq6mopaip7tcxdzexe45pe.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.add
# aten._native_batch_norm_legit_functional => add_73, add_76, convert_element_type_76, convert_element_type_77, mul_106, mul_112, rsqrt_14, sub_14, var_mean_14
# aten.add => add_77
#
# Fused batch-norm apply + residual add over a (128, 56, 28, 28) fp16 tensor:
# normalizes with per-channel mean (in_ptr1) and summed-variance (in_ptr2,
# divided by 100352 = 128*28*28 elements per channel), scales/shifts with
# gamma/beta (in_ptr3/in_ptr4), then adds the fp16 skip tensor (in_ptr5).
# Arithmetic is done in fp32 and cast back to fp16 on store. The string is
# lazily-compiled Triton source + benchmark harness; keep it byte-identical.
triton_poi_fused__native_batch_norm_legit_functional_add_93 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[8388608], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp16', 6: '*fp16', 7: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7), equal_to_1=())]})
@triton.jit
def triton_poi_fused__native_batch_norm_legit_functional_add_93(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 5619712
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x3 = xindex
    x1 = (xindex // 784) % 56
    tmp0 = tl.load(in_ptr0 + (x3), None).to(tl.float32)
    tmp2 = tl.load(in_ptr1 + (x1), None)
    tmp4 = tl.load(in_ptr2 + (x1), None)
    tmp11 = tl.load(in_ptr3 + (x1), None)
    tmp13 = tl.load(in_ptr4 + (x1), None)
    tmp16 = tl.load(in_ptr5 + (x3), None).to(tl.float32)
    tmp1 = tmp0.to(tl.float32)
    tmp3 = tmp1 - tmp2
    tmp5 = 100352.0
    tmp6 = tmp4 / tmp5
    tmp7 = 1e-05
    tmp8 = tmp6 + tmp7
    tmp9 = tl.math.rsqrt(tmp8)
    tmp10 = tmp3 * tmp9
    tmp12 = tmp10 * tmp11
    tmp14 = tmp12 + tmp13
    tmp15 = tmp14.to(tl.float32)
    tmp17 = tmp15 + tmp16
    tl.store(out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp17, None)
def get_args():
    arg_0 = rand_strided((128, 56, 28, 28), (43904, 784, 28, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((1, 56, 1, 1), (56, 1, 56, 56), device='cuda:0', dtype=torch.float32)
    arg_2 = rand_strided((1, 56, 1, 1), (56, 1, 56, 56), device='cuda:0', dtype=torch.float32)
    arg_3 = rand_strided((56,), (1,), device='cuda:0', dtype=torch.float32)
    arg_4 = rand_strided((56,), (1,), device='cuda:0', dtype=torch.float32)
    arg_5 = rand_strided((128, 56, 28, 28), (43904, 784, 28, 1), device='cuda:0', dtype=torch.float16)
    arg_6 = rand_strided((128, 56, 28, 28), (43904, 784, 28, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__native_batch_norm_legit_functional_add_93.run(*args, 5619712, grid=grid(5619712), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__native_batch_norm_legit_functional_add_93.benchmark_all_configs(*args, 5619712, grid=grid(5619712))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/cm/ccmet4jdw3a22ehivtutado4ilowri7xe4wvz55tt3jrgkttrcxr.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_122
#
# Elementwise dtype cast: 18816-element fp32 -> fp16 copy of a
# (336, 56, 1, 1) conv-weight-shaped tensor (per get_args). Same pattern as
# the other _to_copy kernels; the string is lazily-compiled Triton source plus
# a standalone benchmark harness and must stay byte-identical.
triton_poi_fused__to_copy_94 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[32768], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_94(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 18816
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
    arg_0 = rand_strided((336, 56, 1, 1), (56, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((336, 56, 1, 1), (56, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_94.run(*args, 18816, grid=grid(18816), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_94.benchmark_all_configs(*args, 18816, grid=grid(18816))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/2z/c2zspsupbpoyi6v25rww6qizjecactzymkuzhmgwbksaajrrvebj.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_127
#
# Elementwise dtype cast: 1008-element fp32 -> fp16 copy of a
# (112, 1, 3, 3) depthwise-conv-weight-shaped tensor (per get_args).
# Lazily-compiled Triton source + benchmark harness; keep the string
# byte-identical.
triton_poi_fused__to_copy_95 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[1024], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_95(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 1008
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
    arg_0 = rand_strided((112, 1, 3, 3), (9, 9, 3, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((112, 1, 3, 3), (9, 9, 3, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_95.run(*args, 1008, grid=grid(1008), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_95.benchmark_all_configs(*args, 1008, grid=grid(1008))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/vt/cvthhjpoxq63zz4mggqx2vqnhtco6vgp67jmycwwem5t4nuguc3d.py
# Original ATen: aten.split_with_sizes
# aten.split_with_sizes => getitem_125
#
# Channel split + SiLU: extracts the FIRST 112-channel slice (input offset 0)
# of a (128, 336, 28, 28) fp16 tensor and applies x * sigmoid(x) in fp32
# before storing fp16. Siblings _98 and _100 handle the second and third
# slices (offsets 87808 and 175616). Lazily-compiled Triton source +
# benchmark harness; keep the string byte-identical.
triton_poi_fused_split_with_sizes_96 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[16777216], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_split_with_sizes_96(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 11239424
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex % 87808
    x1 = (xindex // 87808)
    x2 = xindex
    tmp0 = tl.load(in_ptr0 + (x0 + (263424*x1)), None).to(tl.float32)
    tmp1 = tmp0.to(tl.float32)
    tmp2 = tl.sigmoid(tmp1)
    tmp3 = tmp1 * tmp2
    tmp4 = tmp3.to(tl.float32)
    tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp4, None)
def get_args():
    arg_0 = rand_strided((128, 336, 28, 28), (263424, 784, 28, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 112, 28, 28), (87808, 784, 28, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused_split_with_sizes_96.run(*args, 11239424, grid=grid(11239424), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused_split_with_sizes_96.benchmark_all_configs(*args, 11239424, grid=grid(11239424))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/67/c67f43otxwovlhoxcmgghbvuamebcise6w4hyt6kxev6rjigyz6t.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_128
#
# Elementwise dtype cast: 2800-element fp32 -> fp16 copy of a
# (112, 1, 5, 5) depthwise-conv-weight-shaped tensor (per get_args).
# Lazily-compiled Triton source + benchmark harness; keep the string
# byte-identical.
triton_poi_fused__to_copy_97 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4096], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_97(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 2800
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
    arg_0 = rand_strided((112, 1, 5, 5), (25, 25, 5, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((112, 1, 5, 5), (25, 25, 5, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_97.run(*args, 2800, grid=grid(2800), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_97.benchmark_all_configs(*args, 2800, grid=grid(2800))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/mp/cmpup3qxitd5cnywi6sc5ctsik3jls3ax4u6bwwg7l22muv3kqac.py
# Original ATen: aten.split_with_sizes
# aten.split_with_sizes => getitem_129
#
# Channel split + SiLU: extracts the SECOND 112-channel slice (input offset
# 87808 = 112*28*28) of a (128, 336, 28, 28) fp16 tensor and applies
# x * sigmoid(x) in fp32 before storing fp16. Siblings _96 and _100 handle
# the other slices. Lazily-compiled Triton source + benchmark harness; keep
# the string byte-identical.
triton_poi_fused_split_with_sizes_98 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[16777216], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_split_with_sizes_98(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 11239424
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex % 87808
    x1 = (xindex // 87808)
    x2 = xindex
    tmp0 = tl.load(in_ptr0 + (87808 + x0 + (263424*x1)), None).to(tl.float32)
    tmp1 = tmp0.to(tl.float32)
    tmp2 = tl.sigmoid(tmp1)
    tmp3 = tmp1 * tmp2
    tmp4 = tmp3.to(tl.float32)
    tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp4, None)
def get_args():
    arg_0 = rand_strided((128, 336, 28, 28), (263424, 784, 28, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 112, 28, 28), (87808, 784, 28, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused_split_with_sizes_98.run(*args, 11239424, grid=grid(11239424), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused_split_with_sizes_98.benchmark_all_configs(*args, 11239424, grid=grid(11239424))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/hj/chjntoqpyuayr43gwoduaxa3dcaerfocp4evoj27g5z63hwjt7yd.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_129
#
# Elementwise dtype cast: 5488-element fp32 -> fp16 copy of a
# (112, 1, 7, 7) depthwise-conv-weight-shaped tensor (per get_args).
# Lazily-compiled Triton source + benchmark harness; keep the string
# byte-identical.
triton_poi_fused__to_copy_99 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[8192], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_99(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 5488
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
    arg_0 = rand_strided((112, 1, 7, 7), (49, 49, 7, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((112, 1, 7, 7), (49, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_99.run(*args, 5488, grid=grid(5488), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_99.benchmark_all_configs(*args, 5488, grid=grid(5488))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/5y/c5yntu5zez2ni5e2qpl6ynioug5zthjmijm3yvywigtt4blzf6l6.py
# Original ATen: aten.split_with_sizes
# aten.split_with_sizes => getitem_133
#
# Channel split + SiLU: extracts the THIRD 112-channel slice (input offset
# 175616 = 2*112*28*28) of a (128, 336, 28, 28) fp16 tensor and applies
# x * sigmoid(x) in fp32 before storing fp16. Siblings _96 and _98 handle
# the other slices. Lazily-compiled Triton source + benchmark harness; keep
# the string byte-identical.
triton_poi_fused_split_with_sizes_100 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[16777216], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_split_with_sizes_100(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 11239424
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex % 87808
    x1 = (xindex // 87808)
    x2 = xindex
    tmp0 = tl.load(in_ptr0 + (175616 + x0 + (263424*x1)), None).to(tl.float32)
    tmp1 = tmp0.to(tl.float32)
    tmp2 = tl.sigmoid(tmp1)
    tmp3 = tmp1 * tmp2
    tmp4 = tmp3.to(tl.float32)
    tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp4, None)
def get_args():
    arg_0 = rand_strided((128, 336, 28, 28), (263424, 784, 28, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 112, 28, 28), (87808, 784, 28, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused_split_with_sizes_100.run(*args, 11239424, grid=grid(11239424), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused_split_with_sizes_100.benchmark_all_configs(*args, 11239424, grid=grid(11239424))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/hi/chiefsb3dple2qr2zd5ph5efd2rhpwu722ib7qhnwdnwgxusl6o7.py
# Original ATen: aten.cat
# aten.cat => cat_15
#
# Concat writer: copies a contiguous (128, 112, 14, 14) fp16 tensor (batch
# stride 21952) into a strided view of a larger buffer (batch stride 65856 =
# 336*14*14, per get_args) — one 112-channel section of a 336-channel
# concatenated output. Lazily-compiled Triton source + benchmark harness;
# keep the string byte-identical.
triton_poi_fused_cat_101 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_cat_101(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 2809856
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x2 = xindex
    x0 = xindex % 21952
    x1 = (xindex // 21952)
    tmp0 = tl.load(in_ptr0 + (x2), None).to(tl.float32)
    tl.store(out_ptr0 + (x0 + (65856*x1) + tl.zeros([XBLOCK], tl.int32)), tmp0, None)
def get_args():
    arg_0 = rand_strided((128, 112, 14, 14), (21952, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 112, 14, 14), (65856, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused_cat_101.run(*args, 2809856, grid=grid(2809856), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused_cat_101.benchmark_all_configs(*args, 2809856, grid=grid(2809856))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/hj/chj56tyq633bknygfo23skutbtotcilbxvxls3o3esvqkeosnqzu.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => add_116, add_117, add_118, convert_element_type_130, mul_172, mul_173, mul_174, mul_175, mul_176, rsqrt_22, squeeze_67, var_mean_22
#
# Two-pass batch-norm statistics reduction over a (128, 336, 14, 14) fp16
# tensor: per channel (336 programs, 25088 = 128*14*14 reduced elements) the
# first loop sums values to get the batch mean (written in-place via
# in_out_ptr0) and folds it into the running mean with momentum 0.1/0.9
# (out_ptr0); the second loop sums squared deviations (out_ptr1), from which
# it derives rsqrt(var + 1e-05) (out_ptr2) and an unbiased-scaled running-var
# update (out_ptr3; 1.0000398612827361 == 25088/25087 Bessel correction —
# presumably, verify against inductor's codegen). String is lazily-compiled
# Triton source + harness; note num_in_out_args=1 in the byte accounting
# because in_out_ptr0 is both read and written.
triton_red_fused__native_batch_norm_legit_functional_102 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@reduction(
    size_hints=[512, 32768],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*fp32', 1: '*fp16', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: 'i32', 9: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_102(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, out_ptr0, out_ptr1, out_ptr2, out_ptr3, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
    xnumel = 336
    rnumel = 25088
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rbase = tl.arange(0, RBLOCK)[None, :]
    x0 = xindex
    _tmp2 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
    for roffset in range(0, rnumel, RBLOCK):
        rindex = roffset + rbase
        rmask = rindex < rnumel
        r1 = rindex % 196
        r2 = (rindex // 196)
        tmp0 = tl.load(in_ptr0 + (r1 + (196*x0) + (65856*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp1 = tmp0.to(tl.float32)
        _tmp2 = tl.where(rmask & xmask, _tmp2 + tmp1, _tmp2)
    tmp2 = tl.sum(_tmp2, 1)[:, None]
    tmp7 = tl.load(in_ptr1 + (x0), xmask)
    tmp3 = 25088.0
    tmp4 = tmp2 / tmp3
    tmp5 = 0.1
    tmp6 = tmp4 * tmp5
    tmp8 = 0.9
    tmp9 = tmp7 * tmp8
    tmp10 = tmp6 + tmp9
    tl.store(in_out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp4, xmask)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp10, xmask)
    _tmp15 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
    for roffset in range(0, rnumel, RBLOCK):
        rindex = roffset + rbase
        rmask = rindex < rnumel
        r1 = rindex % 196
        r2 = (rindex // 196)
        tmp11 = tl.load(in_ptr0 + (r1 + (196*x0) + (65856*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp12 = tmp11.to(tl.float32)
        tmp13 = tmp12 - tmp4
        tmp14 = tmp13 * tmp13
        _tmp15 = tl.where(rmask & xmask, _tmp15 + tmp14, _tmp15)
    tmp15 = tl.sum(_tmp15, 1)[:, None]
    tl.store(out_ptr1 + x0, tmp15, xmask)
    tmp25 = tl.load(in_ptr2 + (x0), xmask)
    tmp16 = 25088.0
    tmp17 = tmp15 / tmp16
    tmp18 = 1e-05
    tmp19 = tmp17 + tmp18
    tmp20 = tl.math.rsqrt(tmp19)
    tmp21 = 1.0000398612827361
    tmp22 = tmp17 * tmp21
    tmp23 = 0.1
    tmp24 = tmp22 * tmp23
    tmp26 = 0.9
    tmp27 = tmp25 * tmp26
    tmp28 = tmp24 + tmp27
    tl.store(out_ptr2 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp20, xmask)
    tl.store(out_ptr3 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp28, xmask)
def get_args():
    arg_0 = rand_strided((1, 336, 1, 1), (336, 1, 336, 336), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((128, 336, 14, 14), (65856, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_2 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32)
    arg_3 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32)
    arg_4 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32)
    arg_5 = rand_strided((1, 336, 1, 1), (336, 1, 336, 336), device='cuda:0', dtype=torch.float32)
    arg_6 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32)
    arg_7 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_red_fused__native_batch_norm_legit_functional_102.run(*args, 336, 25088, grid=grid(336), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_red_fused__native_batch_norm_legit_functional_102.benchmark_all_configs(*args, 336, 25088, grid=grid(336))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/zj/czji7shyj4kk6bl2rcdf5euq7ku4vpm3paub7sdz5rtqhc3o2xxw.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.mean, aten.silu
# aten._native_batch_norm_legit_functional => add_116, add_119, convert_element_type_130, convert_element_type_131, mul_171, mul_177, rsqrt_22, sub_22, var_mean_22
# aten.mean => mean_4
# aten.silu => convert_element_type_132, convert_element_type_133, mul_178, sigmoid_17
# Persistent reduction over the 196 (= 14*14) spatial positions of each
# (batch, channel) row, xnumel = 128*336 = 43008 rows. Per row it:
#   1. normalizes the fp16 input (in_ptr0) with precomputed per-channel stats:
#      in_ptr1 holds the channel mean, in_ptr2 a variance sum that is divided
#      by 25088 (= 128*196 reduction elements) and rsqrt'ed with eps 1e-05;
#   2. applies per-channel scale (in_ptr3) and shift (in_ptr4), storing the
#      fp16 batch-norm output to out_ptr0;
#   3. stores the spatial mean of SiLU(BN output) to out_ptr2 — the
#      (128, 336, 1, 1) pooled tensor (per get_args) consumed downstream.
# NOTE(review): per-channel loads use mask=None and the x-store masks with
# None — the generated code assumes the grid covers xnumel exactly.
triton_per_fused__native_batch_norm_legit_functional_mean_silu_103 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import persistent_reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@persistent_reduction(
    size_hints=[65536, 256],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*fp16', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp16', 6: '*fp16', 7: 'i32', 8: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7), equal_to_1=())]}
)
@triton.jit
def triton_per_fused__native_batch_norm_legit_functional_mean_silu_103(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, out_ptr0, out_ptr2, xnumel, rnumel, XBLOCK : tl.constexpr):
    xnumel = 43008
    rnumel = 196
    RBLOCK: tl.constexpr = 256
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rindex = tl.arange(0, RBLOCK)[None, :]
    rmask = rindex < rnumel
    r2 = rindex
    x3 = xindex
    x0 = xindex % 336
    tmp0 = tl.load(in_ptr0 + (r2 + (196*x3)), rmask, other=0).to(tl.float32)
    tmp2 = tl.load(in_ptr1 + (x0), None)
    tmp4 = tl.load(in_ptr2 + (x0), None)
    tmp11 = tl.load(in_ptr3 + (x0), None)
    tmp13 = tl.load(in_ptr4 + (x0), None)
    tmp1 = tmp0.to(tl.float32)
    tmp3 = tmp1 - tmp2
    tmp5 = 25088.0
    tmp6 = tmp4 / tmp5
    tmp7 = 1e-05
    tmp8 = tmp6 + tmp7
    tmp9 = tl.math.rsqrt(tmp8)
    tmp10 = tmp3 * tmp9
    tmp12 = tmp10 * tmp11
    tmp14 = tmp12 + tmp13
    tmp15 = tmp14.to(tl.float32)
    tmp16 = tmp15.to(tl.float32)
    tmp17 = tl.sigmoid(tmp16)
    tmp18 = tmp16 * tmp17
    tmp19 = tmp18.to(tl.float32)
    tmp20 = tmp19.to(tl.float32)
    tmp22 = tl.where(rmask, tmp20, 0)
    tmp23 = tl.sum(tmp22, 1)[:, None]
    tmp24 = 196.0
    tmp25 = tmp23 / tmp24
    tmp26 = tmp25.to(tl.float32)
    tl.store(out_ptr0 + (r2 + (196*x3) + tl.zeros([XBLOCK, RBLOCK], tl.int32)), tmp15, rmask)
    tl.store(out_ptr2 + (x3 + tl.zeros([XBLOCK, 1], tl.int32)), tmp26, None)
def get_args():
    arg_0 = rand_strided((128, 336, 14, 14), (65856, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((1, 336, 1, 1), (336, 1, 336, 336), device='cuda:0', dtype=torch.float32)
    arg_2 = rand_strided((1, 336, 1, 1), (336, 1, 336, 336), device='cuda:0', dtype=torch.float32)
    arg_3 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32)
    arg_4 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32)
    arg_5 = rand_strided((128, 336, 14, 14), (65856, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_6 = rand_strided((128, 336, 1, 1), (336, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_per_fused__native_batch_norm_legit_functional_mean_silu_103.run(*args, 43008, 196, grid=grid(43008), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_per_fused__native_batch_norm_legit_functional_mean_silu_103.benchmark_all_configs(*args, 43008, 196, grid=grid(43008))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/vd/cvdkhtrpiolmsts2zro46mftpapkihzk36p6zhnrps3p4cq3krmr.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_135
# Pointwise fp32 -> fp16 cast of a 4704-element tensor — per get_args, a
# (14, 336, 1, 1) contiguous conv weight (presumably the squeeze-excite
# reduce convolution's weight; verify against the caller).
triton_poi_fused__to_copy_104 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[8192], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_104(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 4704
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
    arg_0 = rand_strided((14, 336, 1, 1), (336, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((14, 336, 1, 1), (336, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_104.run(*args, 4704, grid=grid(4704), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_104.benchmark_all_configs(*args, 4704, grid=grid(4704))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/tr/ctr74i6h7fbsohivhhoxxpx2ftnqmpxexofhzvpjxs3qwsbp6lt2.py
# Original ATen: aten._to_copy, aten.convolution
# aten._to_copy => convert_element_type_134
# aten.convolution => convolution_51
# Pointwise fp32 -> fp16 cast of a 14-element vector (per get_args a (14,)
# tensor — presumably the bias of convolution_51; the convolution itself is
# not in this kernel, only the cast feeding it).
triton_poi_fused__to_copy_convolution_105 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[16], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_convolution_105(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 14
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
    arg_0 = rand_strided((14,), (1,), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((14,), (1,), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_convolution_105.run(*args, 14, grid=grid(14), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_convolution_105.benchmark_all_configs(*args, 14, grid=grid(14))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/ko/ckojvusnljgriux3ampytpmwhilvjdtesa7avk7kubgd6p562p34.py
# Original ATen: aten._to_copy, aten.convolution, aten.silu
# aten._to_copy => convert_element_type_134
# aten.convolution => convolution_51
# aten.silu => convert_element_type_136, convert_element_type_137, mul_179, sigmoid_18
# Epilogue for convolution_51: adds the per-channel bias (in_ptr0, 14 channels)
# to the (128, 14, 1, 1) conv output IN PLACE (in_out_ptr0 is in
# mutated_arg_names), then writes SiLU(x) = x * sigmoid(x) — computed in fp32,
# stored as fp16 — to out_ptr0. xnumel = 128*14 = 1792.
triton_poi_fused__to_copy_convolution_silu_106 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[2048], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_convolution_silu_106(in_out_ptr0, in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 1792
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x2 = xindex
    x0 = xindex % 14
    tmp0 = tl.load(in_out_ptr0 + (x2), xmask).to(tl.float32)
    tmp1 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
    tmp2 = tmp0 + tmp1
    tmp3 = tmp2.to(tl.float32)
    tmp4 = tl.sigmoid(tmp3)
    tmp5 = tmp3 * tmp4
    tmp6 = tmp5.to(tl.float32)
    tl.store(in_out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp2, xmask)
    tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp6, xmask)
def get_args():
    arg_0 = rand_strided((128, 14, 1, 1), (14, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((14,), (1,), device='cuda:0', dtype=torch.float16)
    arg_2 = rand_strided((128, 14, 1, 1), (14, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1, arg_2,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_convolution_silu_106.run(*args, 1792, grid=grid(1792), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_convolution_silu_106.benchmark_all_configs(*args, 1792, grid=grid(1792))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/wd/cwdr2zuaja73on5ahrizfe4fw5rmvg66qvnjezuugxyt6mvgktuo.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_139
# Pointwise fp32 -> fp16 cast of a 4704-element tensor — per get_args a
# (336, 14, 1, 1) contiguous conv weight (presumably the squeeze-excite
# expand convolution's weight; verify against the caller).
triton_poi_fused__to_copy_107 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[8192], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_107(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 4704
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
    arg_0 = rand_strided((336, 14, 1, 1), (14, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((336, 14, 1, 1), (14, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_107.run(*args, 4704, grid=grid(4704), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_107.benchmark_all_configs(*args, 4704, grid=grid(4704))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/i4/ci4cud6xszhpzrrtirjqfbkilqlzpxjq4f4rcbjmyxesmegdihcj.py
# Original ATen: aten.mul, aten.sigmoid, aten.silu
# aten.mul => mul_180
# aten.sigmoid => sigmoid_19
# aten.silu => convert_element_type_132, convert_element_type_133, mul_178, sigmoid_17
# Squeeze-excite gating: out = SiLU(x) * sigmoid(gate), where x is the
# (128, 336, 14, 14) fp16 activation and the gate (in_ptr1) is one value per
# (batch, channel), broadcast over the 196 spatial positions via x1 = x2 // 196.
# xnumel = 128*336*196 = 8429568; loads/stores use mask=None (the computed
# xmask is unused — the generated grid covers xnumel exactly).
triton_poi_fused_mul_sigmoid_silu_108 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[16777216], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3), equal_to_1=())]})
@triton.jit
def triton_poi_fused_mul_sigmoid_silu_108(in_ptr0, in_ptr1, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 8429568
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x2 = xindex
    x1 = (xindex // 196)
    tmp0 = tl.load(in_ptr0 + (x2), None).to(tl.float32)
    tmp5 = tl.load(in_ptr1 + (x1), None).to(tl.float32)
    tmp1 = tmp0.to(tl.float32)
    tmp2 = tl.sigmoid(tmp1)
    tmp3 = tmp1 * tmp2
    tmp4 = tmp3.to(tl.float32)
    tmp6 = tl.sigmoid(tmp5)
    tmp7 = tmp4 * tmp6
    tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp7, None)
def get_args():
    arg_0 = rand_strided((128, 336, 14, 14), (65856, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 336, 1, 1), (336, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    arg_2 = rand_strided((128, 336, 14, 14), (65856, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1, arg_2,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused_mul_sigmoid_silu_108.run(*args, 8429568, grid=grid(8429568), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused_mul_sigmoid_silu_108.benchmark_all_configs(*args, 8429568, grid=grid(8429568))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/am/camchutakzc7ufjmewydlcbxzwspii4jktvvytxzvdtkijmunoki.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_140
# Pointwise fp32 -> fp16 cast of a 34944-element tensor — per get_args a
# (104, 336, 1, 1) contiguous 1x1-conv weight (presumably the block's
# projection convolution; verify against the caller).
triton_poi_fused__to_copy_109 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[65536], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_109(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 34944
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
    arg_0 = rand_strided((104, 336, 1, 1), (336, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((104, 336, 1, 1), (336, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_109.run(*args, 34944, grid=grid(34944), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_109.benchmark_all_configs(*args, 34944, grid=grid(34944))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/yx/cyxdjlki3rk4s5ktbskifaxqivcqtfxhzou2fgeig544h2budbbn.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => convert_element_type_141, var_mean_23
# First stage of a split batch-norm mean reduction over the (128, 104, 14, 14)
# fp16 activation: xnumel = 416 = 104 channels * 4 splits (x0 = channel,
# x1 = split). Each program accumulates the fp32 sum of its 6272-element slice
# (= 25088 per channel / 4 splits) into a (1, 104, 1, 1, 4) partial-sum tensor;
# kernel 111 combines the 4 partials into the channel mean.
triton_red_fused__native_batch_norm_legit_functional_110 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@reduction(
    size_hints=[512, 8192],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32', 3: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_110(in_ptr0, out_ptr0, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
    xnumel = 416
    rnumel = 6272
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rbase = tl.arange(0, RBLOCK)[None, :]
    x0 = xindex % 104
    x1 = (xindex // 104)
    _tmp2 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
    x3 = xindex
    for roffset in range(0, rnumel, RBLOCK):
        rindex = roffset + rbase
        rmask = rindex < rnumel
        r2 = rindex
        tmp0 = tl.load(in_ptr0 + ((196*x0) + (20384*(r2 // 196)) + (652288*x1) + (r2 % 196)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp1 = tmp0.to(tl.float32)
        _tmp2 = tl.where(rmask & xmask, _tmp2 + tmp1, _tmp2)
    tmp2 = tl.sum(_tmp2, 1)[:, None]
    tl.store(out_ptr0 + x3, tmp2, xmask)
def get_args():
    arg_0 = rand_strided((128, 104, 14, 14), (20384, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((1, 104, 1, 1, 4), (416, 1, 416, 416, 104), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_red_fused__native_batch_norm_legit_functional_110.run(*args, 416, 6272, grid=grid(416), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_red_fused__native_batch_norm_legit_functional_110.benchmark_all_configs(*args, 416, 6272, grid=grid(416))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/ns/cnsyz4u2t4xdpizytxz7kuybbnglpm544k5kmwi5e6mw37eoeeyp.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => add_122, convert_element_type_141, mul_182, mul_183, var_mean_23
# Second stage of the split mean reduction: combines the 4 per-channel partial
# sums from kernel 110 (in_ptr0, (1, 104, 1, 1, 4)), divides by the full
# reduction size 25088 to get the batch mean (written in place to in_out_ptr0),
# and updates the running mean with momentum 0.1:
# out_ptr0 = 0.1 * batch_mean + 0.9 * running_mean (in_ptr1).
triton_per_fused__native_batch_norm_legit_functional_111 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import persistent_reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@persistent_reduction(
    size_hints=[128, 4],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*fp32', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: 'i32', 5: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3), equal_to_1=())]}
)
@triton.jit
def triton_per_fused__native_batch_norm_legit_functional_111(in_out_ptr0, in_ptr0, in_ptr1, out_ptr0, xnumel, rnumel, XBLOCK : tl.constexpr):
    xnumel = 104
    rnumel = 4
    RBLOCK: tl.constexpr = 4
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rindex = tl.arange(0, RBLOCK)[None, :]
    rmask = rindex < rnumel
    r1 = rindex
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0 + (104*r1)), rmask & xmask, other=0)
    tmp8 = tl.load(in_ptr1 + (x0), xmask)
    tmp2 = tl.where(rmask & xmask, tmp0, 0)
    tmp3 = tl.sum(tmp2, 1)[:, None]
    tmp4 = 25088.0
    tmp5 = tmp3 / tmp4
    tmp6 = 0.1
    tmp7 = tmp5 * tmp6
    tmp9 = 0.9
    tmp10 = tmp8 * tmp9
    tmp11 = tmp7 + tmp10
    tl.store(in_out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp5, xmask)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp11, xmask)
def get_args():
    arg_0 = rand_strided((1, 104, 1, 1), (104, 1, 104, 104), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((1, 104, 1, 1, 4), (416, 1, 416, 416, 104), device='cuda:0', dtype=torch.float32)
    arg_2 = rand_strided((104,), (1,), device='cuda:0', dtype=torch.float32)
    arg_3 = rand_strided((104,), (1,), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1, arg_2, arg_3,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_per_fused__native_batch_norm_legit_functional_111.run(*args, 104, 4, grid=grid(104), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_per_fused__native_batch_norm_legit_functional_111.benchmark_all_configs(*args, 104, 4, grid=grid(104))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/vx/cvx6izhz5yqxyjo6cfzkguw47brvifvvxn3jp6fmmqpamdr6rdo7.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => convert_element_type_141, var_mean_23
# First stage of the matching split variance reduction: same (channel, split)
# layout and load pattern as kernel 110, but accumulates squared deviations
# (x - mean)^2 from the per-channel mean (in_ptr1, produced by kernel 111)
# into (1, 104, 1, 1, 4) fp32 partial sums for the second-stage combine.
triton_red_fused__native_batch_norm_legit_functional_112 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@reduction(
    size_hints=[512, 8192],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*fp16', 1: '*fp32', 2: '*fp32', 3: 'i32', 4: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_112(in_ptr0, in_ptr1, out_ptr0, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
    xnumel = 416
    rnumel = 6272
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rbase = tl.arange(0, RBLOCK)[None, :]
    x0 = xindex % 104
    x1 = (xindex // 104)
    tmp2 = tl.load(in_ptr1 + (x0), xmask)
    _tmp5 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
    x3 = xindex
    for roffset in range(0, rnumel, RBLOCK):
        rindex = roffset + rbase
        rmask = rindex < rnumel
        r2 = rindex
        tmp0 = tl.load(in_ptr0 + ((196*x0) + (20384*(r2 // 196)) + (652288*x1) + (r2 % 196)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp1 = tmp0.to(tl.float32)
        tmp3 = tmp1 - tmp2
        tmp4 = tmp3 * tmp3
        _tmp5 = tl.where(rmask & xmask, _tmp5 + tmp4, _tmp5)
    tmp5 = tl.sum(_tmp5, 1)[:, None]
    tl.store(out_ptr0 + x3, tmp5, xmask)
def get_args():
    arg_0 = rand_strided((128, 104, 14, 14), (20384, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((1, 104, 1, 1), (104, 1, 104, 104), device='cuda:0', dtype=torch.float32)
    arg_2 = rand_strided((1, 104, 1, 1, 4), (416, 1, 416, 416, 104), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1, arg_2,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_red_fused__native_batch_norm_legit_functional_112.run(*args, 416, 6272, grid=grid(416), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_red_fused__native_batch_norm_legit_functional_112.benchmark_all_configs(*args, 416, 6272, grid=grid(416))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/cy/ccy7dhumtl6nx7vjrgetby6qbjqdcsh4fen2e264olgvvilit6pl.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => add_121, add_123, convert_element_type_141, mul_184, mul_185, mul_186, rsqrt_23, squeeze_70, var_mean_23
#
# NOTE(review): auto-generated TorchInductor benchmark harness. The Triton
# source below is a runtime string literal compiled by async_compile.triton;
# do not edit its contents by hand.
# Second (persistent) stage of a split batch-norm reduction over 104 channels:
# sums 4 partial per-channel accumulators, divides by 25088.0 (the reduced
# element count), and stores rsqrt(var + 1e-05) plus a running-stat update
# blended as 0.1 * unbiased_var + 0.9 * old_running_var (factor
# 1.0000398612827361 — presumably n/(n-1) bias correction; TODO confirm).
triton_per_fused__native_batch_norm_legit_functional_113 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import persistent_reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@persistent_reduction(
    size_hints=[128, 4],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*fp32', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: 'i32', 6: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4), equal_to_1=())]}
)
@triton.jit
def triton_per_fused__native_batch_norm_legit_functional_113(in_ptr0, in_ptr1, out_ptr0, out_ptr1, out_ptr2, xnumel, rnumel, XBLOCK : tl.constexpr):
    xnumel = 104
    rnumel = 4
    RBLOCK: tl.constexpr = 4
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rindex = tl.arange(0, RBLOCK)[None, :]
    rmask = rindex < rnumel
    r1 = rindex
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0 + (104*r1)), rmask & xmask, other=0)
    tmp13 = tl.load(in_ptr1 + (x0), xmask)
    tmp2 = tl.where(rmask & xmask, tmp0, 0)
    tmp3 = tl.sum(tmp2, 1)[:, None]
    tmp4 = 25088.0
    tmp5 = tmp3 / tmp4
    tmp6 = 1e-05
    tmp7 = tmp5 + tmp6
    tmp8 = tl.math.rsqrt(tmp7)
    tmp9 = 1.0000398612827361
    tmp10 = tmp5 * tmp9
    tmp11 = 0.1
    tmp12 = tmp10 * tmp11
    tmp14 = 0.9
    tmp15 = tmp13 * tmp14
    tmp16 = tmp12 + tmp15
    tl.store(out_ptr1 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp8, xmask)
    tl.store(out_ptr2 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp16, xmask)
    tl.store(out_ptr0 + x0, tmp3, xmask)
def get_args():
    arg_0 = rand_strided((1, 104, 1, 1, 4), (416, 1, 416, 416, 104), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((104,), (1,), device='cuda:0', dtype=torch.float32)
    arg_2 = rand_strided((1, 104, 1, 1), (104, 1, 104, 104), device='cuda:0', dtype=torch.float32)
    arg_3 = rand_strided((104,), (1,), device='cuda:0', dtype=torch.float32)
    arg_4 = rand_strided((104,), (1,), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1, arg_2, arg_3, arg_4,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_per_fused__native_batch_norm_legit_functional_113.run(*args, 104, 4, grid=grid(104), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_per_fused__native_batch_norm_legit_functional_113.benchmark_all_configs(*args, 104, 4, grid=grid(104))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/xg/cxgqusw3xzpsm5ih6qyxh7qdzmvcnjj2evihvlzncrorcyi7altv.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => add_121, add_124, convert_element_type_141, convert_element_type_142, mul_181, mul_187, rsqrt_23, sub_23, var_mean_23
#
# NOTE(review): auto-generated TorchInductor benchmark harness; the Triton
# source is a runtime string literal — do not edit by hand.
# Pointwise batch-norm apply over a (128, 104, 14, 14) fp16 tensor:
# out = ((x - mean) * rsqrt(var_sum / 25088 + 1e-05)) * weight + bias,
# computed in fp32 and stored back as fp16. Per-channel params are indexed
# by x1 = (idx // 196) % 104 (196 = 14*14 spatial positions).
triton_poi_fused__native_batch_norm_legit_functional_114 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp16', 6: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6), equal_to_1=())]})
@triton.jit
def triton_poi_fused__native_batch_norm_legit_functional_114(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 2609152
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x3 = xindex
    x1 = (xindex // 196) % 104
    tmp0 = tl.load(in_ptr0 + (x3), None).to(tl.float32)
    tmp2 = tl.load(in_ptr1 + (x1), None)
    tmp4 = tl.load(in_ptr2 + (x1), None)
    tmp11 = tl.load(in_ptr3 + (x1), None)
    tmp13 = tl.load(in_ptr4 + (x1), None)
    tmp1 = tmp0.to(tl.float32)
    tmp3 = tmp1 - tmp2
    tmp5 = 25088.0
    tmp6 = tmp4 / tmp5
    tmp7 = 1e-05
    tmp8 = tmp6 + tmp7
    tmp9 = tl.math.rsqrt(tmp8)
    tmp10 = tmp3 * tmp9
    tmp12 = tmp10 * tmp11
    tmp14 = tmp12 + tmp13
    tmp15 = tmp14.to(tl.float32)
    tl.store(out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp15, None)
def get_args():
    arg_0 = rand_strided((128, 104, 14, 14), (20384, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((1, 104, 1, 1), (104, 1, 104, 104), device='cuda:0', dtype=torch.float32)
    arg_2 = rand_strided((1, 104, 1, 1), (104, 1, 104, 104), device='cuda:0', dtype=torch.float32)
    arg_3 = rand_strided((104,), (1,), device='cuda:0', dtype=torch.float32)
    arg_4 = rand_strided((104,), (1,), device='cuda:0', dtype=torch.float32)
    arg_5 = rand_strided((128, 104, 14, 14), (20384, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__native_batch_norm_legit_functional_114.run(*args, 2609152, grid=grid(2609152), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__native_batch_norm_legit_functional_114.benchmark_all_configs(*args, 2609152, grid=grid(2609152))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/hr/chrygikuwep25dgozsipotfnih6lajyye2txuo4r7put7ywkdk3t.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_143
#
# NOTE(review): auto-generated TorchInductor benchmark harness; the Triton
# source is a runtime string literal — do not edit by hand.
# Elementwise fp32 -> fp16 cast of a (312, 52, 1, 1) tensor (16224 values);
# shape suggests a 1x1 conv weight, but that is inferred from the harness
# arguments only.
triton_poi_fused__to_copy_115 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[16384], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_115(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 16224
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
    arg_0 = rand_strided((312, 52, 1, 1), (52, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((312, 52, 1, 1), (52, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_115.run(*args, 16224, grid=grid(16224), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_115.benchmark_all_configs(*args, 16224, grid=grid(16224))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/ca/ccaahgmwlqiraz4ivxpktecmbqhvntosz4cdfe5ghfjm4pe6pws6.py
# Original ATen: aten.cat
# aten.cat => cat_16
#
# NOTE(review): auto-generated TorchInductor benchmark harness; the Triton
# source is a runtime string literal — do not edit by hand.
# Channel-concat copy: moves a contiguous (128, 312, 14, 14) fp16 tensor
# (61152 elements per batch item) into an output whose per-batch stride is
# 122304 = 2 * 61152, i.e. the first half of a 624-channel concatenated
# result; another kernel presumably fills the second half.
triton_poi_fused_cat_116 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[8388608], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_cat_116(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 7827456
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x2 = xindex
    x0 = xindex % 61152
    x1 = (xindex // 61152)
    tmp0 = tl.load(in_ptr0 + (x2), None).to(tl.float32)
    tl.store(out_ptr0 + (x0 + (122304*x1) + tl.zeros([XBLOCK], tl.int32)), tmp0, None)
def get_args():
    arg_0 = rand_strided((128, 312, 14, 14), (61152, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 312, 14, 14), (122304, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused_cat_116.run(*args, 7827456, grid=grid(7827456), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused_cat_116.benchmark_all_configs(*args, 7827456, grid=grid(7827456))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/j3/cj3tf4apq2n25yzonidfbwyd3cn3jiyqodrw5hjo73xovhn5czvu.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => add_126, add_127, add_128, convert_element_type_145, mul_189, mul_190, mul_191, mul_192, mul_193, rsqrt_24, squeeze_73, var_mean_24
#
# NOTE(review): auto-generated TorchInductor benchmark harness; the Triton
# source is a runtime string literal — do not edit by hand.
# Single-kernel two-pass batch-norm statistics over 624 channels of a
# (128, 624, 14, 14) fp16 input (25088 = 128*196 reduced elements each):
# pass 1 sums -> mean (written in place to in_out_ptr0) and updates
# running_mean (0.1 * mean + 0.9 * old); pass 2 re-reads the input to sum
# squared deviations -> rsqrt(var + 1e-05) and the running_var update
# (1.0000398612827361 is presumably the n/(n-1) bias factor; TODO confirm).
# Note the benchmark passes num_in_out_args=1 because in_out_ptr0 is mutated.
triton_red_fused__native_batch_norm_legit_functional_117 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@reduction(
    size_hints=[1024, 32768],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*fp32', 1: '*fp16', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: 'i32', 9: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_117(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, out_ptr0, out_ptr1, out_ptr2, out_ptr3, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
    xnumel = 624
    rnumel = 25088
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rbase = tl.arange(0, RBLOCK)[None, :]
    x0 = xindex
    _tmp2 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
    for roffset in range(0, rnumel, RBLOCK):
        rindex = roffset + rbase
        rmask = rindex < rnumel
        r1 = rindex % 196
        r2 = (rindex // 196)
        tmp0 = tl.load(in_ptr0 + (r1 + (196*x0) + (122304*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp1 = tmp0.to(tl.float32)
        _tmp2 = tl.where(rmask & xmask, _tmp2 + tmp1, _tmp2)
    tmp2 = tl.sum(_tmp2, 1)[:, None]
    tmp7 = tl.load(in_ptr1 + (x0), xmask)
    tmp3 = 25088.0
    tmp4 = tmp2 / tmp3
    tmp5 = 0.1
    tmp6 = tmp4 * tmp5
    tmp8 = 0.9
    tmp9 = tmp7 * tmp8
    tmp10 = tmp6 + tmp9
    tl.store(in_out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp4, xmask)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp10, xmask)
    _tmp15 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
    for roffset in range(0, rnumel, RBLOCK):
        rindex = roffset + rbase
        rmask = rindex < rnumel
        r1 = rindex % 196
        r2 = (rindex // 196)
        tmp11 = tl.load(in_ptr0 + (r1 + (196*x0) + (122304*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp12 = tmp11.to(tl.float32)
        tmp13 = tmp12 - tmp4
        tmp14 = tmp13 * tmp13
        _tmp15 = tl.where(rmask & xmask, _tmp15 + tmp14, _tmp15)
    tmp15 = tl.sum(_tmp15, 1)[:, None]
    tl.store(out_ptr1 + x0, tmp15, xmask)
    tmp25 = tl.load(in_ptr2 + (x0), xmask)
    tmp16 = 25088.0
    tmp17 = tmp15 / tmp16
    tmp18 = 1e-05
    tmp19 = tmp17 + tmp18
    tmp20 = tl.math.rsqrt(tmp19)
    tmp21 = 1.0000398612827361
    tmp22 = tmp17 * tmp21
    tmp23 = 0.1
    tmp24 = tmp22 * tmp23
    tmp26 = 0.9
    tmp27 = tmp25 * tmp26
    tmp28 = tmp24 + tmp27
    tl.store(out_ptr2 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp20, xmask)
    tl.store(out_ptr3 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp28, xmask)
def get_args():
    arg_0 = rand_strided((1, 624, 1, 1), (624, 1, 624, 624), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_2 = rand_strided((624,), (1,), device='cuda:0', dtype=torch.float32)
    arg_3 = rand_strided((624,), (1,), device='cuda:0', dtype=torch.float32)
    arg_4 = rand_strided((624,), (1,), device='cuda:0', dtype=torch.float32)
    arg_5 = rand_strided((1, 624, 1, 1), (624, 1, 624, 624), device='cuda:0', dtype=torch.float32)
    arg_6 = rand_strided((624,), (1,), device='cuda:0', dtype=torch.float32)
    arg_7 = rand_strided((624,), (1,), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_red_fused__native_batch_norm_legit_functional_117.run(*args, 624, 25088, grid=grid(624), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_red_fused__native_batch_norm_legit_functional_117.benchmark_all_configs(*args, 624, 25088, grid=grid(624))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/yk/cyk2njvgthyqkaax3hrogqqjnq7t4z4h2bdza6wc2nyqhlzwiqur.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.add, aten.clone, aten.fill, aten.mul, aten.sigmoid, aten.sub
# aten._native_batch_norm_legit_functional => add_126, add_129, convert_element_type_145, convert_element_type_146, mul_188, mul_194, rsqrt_24, sub_24, var_mean_24
# aten.add => add_355
# aten.clone => clone_15
# aten.fill => full_like_32
# aten.mul => mul_907, mul_908
# aten.sigmoid => sigmoid_96
# aten.sub => sub_233
#
# NOTE(review): auto-generated TorchInductor benchmark harness; the Triton
# source is a runtime string literal — do not edit by hand.
# Fused pointwise kernel over (128, 624, 14, 14) fp16 data: out_ptr0 gets
# the batch-norm result y = (x - mean) * rsqrt(var_sum/25088 + 1e-05) * w + b;
# out_ptr1 gets sigmoid(y) * (y * (1 - sigmoid(y)) + 1), which matches the
# SiLU/Swish derivative expression — presumably saved for the backward pass
# (consistent with the fill/sub/mul/sigmoid ATen ops listed above).
triton_poi_fused__native_batch_norm_legit_functional_add_clone_fill_mul_sigmoid_sub_118 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[16777216], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp16', 6: '*fp16', 7: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7), equal_to_1=())]})
@triton.jit
def triton_poi_fused__native_batch_norm_legit_functional_add_clone_fill_mul_sigmoid_sub_118(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, out_ptr0, out_ptr1, xnumel, XBLOCK : tl.constexpr):
    xnumel = 15654912
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x3 = xindex
    x1 = (xindex // 196) % 624
    tmp0 = tl.load(in_ptr0 + (x3), None).to(tl.float32)
    tmp2 = tl.load(in_ptr1 + (x1), None)
    tmp4 = tl.load(in_ptr2 + (x1), None)
    tmp11 = tl.load(in_ptr3 + (x1), None)
    tmp13 = tl.load(in_ptr4 + (x1), None)
    tmp1 = tmp0.to(tl.float32)
    tmp3 = tmp1 - tmp2
    tmp5 = 25088.0
    tmp6 = tmp4 / tmp5
    tmp7 = 1e-05
    tmp8 = tmp6 + tmp7
    tmp9 = tl.math.rsqrt(tmp8)
    tmp10 = tmp3 * tmp9
    tmp12 = tmp10 * tmp11
    tmp14 = tmp12 + tmp13
    tmp15 = tmp14.to(tl.float32)
    tmp16 = tl.sigmoid(tmp15)
    tmp17 = 1.0
    tmp18 = tmp17 - tmp16
    tmp19 = tmp15 * tmp18
    tmp20 = tmp19 + tmp17
    tmp21 = tmp16 * tmp20
    tl.store(out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp15, None)
    tl.store(out_ptr1 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp21, None)
def get_args():
    arg_0 = rand_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((1, 624, 1, 1), (624, 1, 624, 624), device='cuda:0', dtype=torch.float32)
    arg_2 = rand_strided((1, 624, 1, 1), (624, 1, 624, 624), device='cuda:0', dtype=torch.float32)
    arg_3 = rand_strided((624,), (1,), device='cuda:0', dtype=torch.float32)
    arg_4 = rand_strided((624,), (1,), device='cuda:0', dtype=torch.float32)
    arg_5 = rand_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_6 = rand_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__native_batch_norm_legit_functional_add_clone_fill_mul_sigmoid_sub_118.run(*args, 15654912, grid=grid(15654912), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__native_batch_norm_legit_functional_add_clone_fill_mul_sigmoid_sub_118.benchmark_all_configs(*args, 15654912, grid=grid(15654912))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/h4/ch4fzflvelcyupsdemcjv4coogzqeqjolfo3rcegkby4xe2z7cgp.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_149
#
# NOTE(review): auto-generated TorchInductor benchmark harness; the Triton
# source is a runtime string literal — do not edit by hand.
# Elementwise fp32 -> fp16 cast of a (156, 1, 3, 3) tensor (1404 values);
# shape suggests a depthwise 3x3 conv weight, inferred from the harness
# arguments only.
triton_poi_fused__to_copy_119 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[2048], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_119(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 1404
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
    arg_0 = rand_strided((156, 1, 3, 3), (9, 9, 3, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((156, 1, 3, 3), (9, 9, 3, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_119.run(*args, 1404, grid=grid(1404), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_119.benchmark_all_configs(*args, 1404, grid=grid(1404))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/ee/ceehri7sldlkmfyc53ikeryr64g77qkjawr2ddxdwntxstm4glbe.py
# Original ATen: aten.split_with_sizes
# aten.split_with_sizes => getitem_146
#
# NOTE(review): auto-generated TorchInductor benchmark harness; the Triton
# source is a runtime string literal — do not edit by hand.
# Fused channel-split + activation: reads the first 156-channel slice
# (per-batch offset 0 out of stride 122304) of a (128, 624, 14, 14) fp16
# tensor and applies x * sigmoid(x) (SiLU) in fp32 before storing fp16.
triton_poi_fused_split_with_sizes_120 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_split_with_sizes_120(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 3913728
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex % 30576
    x1 = (xindex // 30576)
    x2 = xindex
    tmp0 = tl.load(in_ptr0 + (x0 + (122304*x1)), None).to(tl.float32)
    tmp1 = tmp0.to(tl.float32)
    tmp2 = tl.sigmoid(tmp1)
    tmp3 = tmp1 * tmp2
    tmp4 = tmp3.to(tl.float32)
    tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp4, None)
def get_args():
    arg_0 = rand_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 156, 14, 14), (30576, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused_split_with_sizes_120.run(*args, 3913728, grid=grid(3913728), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused_split_with_sizes_120.benchmark_all_configs(*args, 3913728, grid=grid(3913728))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/3q/c3qy34jaqk4nnzsuou7s5dwcbuoi5kctwalhcspkfthqj2nvhjzq.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_150
#
# NOTE(review): auto-generated TorchInductor benchmark harness; the Triton
# source is a runtime string literal — do not edit by hand.
# Elementwise fp32 -> fp16 cast of a (156, 1, 5, 5) tensor (3900 values);
# shape suggests a depthwise 5x5 conv weight, inferred from the harness
# arguments only.
triton_poi_fused__to_copy_121 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4096], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_121(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 3900
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
    arg_0 = rand_strided((156, 1, 5, 5), (25, 25, 5, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((156, 1, 5, 5), (25, 25, 5, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_121.run(*args, 3900, grid=grid(3900), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_121.benchmark_all_configs(*args, 3900, grid=grid(3900))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/d2/cd2noiyayhlfce4vrflvnr5n745imwlcczcywnnle3qtso27avlv.py
# Original ATen: aten.split_with_sizes
# aten.split_with_sizes => getitem_151
#
# NOTE(review): auto-generated TorchInductor benchmark harness; the Triton
# source is a runtime string literal — do not edit by hand.
# Fused channel-split + activation: reads the second 156-channel slice
# (per-batch offset 30576 = 156*196 out of stride 122304) of a
# (128, 624, 14, 14) fp16 tensor and applies x * sigmoid(x) (SiLU) in fp32
# before storing fp16. Companion to the offset-0 slice kernel above.
triton_poi_fused_split_with_sizes_122 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_split_with_sizes_122(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 3913728
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex % 30576
    x1 = (xindex // 30576)
    x2 = xindex
    tmp0 = tl.load(in_ptr0 + (30576 + x0 + (122304*x1)), None).to(tl.float32)
    tmp1 = tmp0.to(tl.float32)
    tmp2 = tl.sigmoid(tmp1)
    tmp3 = tmp1 * tmp2
    tmp4 = tmp3.to(tl.float32)
    tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp4, None)
def get_args():
    arg_0 = rand_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 156, 14, 14), (30576, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused_split_with_sizes_122.run(*args, 3913728, grid=grid(3913728), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused_split_with_sizes_122.benchmark_all_configs(*args, 3913728, grid=grid(3913728))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/on/conrh3mxhtsmg5nvtksxdme55y6bhfvwrrnpp6mhmwmulosn5uyv.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_151
#
# NOTE(review): TorchInductor-generated kernel; the string body is compiled at
# runtime and must stay unmodified.
# Elementwise fp32 -> fp16 cast of a contiguous (156, 1, 7, 7) tensor
# (7644 elements). Shape suggests a 7x7 depthwise-conv weight — not
# verifiable from this kernel alone. Tail elements are guarded by xmask.
triton_poi_fused__to_copy_123 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[8192], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_123(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 7644
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
    arg_0 = rand_strided((156, 1, 7, 7), (49, 49, 7, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((156, 1, 7, 7), (49, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_123.run(*args, 7644, grid=grid(7644), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_123.benchmark_all_configs(*args, 7644, grid=grid(7644))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/nc/cncwy3tqtyzgd3muyfflcfni7qls4ttqyb4sac6cmg6znll2bcqu.py
# Original ATen: aten.split_with_sizes
# aten.split_with_sizes => getitem_156
#
# NOTE(review): TorchInductor-generated kernel; the string body is compiled at
# runtime and must stay unmodified.
# Fused split+SiLU: same pattern as the other split_with_sizes kernels in this
# file but with source offset 61152 (the third 156-channel slice of each
# sample's 122304-element block in a (128, 624, 14, 14) fp16 tensor).
# Computes x * sigmoid(x) in fp32 and stores a contiguous (128, 156, 14, 14)
# fp16 result; mask=None on loads/stores (xmask computed but unused).
triton_poi_fused_split_with_sizes_124 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_split_with_sizes_124(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 3913728
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex % 30576
    x1 = (xindex // 30576)
    x2 = xindex
    tmp0 = tl.load(in_ptr0 + (61152 + x0 + (122304*x1)), None).to(tl.float32)
    tmp1 = tmp0.to(tl.float32)
    tmp2 = tl.sigmoid(tmp1)
    tmp3 = tmp1 * tmp2
    tmp4 = tmp3.to(tl.float32)
    tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp4, None)
def get_args():
    arg_0 = rand_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 156, 14, 14), (30576, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused_split_with_sizes_124.run(*args, 3913728, grid=grid(3913728), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused_split_with_sizes_124.benchmark_all_configs(*args, 3913728, grid=grid(3913728))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/yy/cyyxm3mybqiv5z5gofz4uhjttc6in5hwbrcbzt4xlp4z5lqyiqvr.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_152
#
# NOTE(review): TorchInductor-generated kernel; the string body is compiled at
# runtime and must stay unmodified.
# Elementwise fp32 -> fp16 cast of a contiguous (156, 1, 9, 9) tensor
# (12636 elements). Shape suggests a 9x9 depthwise-conv weight — not
# verifiable from this kernel alone. Tail elements are guarded by xmask.
triton_poi_fused__to_copy_125 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[16384], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_125(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 12636
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
    arg_0 = rand_strided((156, 1, 9, 9), (81, 81, 9, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((156, 1, 9, 9), (81, 81, 9, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_125.run(*args, 12636, grid=grid(12636), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_125.benchmark_all_configs(*args, 12636, grid=grid(12636))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/ta/ctar2nhvp2hnx6l3xel6gxwicmmw5nlhbt33l63xfadahaihh7qo.py
# Original ATen: aten.split_with_sizes
# aten.split_with_sizes => getitem_161
#
# NOTE(review): TorchInductor-generated kernel; the string body is compiled at
# runtime and must stay unmodified.
# Fused split+SiLU: same pattern as the other split_with_sizes kernels in this
# file but with source offset 91728 (the last 156-channel slice of each
# sample's 122304-element block in a (128, 624, 14, 14) fp16 tensor).
# Computes x * sigmoid(x) in fp32 and stores a contiguous (128, 156, 14, 14)
# fp16 result; mask=None on loads/stores (xmask computed but unused).
triton_poi_fused_split_with_sizes_126 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_split_with_sizes_126(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 3913728
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex % 30576
    x1 = (xindex // 30576)
    x2 = xindex
    tmp0 = tl.load(in_ptr0 + (91728 + x0 + (122304*x1)), None).to(tl.float32)
    tmp1 = tmp0.to(tl.float32)
    tmp2 = tl.sigmoid(tmp1)
    tmp3 = tmp1 * tmp2
    tmp4 = tmp3.to(tl.float32)
    tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp4, None)
def get_args():
    arg_0 = rand_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 156, 14, 14), (30576, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused_split_with_sizes_126.run(*args, 3913728, grid=grid(3913728), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused_split_with_sizes_126.benchmark_all_configs(*args, 3913728, grid=grid(3913728))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/aa/caaej5ipewhg7u3hng5mw3omnmn72k6eam7aoq4mbk5yqdeq7wwb.py
# Original ATen: aten.cat
# aten.cat => cat_17
#
# NOTE(review): TorchInductor-generated kernel; the string body is compiled at
# runtime and must stay unmodified.
# Concat writer: copies a contiguous (128, 156, 14, 14) fp16 tensor into the
# first 30576 elements of each sample's 122304-element block of a larger
# output buffer (per get_args, the output view has batch stride 122304, i.e.
# one 156-channel slice of a 624-channel concat result). mask=None on
# loads/stores; xmask is computed but unused.
triton_poi_fused_cat_127 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_cat_127(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 3913728
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x2 = xindex
    x0 = xindex % 30576
    x1 = (xindex // 30576)
    tmp0 = tl.load(in_ptr0 + (x2), None).to(tl.float32)
    tl.store(out_ptr0 + (x0 + (122304*x1) + tl.zeros([XBLOCK], tl.int32)), tmp0, None)
def get_args():
    arg_0 = rand_strided((128, 156, 14, 14), (30576, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 156, 14, 14), (122304, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused_cat_127.run(*args, 3913728, grid=grid(3913728), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused_cat_127.benchmark_all_configs(*args, 3913728, grid=grid(3913728))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/xm/cxmjpqggq2cokpfycochmecokolhxsamegk6vtmem3x47oz5ss3r.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.mean, aten.silu
# aten._native_batch_norm_legit_functional => add_131, add_134, convert_element_type_153, convert_element_type_154, mul_196, mul_202, rsqrt_25, sub_25, var_mean_25
# aten.mean => mean_5
# aten.silu => convert_element_type_155, convert_element_type_156, mul_203, sigmoid_21
#
# NOTE(review): TorchInductor-generated kernel; the string body is compiled at
# runtime and must stay unmodified.
# Persistent reduction over the 196 spatial positions of each (sample,
# channel) pair: xnumel = 79872 = 128 samples * 624 channels, rnumel = 196.
# Per channel x0 it subtracts in_ptr1, multiplies by
# rsqrt(in_ptr2 / 25088 + 1e-05), then applies scale in_ptr3 and shift
# in_ptr4 — i.e. batch-norm with in_ptr1 as mean and in_ptr2 presumably a
# variance numerator over the 25088 (= 128*196) reduction elements; confirm
# against the producer kernel. The normalized fp16 activation is written to
# out_ptr0 (128, 624, 14, 14) BEFORE SiLU; SiLU (x * sigmoid(x)) is then
# averaged over the 196 positions and stored to out_ptr2 (128, 624, 1, 1) —
# a global-average-pool of the activated map.
triton_per_fused__native_batch_norm_legit_functional_mean_silu_128 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import persistent_reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@persistent_reduction(
    size_hints=[131072, 256],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*fp16', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp16', 6: '*fp16', 7: 'i32', 8: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7), equal_to_1=())]}
)
@triton.jit
def triton_per_fused__native_batch_norm_legit_functional_mean_silu_128(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, out_ptr0, out_ptr2, xnumel, rnumel, XBLOCK : tl.constexpr):
    xnumel = 79872
    rnumel = 196
    RBLOCK: tl.constexpr = 256
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rindex = tl.arange(0, RBLOCK)[None, :]
    rmask = rindex < rnumel
    r2 = rindex
    x3 = xindex
    x0 = xindex % 624
    tmp0 = tl.load(in_ptr0 + (r2 + (196*x3)), rmask, other=0).to(tl.float32)
    tmp2 = tl.load(in_ptr1 + (x0), None)
    tmp4 = tl.load(in_ptr2 + (x0), None)
    tmp11 = tl.load(in_ptr3 + (x0), None)
    tmp13 = tl.load(in_ptr4 + (x0), None)
    tmp1 = tmp0.to(tl.float32)
    tmp3 = tmp1 - tmp2
    tmp5 = 25088.0
    tmp6 = tmp4 / tmp5
    tmp7 = 1e-05
    tmp8 = tmp6 + tmp7
    tmp9 = tl.math.rsqrt(tmp8)
    tmp10 = tmp3 * tmp9
    tmp12 = tmp10 * tmp11
    tmp14 = tmp12 + tmp13
    tmp15 = tmp14.to(tl.float32)
    tmp16 = tmp15.to(tl.float32)
    tmp17 = tl.sigmoid(tmp16)
    tmp18 = tmp16 * tmp17
    tmp19 = tmp18.to(tl.float32)
    tmp20 = tmp19.to(tl.float32)
    tmp22 = tl.where(rmask, tmp20, 0)
    tmp23 = tl.sum(tmp22, 1)[:, None]
    tmp24 = 196.0
    tmp25 = tmp23 / tmp24
    tmp26 = tmp25.to(tl.float32)
    tl.store(out_ptr0 + (r2 + (196*x3) + tl.zeros([XBLOCK, RBLOCK], tl.int32)), tmp15, rmask)
    tl.store(out_ptr2 + (x3 + tl.zeros([XBLOCK, 1], tl.int32)), tmp26, None)
def get_args():
    arg_0 = rand_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((1, 624, 1, 1), (624, 1, 624, 624), device='cuda:0', dtype=torch.float32)
    arg_2 = rand_strided((1, 624, 1, 1), (624, 1, 624, 624), device='cuda:0', dtype=torch.float32)
    arg_3 = rand_strided((624,), (1,), device='cuda:0', dtype=torch.float32)
    arg_4 = rand_strided((624,), (1,), device='cuda:0', dtype=torch.float32)
    arg_5 = rand_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_6 = rand_strided((128, 624, 1, 1), (624, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_per_fused__native_batch_norm_legit_functional_mean_silu_128.run(*args, 79872, 196, grid=grid(79872), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_per_fused__native_batch_norm_legit_functional_mean_silu_128.benchmark_all_configs(*args, 79872, 196, grid=grid(79872))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/5n/c5nuz3wivd2kux327s5ffdifgf3i65ukistqypmzli4ipcs5fhup.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_158
#
# NOTE(review): TorchInductor-generated kernel; the string body is compiled at
# runtime and must stay unmodified.
# Elementwise fp32 -> fp16 cast of a contiguous (26, 624, 1, 1) tensor
# (16224 elements). Shape suggests a 1x1 channel-reduction conv weight —
# not verifiable from this kernel alone. Tail elements guarded by xmask.
triton_poi_fused__to_copy_129 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[16384], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_129(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 16224
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
    arg_0 = rand_strided((26, 624, 1, 1), (624, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((26, 624, 1, 1), (624, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_129.run(*args, 16224, grid=grid(16224), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_129.benchmark_all_configs(*args, 16224, grid=grid(16224))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/7u/c7uwxkw3ipfjfdehvzykp4imfsjz53vfoikfvt4ys7uwsknqbbb2.py
# Original ATen: aten._to_copy, aten.convolution
# aten._to_copy => convert_element_type_157
# aten.convolution => convolution_60
#
# NOTE(review): TorchInductor-generated kernel; the string body is compiled at
# runtime and must stay unmodified.
# Elementwise fp32 -> fp16 cast of a 26-element vector; per the fused-op
# names, this prepares a convolution bias. Tail elements guarded by xmask.
triton_poi_fused__to_copy_convolution_130 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[32], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_convolution_130(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 26
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
    arg_0 = rand_strided((26,), (1,), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((26,), (1,), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_convolution_130.run(*args, 26, grid=grid(26), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_convolution_130.benchmark_all_configs(*args, 26, grid=grid(26))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/qj/cqjs4oas7ss654pge5hzaweuxapq6ge5y5fvo2ygxf7r6qylld6m.py
# Original ATen: aten._to_copy, aten.convolution, aten.silu
# aten._to_copy => convert_element_type_157
# aten.convolution => convolution_60
# aten.silu => convert_element_type_159, convert_element_type_160, mul_204, sigmoid_22
#
# NOTE(review): TorchInductor-generated kernel; the string body is compiled at
# runtime and must stay unmodified.
# Bias-add + SiLU epilogue over (128, 26, 1, 1) fp16 (xnumel 3328 = 128*26):
# adds the 26-element bias to in_out_ptr0 IN PLACE (mutated_arg_names) and
# writes SiLU (x * sigmoid(x), computed in fp32) of the biased value to
# out_ptr0, keeping the pre-activation available for later use.
# Note num_in_out_args=1 in the byte-count below, accounting for the
# read+write of the in/out buffer.
triton_poi_fused__to_copy_convolution_silu_131 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4096], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_convolution_silu_131(in_out_ptr0, in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 3328
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x2 = xindex
    x0 = xindex % 26
    tmp0 = tl.load(in_out_ptr0 + (x2), xmask).to(tl.float32)
    tmp1 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
    tmp2 = tmp0 + tmp1
    tmp3 = tmp2.to(tl.float32)
    tmp4 = tl.sigmoid(tmp3)
    tmp5 = tmp3 * tmp4
    tmp6 = tmp5.to(tl.float32)
    tl.store(in_out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp2, xmask)
    tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp6, xmask)
def get_args():
    arg_0 = rand_strided((128, 26, 1, 1), (26, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((26,), (1,), device='cuda:0', dtype=torch.float16)
    arg_2 = rand_strided((128, 26, 1, 1), (26, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1, arg_2,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_convolution_silu_131.run(*args, 3328, grid=grid(3328), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_convolution_silu_131.benchmark_all_configs(*args, 3328, grid=grid(3328))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/af/cafnwdqexuqbr3u7uonjmdp2h6gms65ppr5ersr7levxxzbxsags.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_162
#
# NOTE(review): TorchInductor-generated kernel; the string body is compiled at
# runtime and must stay unmodified.
# Elementwise fp32 -> fp16 cast of a contiguous (624, 26, 1, 1) tensor
# (16224 elements). Shape suggests a 1x1 channel-expansion conv weight —
# not verifiable from this kernel alone. Tail elements guarded by xmask.
triton_poi_fused__to_copy_132 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[16384], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_132(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 16224
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
    arg_0 = rand_strided((624, 26, 1, 1), (26, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((624, 26, 1, 1), (26, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_132.run(*args, 16224, grid=grid(16224), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_132.benchmark_all_configs(*args, 16224, grid=grid(16224))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/qt/cqtzzxfgsd6elacosxssusc2qroqxqbum3szv62gh4gr6s5vzwmp.py
# Original ATen: aten._to_copy, aten.convolution
# aten._to_copy => convert_element_type_161
# aten.convolution => convolution_61
#
# NOTE(review): TorchInductor-generated kernel; the string body is compiled at
# runtime and must stay unmodified.
# Elementwise fp32 -> fp16 cast of a 624-element vector; per the fused-op
# names, this prepares a convolution bias. Tail elements guarded by xmask.
triton_poi_fused__to_copy_convolution_133 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[1024], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_convolution_133(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 624
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
    arg_0 = rand_strided((624,), (1,), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((624,), (1,), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_convolution_133.run(*args, 624, grid=grid(624), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_convolution_133.benchmark_all_configs(*args, 624, grid=grid(624))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/hd/chd7xmae5qozjrly7porkx7hggrm6fh6klnsko3rzc52ohjqiyou.py
# Original ATen: aten._to_copy, aten.convolution
# aten._to_copy => convert_element_type_161
# aten.convolution => convolution_61
#
# In-place convolution bias epilogue: adds a per-channel fp16 bias vector
# (in_ptr0, 624 values) onto the fp16 conv output in in_out_ptr0, which is a
# contiguous (128, 624, 1, 1) tensor — xnumel = 79872 = 128 * 624.
# x0 = xindex % 624 picks the bias element for each output position.
# NOTE(review): loads/stores pass mask=None rather than xmask — presumably the
# launcher only chooses block sizes for which masking is unnecessary; confirm
# against torch._inductor.triton_heuristics.
triton_poi_fused__to_copy_convolution_134 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[131072], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_convolution_134(in_out_ptr0, in_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 79872
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x2 = xindex
    x0 = xindex % 624
    tmp0 = tl.load(in_out_ptr0 + (x2), None).to(tl.float32)
    tmp1 = tl.load(in_ptr0 + (x0), None).to(tl.float32)
    tmp2 = tmp0 + tmp1
    tl.store(in_out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp2, None)


def get_args():
    arg_0 = rand_strided((128, 624, 1, 1), (624, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((624,), (1,), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_convolution_134.run(*args, 79872, grid=grid(79872), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_convolution_134.benchmark_all_configs(*args, 79872, grid=grid(79872))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/mn/cmnvsfbxocmtd6hcxhi5n75ltfyzsnegxzpgv7o57n6ksfhmrkay.py
# Original ATen: aten.mul, aten.sigmoid, aten.silu
# aten.mul => mul_205
# aten.sigmoid => sigmoid_23
# aten.silu => convert_element_type_155, convert_element_type_156, mul_203, sigmoid_21
#
# Fused squeeze-excite style gating: out = silu(x) * sigmoid(gate), where x is
# a contiguous fp16 (128, 624, 14, 14) tensor (xnumel = 15654912) and gate is a
# per-(batch, channel) fp16 (128, 624, 1, 1) tensor. x1 = xindex // 196
# (196 = 14*14) maps each spatial position to its gate element. The silu input
# is cast via .to(tl.float32) before sigmoid and back afterwards.
triton_poi_fused_mul_sigmoid_silu_135 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[16777216], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3), equal_to_1=())]})
@triton.jit
def triton_poi_fused_mul_sigmoid_silu_135(in_ptr0, in_ptr1, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 15654912
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x2 = xindex
    x1 = (xindex // 196)
    tmp0 = tl.load(in_ptr0 + (x2), None).to(tl.float32)
    tmp5 = tl.load(in_ptr1 + (x1), None).to(tl.float32)
    tmp1 = tmp0.to(tl.float32)
    tmp2 = tl.sigmoid(tmp1)
    tmp3 = tmp1 * tmp2
    tmp4 = tmp3.to(tl.float32)
    tmp6 = tl.sigmoid(tmp5)
    tmp7 = tmp4 * tmp6
    tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp7, None)


def get_args():
    arg_0 = rand_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 624, 1, 1), (624, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    arg_2 = rand_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1, arg_2,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused_mul_sigmoid_silu_135.run(*args, 15654912, grid=grid(15654912), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused_mul_sigmoid_silu_135.benchmark_all_configs(*args, 15654912, grid=grid(15654912))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/x6/cx65e4iuztiipxmj3kt6pdmtbqk6eud6jbchpllpc6tinjqjiko7.py
# Original ATen: aten.split_with_sizes
# aten.split_with_sizes => getitem_164
#
# Channel split, first half: copies channels [0, 312) of a contiguous fp16
# (128, 624, 14, 14) input into a contiguous (128, 312, 14, 14) output.
# Per batch, 61152 = 312*196 elements are read at batch stride 122304
# (= 624*196); x1 is the batch index, x0 the offset within the half.
triton_poi_fused_split_with_sizes_136 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[8388608], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_split_with_sizes_136(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 7827456
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex % 61152
    x1 = (xindex // 61152)
    x2 = xindex
    tmp0 = tl.load(in_ptr0 + (x0 + (122304*x1)), None).to(tl.float32)
    tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp0, None)


def get_args():
    arg_0 = rand_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 312, 14, 14), (61152, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused_split_with_sizes_136.run(*args, 7827456, grid=grid(7827456), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused_split_with_sizes_136.benchmark_all_configs(*args, 7827456, grid=grid(7827456))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/la/cla6pjsd7zhhqofhjak5ak67vzu4puphrosszybe7he2qayusdf7.py
# Original ATen: aten.split_with_sizes
# aten.split_with_sizes => getitem_165
#
# Channel split, second half: identical to _136 except the load is offset by
# 61152 (= 312*196) within each batch, i.e. it copies channels [312, 624) of
# the fp16 (128, 624, 14, 14) input into a contiguous (128, 312, 14, 14)
# output.
triton_poi_fused_split_with_sizes_137 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[8388608], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_split_with_sizes_137(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 7827456
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex % 61152
    x1 = (xindex // 61152)
    x2 = xindex
    tmp0 = tl.load(in_ptr0 + (61152 + x0 + (122304*x1)), None).to(tl.float32)
    tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp0, None)


def get_args():
    arg_0 = rand_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 312, 14, 14), (61152, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused_split_with_sizes_137.run(*args, 7827456, grid=grid(7827456), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused_split_with_sizes_137.benchmark_all_configs(*args, 7827456, grid=grid(7827456))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/hz/chz3fw3o5fqe2cnzuiuggnyrnh3xzhhrw266gsgpwo3xgo4buev6.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_163
#
# Dtype cast: copies a contiguous fp32 (52, 312, 1, 1) weight tensor
# (xnumel = 16224 elements) to fp16. Here xmask IS used, since 16224 need not
# divide the launch block size.
triton_poi_fused__to_copy_138 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[16384], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_138(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 16224
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)


def get_args():
    arg_0 = rand_strided((52, 312, 1, 1), (312, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((52, 312, 1, 1), (312, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_138.run(*args, 16224, grid=grid(16224), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_138.benchmark_all_configs(*args, 16224, grid=grid(16224))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/vk/cvkwpn3gas36vom5boko6m7yjnsu4h3jn3s2dj74g4puyptio4dc.py
# Original ATen: aten.cat
# aten.cat => cat_18
#
# Concatenation copy: writes a contiguous fp16 (128, 52, 14, 14) input
# (per-batch size 10192 = 52*196) into a 52-channel slice of a cat output
# whose per-batch stride is 20384 (= 104*196) — i.e. one half of a
# channel-wise cat producing (128, 104, 14, 14). x1 is the batch index,
# x0 the offset within the slice.
triton_poi_fused_cat_139 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[2097152], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_cat_139(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 1304576
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x2 = xindex
    x0 = xindex % 10192
    x1 = (xindex // 10192)
    tmp0 = tl.load(in_ptr0 + (x2), None).to(tl.float32)
    tl.store(out_ptr0 + (x0 + (20384*x1) + tl.zeros([XBLOCK], tl.int32)), tmp0, None)


def get_args():
    arg_0 = rand_strided((128, 52, 14, 14), (10192, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 52, 14, 14), (20384, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused_cat_139.run(*args, 1304576, grid=grid(1304576), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused_cat_139.benchmark_all_configs(*args, 1304576, grid=grid(1304576))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/qp/cqptss63kr7w22hqbaitci52teddi4haqxu6zws6bzlk7k3r4p5w.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.add
# aten._native_batch_norm_legit_functional => add_136, add_139, convert_element_type_165, convert_element_type_166, mul_206, mul_212, rsqrt_26, sub_26, var_mean_26
# aten.add => add_140
#
# Fused batch-norm apply + residual add over a (128, 104, 14, 14) fp16 tensor
# (xnumel = 2609152). Per channel x1: subtract mean (in_ptr1), scale by
# rsqrt(var_sum / 25088 + 1e-5) — 25088 = 128*196 is the per-channel reduction
# size, so in_ptr2 holds the (biased) variance numerator — then apply weight
# (in_ptr3) and bias (in_ptr4) in fp32, cast back to fp16, and add the
# residual in_ptr5.
triton_poi_fused__native_batch_norm_legit_functional_add_140 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp16', 6: '*fp16', 7: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7), equal_to_1=())]})
@triton.jit
def triton_poi_fused__native_batch_norm_legit_functional_add_140(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 2609152
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x3 = xindex
    x1 = (xindex // 196) % 104
    tmp0 = tl.load(in_ptr0 + (x3), None).to(tl.float32)
    tmp2 = tl.load(in_ptr1 + (x1), None)
    tmp4 = tl.load(in_ptr2 + (x1), None)
    tmp11 = tl.load(in_ptr3 + (x1), None)
    tmp13 = tl.load(in_ptr4 + (x1), None)
    tmp16 = tl.load(in_ptr5 + (x3), None).to(tl.float32)
    tmp1 = tmp0.to(tl.float32)
    tmp3 = tmp1 - tmp2
    tmp5 = 25088.0
    tmp6 = tmp4 / tmp5
    tmp7 = 1e-05
    tmp8 = tmp6 + tmp7
    tmp9 = tl.math.rsqrt(tmp8)
    tmp10 = tmp3 * tmp9
    tmp12 = tmp10 * tmp11
    tmp14 = tmp12 + tmp13
    tmp15 = tmp14.to(tl.float32)
    tmp17 = tmp15 + tmp16
    tl.store(out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp17, None)


def get_args():
    arg_0 = rand_strided((128, 104, 14, 14), (20384, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((1, 104, 1, 1), (104, 1, 104, 104), device='cuda:0', dtype=torch.float32)
    arg_2 = rand_strided((1, 104, 1, 1), (104, 1, 104, 104), device='cuda:0', dtype=torch.float32)
    arg_3 = rand_strided((104,), (1,), device='cuda:0', dtype=torch.float32)
    arg_4 = rand_strided((104,), (1,), device='cuda:0', dtype=torch.float32)
    arg_5 = rand_strided((128, 104, 14, 14), (20384, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_6 = rand_strided((128, 104, 14, 14), (20384, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__native_batch_norm_legit_functional_add_140.run(*args, 2609152, grid=grid(2609152), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__native_batch_norm_legit_functional_add_140.benchmark_all_configs(*args, 2609152, grid=grid(2609152))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/zf/czfuu6dquzuudpwuu3bwtsd76hxju2ps43qrqh5npmmaayzjihi2.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_215
#
# Dtype cast: copies a contiguous fp32 (624, 104, 1, 1) weight tensor
# (xnumel = 64896 elements) to fp16, masked with xmask.
triton_poi_fused__to_copy_141 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[65536], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_141(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 64896
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)


def get_args():
    arg_0 = rand_strided((624, 104, 1, 1), (104, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((624, 104, 1, 1), (104, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_141.run(*args, 64896, grid=grid(64896), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_141.benchmark_all_configs(*args, 64896, grid=grid(64896))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/hj/chjvw2ozl3ipjwmejxog5ggykkaa6pfst7j432f3sbhnt2kvth4p.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.add, aten.clone, aten.fill, aten.mul, aten.sigmoid, aten.silu, aten.sub
# aten._native_batch_norm_legit_functional => add_174, add_177, convert_element_type_216, convert_element_type_217, mul_263, mul_269, rsqrt_33, sub_33, var_mean_33
# aten.add => add_341
# aten.clone => clone_24
# aten.fill => full_like_23
# aten.mul => mul_787, mul_788
# aten.sigmoid => sigmoid_87
# aten.silu => convert_element_type_218, convert_element_type_219, mul_270, sigmoid_32
# aten.sub => sub_185
#
# Fused batch-norm apply + SiLU over a (128, 624, 14, 14) fp16 tensor
# (xnumel = 15654912). Per channel x1: normalize with mean (in_ptr1),
# rsqrt(var_sum / 25088 + 1e-5) (25088 = 128*196), weight (in_ptr3) and bias
# (in_ptr4). Two outputs:
#   out_ptr1 = silu(bn) = bn * sigmoid(bn)           (forward activation)
#   out_ptr2 = sigmoid(bn) * (1 + bn * (1 - sigmoid(bn)))
# The second is the derivative of silu evaluated at bn — presumably saved for
# the recomputed backward pass (the add/fill/sub ops in the fusion list);
# confirm against the generated backward graph.
triton_poi_fused__native_batch_norm_legit_functional_add_clone_fill_mul_sigmoid_silu_sub_142 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[16777216], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp16', 6: '*fp16', 7: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7), equal_to_1=())]})
@triton.jit
def triton_poi_fused__native_batch_norm_legit_functional_add_clone_fill_mul_sigmoid_silu_sub_142(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, out_ptr1, out_ptr2, xnumel, XBLOCK : tl.constexpr):
    xnumel = 15654912
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x3 = xindex
    x1 = (xindex // 196) % 624
    tmp0 = tl.load(in_ptr0 + (x3), None).to(tl.float32)
    tmp2 = tl.load(in_ptr1 + (x1), None)
    tmp4 = tl.load(in_ptr2 + (x1), None)
    tmp11 = tl.load(in_ptr3 + (x1), None)
    tmp13 = tl.load(in_ptr4 + (x1), None)
    tmp1 = tmp0.to(tl.float32)
    tmp3 = tmp1 - tmp2
    tmp5 = 25088.0
    tmp6 = tmp4 / tmp5
    tmp7 = 1e-05
    tmp8 = tmp6 + tmp7
    tmp9 = tl.math.rsqrt(tmp8)
    tmp10 = tmp3 * tmp9
    tmp12 = tmp10 * tmp11
    tmp14 = tmp12 + tmp13
    tmp15 = tmp14.to(tl.float32)
    tmp16 = tmp15.to(tl.float32)
    tmp17 = tl.sigmoid(tmp16)
    tmp18 = tmp16 * tmp17
    tmp19 = tmp18.to(tl.float32)
    tmp20 = tl.sigmoid(tmp15)
    tmp21 = 1.0
    tmp22 = tmp21 - tmp20
    tmp23 = tmp15 * tmp22
    tmp24 = tmp23 + tmp21
    tmp25 = tmp20 * tmp24
    tl.store(out_ptr1 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp19, None)
    tl.store(out_ptr2 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp25, None)


def get_args():
    arg_0 = rand_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((1, 624, 1, 1), (624, 1, 624, 624), device='cuda:0', dtype=torch.float32)
    arg_2 = rand_strided((1, 624, 1, 1), (624, 1, 624, 624), device='cuda:0', dtype=torch.float32)
    arg_3 = rand_strided((624,), (1,), device='cuda:0', dtype=torch.float32)
    arg_4 = rand_strided((624,), (1,), device='cuda:0', dtype=torch.float32)
    arg_5 = rand_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_6 = rand_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__native_batch_norm_legit_functional_add_clone_fill_mul_sigmoid_silu_sub_142.run(*args, 15654912, grid=grid(15654912), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__native_batch_norm_legit_functional_add_clone_fill_mul_sigmoid_silu_sub_142.benchmark_all_configs(*args, 15654912, grid=grid(15654912))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/by/cbyy7pyyeky2s6spdugmwrwqdvrjfdr7qwcxwapwzhywwhng5ujh.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_220
#
# Dtype cast: copies a contiguous fp32 (624, 1, 3, 3) depthwise-conv weight
# tensor (xnumel = 5616 elements) to fp16, masked with xmask.
triton_poi_fused__to_copy_143 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[8192], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_143(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 5616
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)


def get_args():
    arg_0 = rand_strided((624, 1, 3, 3), (9, 9, 3, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((624, 1, 3, 3), (9, 9, 3, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_143.run(*args, 5616, grid=grid(5616), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_143.benchmark_all_configs(*args, 5616, grid=grid(5616))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/ji/cjiljcgh2wc2yighvqz76tkwzti3zjjxpno5dng2ammsgvfhwvv7.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_226
#
# Dtype cast: copies a contiguous fp32 (52, 624, 1, 1) weight tensor
# (xnumel = 32448 elements) to fp16, masked with xmask.
triton_poi_fused__to_copy_144 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[32768], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_144(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 32448
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)


def get_args():
    arg_0 = rand_strided((52, 624, 1, 1), (624, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((52, 624, 1, 1), (624, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_144.run(*args, 32448, grid=grid(32448), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_144.benchmark_all_configs(*args, 32448, grid=grid(32448))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/tv/ctvzn4xpxjd5g7oyd7nntvpnuyovmntivt2fyxf53c3mblh6dshr.py
# Original ATen: aten._to_copy, aten.convolution
# aten._to_copy => convert_element_type_225
# aten.convolution => convolution_86
#
# Pointwise fp32 -> fp16 cast of a 52-element vector (xnumel = 52); per the
# fused-op names this is the dtype conversion feeding convolution_86
# (presumably its bias — TODO confirm against the caller).
# NOTE(review): the kernel source below is a verbatim string compiled by
# Triton and used as an Inductor cache key — do not edit its bytes.
triton_poi_fused__to_copy_convolution_145 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[64], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_convolution_145(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 52
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
    arg_0 = rand_strided((52,), (1,), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((52,), (1,), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_convolution_145.run(*args, 52, grid=grid(52), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_convolution_145.benchmark_all_configs(*args, 52, grid=grid(52))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/sf/csfcbn5kzqvvv6krkr7772b7z66pr6xsuxyhtbgglpb26gkfc5h3.py
# Original ATen: aten._to_copy, aten.convolution, aten.silu
# aten._to_copy => convert_element_type_225
# aten.convolution => convolution_86
# aten.silu => convert_element_type_227, convert_element_type_228, mul_279, sigmoid_34
#
# Fused epilogue over a (128, 52, 1, 1) fp16 tensor (xnumel = 6656 = 128*52):
# adds a per-channel 52-element fp16 bias in place into in_out_ptr0, then
# writes SiLU(x) = x * sigmoid(x) to out_ptr0 (sigmoid computed after an
# explicit .to(tl.float32) round-trip, matching the ATen convert_element_type
# nodes). Mutates in_out_ptr0.
triton_poi_fused__to_copy_convolution_silu_146 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[8192], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_convolution_silu_146(in_out_ptr0, in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 6656
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x2 = xindex
    x0 = xindex % 52
    tmp0 = tl.load(in_out_ptr0 + (x2), xmask).to(tl.float32)
    tmp1 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
    tmp2 = tmp0 + tmp1
    tmp3 = tmp2.to(tl.float32)
    tmp4 = tl.sigmoid(tmp3)
    tmp5 = tmp3 * tmp4
    tmp6 = tmp5.to(tl.float32)
    tl.store(in_out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp2, xmask)
    tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp6, xmask)
def get_args():
    arg_0 = rand_strided((128, 52, 1, 1), (52, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((52,), (1,), device='cuda:0', dtype=torch.float16)
    arg_2 = rand_strided((128, 52, 1, 1), (52, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1, arg_2,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_convolution_silu_146.run(*args, 6656, grid=grid(6656), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_convolution_silu_146.benchmark_all_configs(*args, 6656, grid=grid(6656))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/ik/cikbdagjlpyedk3af4it4ary3yx6g4gpfifz25vxc7r25q224d43.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_230
#
# Pointwise fp32 -> fp16 cast of a contiguous (624, 52, 1, 1) tensor
# (xnumel = 32448 = 624*52); presumably a conv weight being downcast for a
# mixed-precision convolution — confirm against the caller.
triton_poi_fused__to_copy_147 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[32768], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_147(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 32448
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
    arg_0 = rand_strided((624, 52, 1, 1), (52, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((624, 52, 1, 1), (52, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_147.run(*args, 32448, grid=grid(32448), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_147.benchmark_all_configs(*args, 32448, grid=grid(32448))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/xb/cxb5poqew2wpvurnixpesahtvwzfiadakefnqud2hzokvcgnemog.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_231
#
# Pointwise fp32 -> fp16 cast of a contiguous (160, 624, 1, 1) tensor
# (xnumel = 99840 = 160*624); presumably the next conv's weight — confirm
# against the caller.
triton_poi_fused__to_copy_148 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[131072], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_148(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 99840
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
    arg_0 = rand_strided((160, 624, 1, 1), (624, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((160, 624, 1, 1), (624, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_148.run(*args, 99840, grid=grid(99840), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_148.benchmark_all_configs(*args, 99840, grid=grid(99840))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/z7/cz7tf27fdbr7sfmkjktkou2imnr2mnlt7hicdhvs4chclhrw6j37.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => convert_element_type_232, var_mean_35
#
# Batch-norm statistics, stage 1 of 2 (split reduction): for a
# (128, 160, 14, 14) fp16 input, each of the xnumel = 640 = 160 channels * 4
# splits accumulates an fp32 partial sum over rnumel = 6272 = 25088/4 of the
# channel's 25088 (= 128*14*14) elements. Partial sums land in a
# (1, 160, 1, 1, 4) fp32 buffer; the trailing-4 axis is reduced by the
# stage-2 kernel (..._150 below).
triton_red_fused__native_batch_norm_legit_functional_149 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@reduction(
    size_hints=[1024, 8192],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32', 3: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_149(in_ptr0, out_ptr0, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
    xnumel = 640
    rnumel = 6272
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rbase = tl.arange(0, RBLOCK)[None, :]
    x0 = xindex % 160
    x1 = (xindex // 160)
    _tmp2 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
    x3 = xindex
    for roffset in range(0, rnumel, RBLOCK):
        rindex = roffset + rbase
        rmask = rindex < rnumel
        r2 = rindex
        tmp0 = tl.load(in_ptr0 + ((196*x0) + (31360*(r2 // 196)) + (1003520*x1) + (r2 % 196)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp1 = tmp0.to(tl.float32)
        _tmp2 = tl.where(rmask & xmask, _tmp2 + tmp1, _tmp2)
    tmp2 = tl.sum(_tmp2, 1)[:, None]
    tl.store(out_ptr0 + x3, tmp2, xmask)
def get_args():
    arg_0 = rand_strided((128, 160, 14, 14), (31360, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((1, 160, 1, 1, 4), (640, 1, 640, 640, 160), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_red_fused__native_batch_norm_legit_functional_149.run(*args, 640, 6272, grid=grid(640), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_red_fused__native_batch_norm_legit_functional_149.benchmark_all_configs(*args, 640, 6272, grid=grid(640))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/hi/chiuub4omrsfimb4p75xk4vlbwv5mhljjinvhtfc3kqhmfnsyq4q.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => add_185, convert_element_type_232, mul_282, mul_283, var_mean_35
#
# Batch-norm statistics, stage 2 (persistent reduction over the 4 partial
# sums written by ..._149): per channel (xnumel = 160), sums the 4 partials,
# divides by 25088 (elements per channel) to get the batch mean (stored in
# place into in_out_ptr0), and writes the running-mean update
# 0.1 * mean + 0.9 * running_mean to out_ptr0 (momentum 0.1).
triton_per_fused__native_batch_norm_legit_functional_150 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import persistent_reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@persistent_reduction(
    size_hints=[256, 4],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*fp32', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: 'i32', 5: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4), equal_to_1=())]}
)
@triton.jit
def triton_per_fused__native_batch_norm_legit_functional_150(in_out_ptr0, in_ptr0, in_ptr1, out_ptr0, xnumel, rnumel, XBLOCK : tl.constexpr):
    xnumel = 160
    rnumel = 4
    RBLOCK: tl.constexpr = 4
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rindex = tl.arange(0, RBLOCK)[None, :]
    rmask = rindex < rnumel
    r1 = rindex
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0 + (160*r1)), rmask & xmask, other=0)
    tmp8 = tl.load(in_ptr1 + (x0), xmask)
    tmp2 = tl.where(rmask & xmask, tmp0, 0)
    tmp3 = tl.sum(tmp2, 1)[:, None]
    tmp4 = 25088.0
    tmp5 = tmp3 / tmp4
    tmp6 = 0.1
    tmp7 = tmp5 * tmp6
    tmp9 = 0.9
    tmp10 = tmp8 * tmp9
    tmp11 = tmp7 + tmp10
    tl.store(in_out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp5, xmask)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp11, xmask)
def get_args():
    arg_0 = rand_strided((1, 160, 1, 1), (160, 1, 160, 160), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((1, 160, 1, 1, 4), (640, 1, 640, 640, 160), device='cuda:0', dtype=torch.float32)
    arg_2 = rand_strided((160,), (1,), device='cuda:0', dtype=torch.float32)
    arg_3 = rand_strided((160,), (1,), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1, arg_2, arg_3,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_per_fused__native_batch_norm_legit_functional_150.run(*args, 160, 4, grid=grid(160), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_per_fused__native_batch_norm_legit_functional_150.benchmark_all_configs(*args, 160, 4, grid=grid(160))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/4b/c4bvtzglf3eflssbrfazd6mjdgmrhoujx4xq6fve2tfmkwyeseru.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => convert_element_type_232, var_mean_35
#
# Batch-norm statistics, variance stage 1 (split reduction, same layout as
# ..._149): given the per-channel mean from ..._150 (in_ptr1), accumulates
# partial sums of (x - mean)^2 in fp32 for each of the 640 = 160 channels * 4
# splits, rnumel = 6272 elements per split. Output goes to a
# (1, 160, 1, 1, 4) buffer reduced by ..._152 below.
triton_red_fused__native_batch_norm_legit_functional_151 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@reduction(
    size_hints=[1024, 8192],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*fp16', 1: '*fp32', 2: '*fp32', 3: 'i32', 4: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_151(in_ptr0, in_ptr1, out_ptr0, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
    xnumel = 640
    rnumel = 6272
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rbase = tl.arange(0, RBLOCK)[None, :]
    x0 = xindex % 160
    x1 = (xindex // 160)
    tmp2 = tl.load(in_ptr1 + (x0), xmask)
    _tmp5 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
    x3 = xindex
    for roffset in range(0, rnumel, RBLOCK):
        rindex = roffset + rbase
        rmask = rindex < rnumel
        r2 = rindex
        tmp0 = tl.load(in_ptr0 + ((196*x0) + (31360*(r2 // 196)) + (1003520*x1) + (r2 % 196)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp1 = tmp0.to(tl.float32)
        tmp3 = tmp1 - tmp2
        tmp4 = tmp3 * tmp3
        _tmp5 = tl.where(rmask & xmask, _tmp5 + tmp4, _tmp5)
    tmp5 = tl.sum(_tmp5, 1)[:, None]
    tl.store(out_ptr0 + x3, tmp5, xmask)
def get_args():
    arg_0 = rand_strided((128, 160, 14, 14), (31360, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((1, 160, 1, 1), (160, 1, 160, 160), device='cuda:0', dtype=torch.float32)
    arg_2 = rand_strided((1, 160, 1, 1, 4), (640, 1, 640, 640, 160), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1, arg_2,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_red_fused__native_batch_norm_legit_functional_151.run(*args, 640, 6272, grid=grid(640), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_red_fused__native_batch_norm_legit_functional_151.benchmark_all_configs(*args, 640, 6272, grid=grid(640))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/vo/cvo4hmhbgzgrpmknfykmtsywf2s2dd4n7xkbf6ll6hxoj5qx3zny.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => add_184, add_186, convert_element_type_232, mul_284, mul_285, mul_286, rsqrt_35, squeeze_106, var_mean_35
#
# Batch-norm statistics, variance stage 2 (persistent reduction over the 4
# partial squared-deviation sums from ..._151): per channel (xnumel = 160),
# sums partials into the raw sum (out_ptr0), emits
# rsqrt(var + 1e-05) (out_ptr1, biased var = sum/25088), and the running-var
# update 0.1 * var_unbiased + 0.9 * running_var (out_ptr2).
# NOTE(review): the constant 1.0000398612827361 matches 25088/25087, i.e. the
# unbiased-variance (Bessel) correction — verify if this ever changes.
triton_per_fused__native_batch_norm_legit_functional_152 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import persistent_reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@persistent_reduction(
    size_hints=[256, 4],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*fp32', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: 'i32', 6: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5), equal_to_1=())]}
)
@triton.jit
def triton_per_fused__native_batch_norm_legit_functional_152(in_ptr0, in_ptr1, out_ptr0, out_ptr1, out_ptr2, xnumel, rnumel, XBLOCK : tl.constexpr):
    xnumel = 160
    rnumel = 4
    RBLOCK: tl.constexpr = 4
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rindex = tl.arange(0, RBLOCK)[None, :]
    rmask = rindex < rnumel
    r1 = rindex
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0 + (160*r1)), rmask & xmask, other=0)
    tmp13 = tl.load(in_ptr1 + (x0), xmask)
    tmp2 = tl.where(rmask & xmask, tmp0, 0)
    tmp3 = tl.sum(tmp2, 1)[:, None]
    tmp4 = 25088.0
    tmp5 = tmp3 / tmp4
    tmp6 = 1e-05
    tmp7 = tmp5 + tmp6
    tmp8 = tl.math.rsqrt(tmp7)
    tmp9 = 1.0000398612827361
    tmp10 = tmp5 * tmp9
    tmp11 = 0.1
    tmp12 = tmp10 * tmp11
    tmp14 = 0.9
    tmp15 = tmp13 * tmp14
    tmp16 = tmp12 + tmp15
    tl.store(out_ptr1 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp8, xmask)
    tl.store(out_ptr2 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp16, xmask)
    tl.store(out_ptr0 + x0, tmp3, xmask)
def get_args():
    arg_0 = rand_strided((1, 160, 1, 1, 4), (640, 1, 640, 640, 160), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((160,), (1,), device='cuda:0', dtype=torch.float32)
    arg_2 = rand_strided((1, 160, 1, 1), (160, 1, 160, 160), device='cuda:0', dtype=torch.float32)
    arg_3 = rand_strided((160,), (1,), device='cuda:0', dtype=torch.float32)
    arg_4 = rand_strided((160,), (1,), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1, arg_2, arg_3, arg_4,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_per_fused__native_batch_norm_legit_functional_152.run(*args, 160, 4, grid=grid(160), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_per_fused__native_batch_norm_legit_functional_152.benchmark_all_configs(*args, 160, 4, grid=grid(160))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/7r/c7rznphkbqtbupvosh2bugs6qz6cx4gcbzlpwzvb44okdkqoc7am.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => add_184, add_187, convert_element_type_232, convert_element_type_233, mul_281, mul_287, rsqrt_35, sub_35, var_mean_35
#
# Batch-norm apply step over the full (128, 160, 14, 14) tensor
# (xnumel = 4014080 = 128*160*196): per channel x1 computes
# (x - mean) * rsqrt(sum_sq/25088 + 1e-05) * weight + bias in fp32, then
# casts the result back to fp16. Loads/stores use mask=None because xnumel
# is known to divide evenly into the launch grid (divisible_by_16 config).
triton_poi_fused__native_batch_norm_legit_functional_153 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp16', 6: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6), equal_to_1=())]})
@triton.jit
def triton_poi_fused__native_batch_norm_legit_functional_153(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 4014080
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x3 = xindex
    x1 = (xindex // 196) % 160
    tmp0 = tl.load(in_ptr0 + (x3), None).to(tl.float32)
    tmp2 = tl.load(in_ptr1 + (x1), None)
    tmp4 = tl.load(in_ptr2 + (x1), None)
    tmp11 = tl.load(in_ptr3 + (x1), None)
    tmp13 = tl.load(in_ptr4 + (x1), None)
    tmp1 = tmp0.to(tl.float32)
    tmp3 = tmp1 - tmp2
    tmp5 = 25088.0
    tmp6 = tmp4 / tmp5
    tmp7 = 1e-05
    tmp8 = tmp6 + tmp7
    tmp9 = tl.math.rsqrt(tmp8)
    tmp10 = tmp3 * tmp9
    tmp12 = tmp10 * tmp11
    tmp14 = tmp12 + tmp13
    tmp15 = tmp14.to(tl.float32)
    tl.store(out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp15, None)
def get_args():
    arg_0 = rand_strided((128, 160, 14, 14), (31360, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((1, 160, 1, 1), (160, 1, 160, 160), device='cuda:0', dtype=torch.float32)
    arg_2 = rand_strided((1, 160, 1, 1), (160, 1, 160, 160), device='cuda:0', dtype=torch.float32)
    arg_3 = rand_strided((160,), (1,), device='cuda:0', dtype=torch.float32)
    arg_4 = rand_strided((160,), (1,), device='cuda:0', dtype=torch.float32)
    arg_5 = rand_strided((128, 160, 14, 14), (31360, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__native_batch_norm_legit_functional_153.run(*args, 4014080, grid=grid(4014080), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__native_batch_norm_legit_functional_153.benchmark_all_configs(*args, 4014080, grid=grid(4014080))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/25/c25hctynyjlhient6ceczvttfs4npfgnfoty5zhk6pn4ckhqm3ka.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_234
#
# Pointwise fp32 -> fp16 cast of a contiguous (240, 80, 1, 1) tensor
# (xnumel = 19200 = 240*80); presumably a conv weight — confirm against the
# caller.
triton_poi_fused__to_copy_154 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[32768], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_154(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 19200
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
    arg_0 = rand_strided((240, 80, 1, 1), (80, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((240, 80, 1, 1), (80, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_154.run(*args, 19200, grid=grid(19200), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_154.benchmark_all_configs(*args, 19200, grid=grid(19200))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/yz/cyzjinfa2nwvymmtkqoovwkfypkotg6ek2sunwsuf3vfd35r7l3k.py
# Original ATen: aten.cat
# aten.cat => cat_25
#
# One branch of a channel concatenation: copies a contiguous
# (128, 240, 14, 14) fp16 tensor (47040 = 240*196 elements per batch item)
# into an output whose per-batch stride is 94080 = 480*196 — i.e. this fills
# 240 of the 480 channels of the concat result; presumably a sibling kernel
# writes the other half at offset 47040 (not visible here — confirm).
triton_poi_fused_cat_155 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[8388608], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_cat_155(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 6021120
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x2 = xindex
    x0 = xindex % 47040
    x1 = (xindex // 47040)
    tmp0 = tl.load(in_ptr0 + (x2), None).to(tl.float32)
    tl.store(out_ptr0 + (x0 + (94080*x1) + tl.zeros([XBLOCK], tl.int32)), tmp0, None)
def get_args():
    arg_0 = rand_strided((128, 240, 14, 14), (47040, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 240, 14, 14), (94080, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused_cat_155.run(*args, 6021120, grid=grid(6021120), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused_cat_155.benchmark_all_configs(*args, 6021120, grid=grid(6021120))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/ks/cksikkrsggj4vc4g42rphhvyb3ox3tanqawsqpy2rpo5qbqc3tin.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => add_189, add_190, add_191, convert_element_type_236, mul_289, mul_290, mul_291, mul_292, mul_293, rsqrt_36, squeeze_109, var_mean_36
# NOTE(review): Inductor-generated kernel wrapper. The triple-quoted source is
# compiled verbatim by async_compile.triton (runtime data) and is kept
# byte-for-byte unchanged; documentation is added only outside the string.
# Batch-norm statistics reduction over 480 channels, 25088 (= 128*196)
# elements each. Two reduction passes over the same input: pass 1 computes
# the per-channel mean (written through in_out_ptr0) and a momentum-0.1
# running-mean update (new = 0.1*mean + 0.9*old); pass 2 computes the sum of
# squared deviations, from which it stores rsqrt(var + 1e-05) and a running
# variance updated with the unbiased correction 1.0000398612827361
# (= 25088/25087, i.e. N/(N-1)).
triton_red_fused__native_batch_norm_legit_functional_156 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@reduction(
    size_hints=[512, 32768],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*fp32', 1: '*fp16', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: 'i32', 9: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_156(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, out_ptr0, out_ptr1, out_ptr2, out_ptr3, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
    xnumel = 480
    rnumel = 25088
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rbase = tl.arange(0, RBLOCK)[None, :]
    x0 = xindex
    _tmp2 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
    for roffset in range(0, rnumel, RBLOCK):
        rindex = roffset + rbase
        rmask = rindex < rnumel
        r1 = rindex % 196
        r2 = (rindex // 196)
        tmp0 = tl.load(in_ptr0 + (r1 + (196*x0) + (94080*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp1 = tmp0.to(tl.float32)
        _tmp2 = tl.where(rmask & xmask, _tmp2 + tmp1, _tmp2)
    tmp2 = tl.sum(_tmp2, 1)[:, None]
    tmp7 = tl.load(in_ptr1 + (x0), xmask)
    tmp3 = 25088.0
    tmp4 = tmp2 / tmp3
    tmp5 = 0.1
    tmp6 = tmp4 * tmp5
    tmp8 = 0.9
    tmp9 = tmp7 * tmp8
    tmp10 = tmp6 + tmp9
    tl.store(in_out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp4, xmask)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp10, xmask)
    _tmp15 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
    for roffset in range(0, rnumel, RBLOCK):
        rindex = roffset + rbase
        rmask = rindex < rnumel
        r1 = rindex % 196
        r2 = (rindex // 196)
        tmp11 = tl.load(in_ptr0 + (r1 + (196*x0) + (94080*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp12 = tmp11.to(tl.float32)
        tmp13 = tmp12 - tmp4
        tmp14 = tmp13 * tmp13
        _tmp15 = tl.where(rmask & xmask, _tmp15 + tmp14, _tmp15)
    tmp15 = tl.sum(_tmp15, 1)[:, None]
    tl.store(out_ptr1 + x0, tmp15, xmask)
    tmp25 = tl.load(in_ptr2 + (x0), xmask)
    tmp16 = 25088.0
    tmp17 = tmp15 / tmp16
    tmp18 = 1e-05
    tmp19 = tmp17 + tmp18
    tmp20 = tl.math.rsqrt(tmp19)
    tmp21 = 1.0000398612827361
    tmp22 = tmp17 * tmp21
    tmp23 = 0.1
    tmp24 = tmp22 * tmp23
    tmp26 = 0.9
    tmp27 = tmp25 * tmp26
    tmp28 = tmp24 + tmp27
    tl.store(out_ptr2 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp20, xmask)
    tl.store(out_ptr3 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp28, xmask)
def get_args():
    arg_0 = rand_strided((1, 480, 1, 1), (480, 1, 480, 480), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((128, 480, 14, 14), (94080, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_2 = rand_strided((480,), (1,), device='cuda:0', dtype=torch.float32)
    arg_3 = rand_strided((480,), (1,), device='cuda:0', dtype=torch.float32)
    arg_4 = rand_strided((480,), (1,), device='cuda:0', dtype=torch.float32)
    arg_5 = rand_strided((1, 480, 1, 1), (480, 1, 480, 480), device='cuda:0', dtype=torch.float32)
    arg_6 = rand_strided((480,), (1,), device='cuda:0', dtype=torch.float32)
    arg_7 = rand_strided((480,), (1,), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_red_fused__native_batch_norm_legit_functional_156.run(*args, 480, 25088, grid=grid(480), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_red_fused__native_batch_norm_legit_functional_156.benchmark_all_configs(*args, 480, 25088, grid=grid(480))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/uq/cuq6a4x4agmkmfpm3znliwswlpgc26f3bnjc5ylnl5l6aeuugh6f.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.add, aten.clone, aten.fill, aten.mul, aten.sigmoid, aten.sub
# aten._native_batch_norm_legit_functional => add_189, add_192, convert_element_type_236, convert_element_type_237, mul_288, mul_294, rsqrt_36, sub_36, var_mean_36
# aten.add => add_336
# aten.clone => clone_27
# aten.fill => full_like_20
# aten.mul => mul_747, mul_748
# aten.sigmoid => sigmoid_84
# aten.sub => sub_169
# NOTE(review): Inductor-generated kernel wrapper. The triple-quoted source is
# compiled verbatim by async_compile.triton (runtime data) and is kept
# byte-for-byte unchanged; documentation is added only outside the string.
# Pointwise over a (128, 480, 14, 14) fp16 tensor: applies batch-norm
# normalization per channel (x - mean) * rsqrt(var/25088 + 1e-05) * weight
# + bias and stores the fp16 result to out_ptr0; additionally stores
# sigmoid(y) * (y*(1 - sigmoid(y)) + 1) to out_ptr1, which is algebraically
# the derivative of SiLU (y*sigmoid(y)) — presumably saved for the backward
# pass; confirm against the caller.
triton_poi_fused__native_batch_norm_legit_functional_add_clone_fill_mul_sigmoid_sub_157 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[16777216], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp16', 6: '*fp16', 7: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7), equal_to_1=())]})
@triton.jit
def triton_poi_fused__native_batch_norm_legit_functional_add_clone_fill_mul_sigmoid_sub_157(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, out_ptr0, out_ptr1, xnumel, XBLOCK : tl.constexpr):
    xnumel = 12042240
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x3 = xindex
    x1 = (xindex // 196) % 480
    tmp0 = tl.load(in_ptr0 + (x3), None).to(tl.float32)
    tmp2 = tl.load(in_ptr1 + (x1), None)
    tmp4 = tl.load(in_ptr2 + (x1), None)
    tmp11 = tl.load(in_ptr3 + (x1), None)
    tmp13 = tl.load(in_ptr4 + (x1), None)
    tmp1 = tmp0.to(tl.float32)
    tmp3 = tmp1 - tmp2
    tmp5 = 25088.0
    tmp6 = tmp4 / tmp5
    tmp7 = 1e-05
    tmp8 = tmp6 + tmp7
    tmp9 = tl.math.rsqrt(tmp8)
    tmp10 = tmp3 * tmp9
    tmp12 = tmp10 * tmp11
    tmp14 = tmp12 + tmp13
    tmp15 = tmp14.to(tl.float32)
    tmp16 = tl.sigmoid(tmp15)
    tmp17 = 1.0
    tmp18 = tmp17 - tmp16
    tmp19 = tmp15 * tmp18
    tmp20 = tmp19 + tmp17
    tmp21 = tmp16 * tmp20
    tl.store(out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp15, None)
    tl.store(out_ptr1 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp21, None)
def get_args():
    arg_0 = rand_strided((128, 480, 14, 14), (94080, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((1, 480, 1, 1), (480, 1, 480, 480), device='cuda:0', dtype=torch.float32)
    arg_2 = rand_strided((1, 480, 1, 1), (480, 1, 480, 480), device='cuda:0', dtype=torch.float32)
    arg_3 = rand_strided((480,), (1,), device='cuda:0', dtype=torch.float32)
    arg_4 = rand_strided((480,), (1,), device='cuda:0', dtype=torch.float32)
    arg_5 = rand_strided((128, 480, 14, 14), (94080, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_6 = rand_strided((128, 480, 14, 14), (94080, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__native_batch_norm_legit_functional_add_clone_fill_mul_sigmoid_sub_157.run(*args, 12042240, grid=grid(12042240), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__native_batch_norm_legit_functional_add_clone_fill_mul_sigmoid_sub_157.benchmark_all_configs(*args, 12042240, grid=grid(12042240))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/hb/chbvxlmers2jwf43nczkvjub2nxsnhvu5csbgr2u6jwg4rlcdpzl.py
# Original ATen: aten.split_with_sizes
# aten.split_with_sizes => getitem_242
# NOTE(review): Inductor-generated kernel wrapper. The triple-quoted source is
# compiled verbatim by async_compile.triton (runtime data) and is kept
# byte-for-byte unchanged; documentation is added only outside the string.
# Extracts the first 120-channel slice (per-batch offset 0, 23520 = 120*196
# elements) of a (128, 480, 14, 14) fp16 tensor and applies SiLU
# (x * sigmoid(x)) in fp32 before casting back to fp16. Siblings 160/162/164
# below handle the remaining three slices at offsets 23520/47040/70560.
triton_poi_fused_split_with_sizes_158 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_split_with_sizes_158(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 3010560
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex % 23520
    x1 = (xindex // 23520)
    x2 = xindex
    tmp0 = tl.load(in_ptr0 + (x0 + (94080*x1)), None).to(tl.float32)
    tmp1 = tmp0.to(tl.float32)
    tmp2 = tl.sigmoid(tmp1)
    tmp3 = tmp1 * tmp2
    tmp4 = tmp3.to(tl.float32)
    tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp4, None)
def get_args():
    arg_0 = rand_strided((128, 480, 14, 14), (94080, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 120, 14, 14), (23520, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused_split_with_sizes_158.run(*args, 3010560, grid=grid(3010560), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused_split_with_sizes_158.benchmark_all_configs(*args, 3010560, grid=grid(3010560))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/cu/ccuyxkbjwe7ff4mqowk3q7to3cbjg3o7g2sh6doygm5y2bnlvlfi.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_241
# NOTE(review): Inductor-generated kernel wrapper. The triple-quoted source is
# compiled verbatim by async_compile.triton (runtime data) and is kept
# byte-for-byte unchanged; documentation is added only outside the string.
# Elementwise fp32 -> fp16 cast of a 3000-element tensor; per get_args the
# shape is (120, 1, 5, 5) — presumably a depthwise 5x5 conv weight.
triton_poi_fused__to_copy_159 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4096], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_159(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 3000
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
    arg_0 = rand_strided((120, 1, 5, 5), (25, 25, 5, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((120, 1, 5, 5), (25, 25, 5, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_159.run(*args, 3000, grid=grid(3000), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_159.benchmark_all_configs(*args, 3000, grid=grid(3000))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/dg/cdglxs23pt7h2wls62uy6ewinjtjctcradtpw76t3qcrvnhso4kr.py
# Original ATen: aten.split_with_sizes
# aten.split_with_sizes => getitem_247
# NOTE(review): Inductor-generated kernel wrapper. The triple-quoted source is
# compiled verbatim by async_compile.triton (runtime data) and is kept
# byte-for-byte unchanged; documentation is added only outside the string.
# Same as kernel 158 but for the second 120-channel slice: reads at per-batch
# offset 23520 of the (128, 480, 14, 14) fp16 input and applies SiLU
# (x * sigmoid(x)) into a contiguous (128, 120, 14, 14) output.
triton_poi_fused_split_with_sizes_160 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_split_with_sizes_160(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 3010560
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex % 23520
    x1 = (xindex // 23520)
    x2 = xindex
    tmp0 = tl.load(in_ptr0 + (23520 + x0 + (94080*x1)), None).to(tl.float32)
    tmp1 = tmp0.to(tl.float32)
    tmp2 = tl.sigmoid(tmp1)
    tmp3 = tmp1 * tmp2
    tmp4 = tmp3.to(tl.float32)
    tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp4, None)
def get_args():
    arg_0 = rand_strided((128, 480, 14, 14), (94080, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 120, 14, 14), (23520, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused_split_with_sizes_160.run(*args, 3010560, grid=grid(3010560), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused_split_with_sizes_160.benchmark_all_configs(*args, 3010560, grid=grid(3010560))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/v4/cv4uf7ojdb6edtvnqbvbxjvi624yvygrldwzdl6uy2gdzz3sghqp.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_242
# NOTE(review): Inductor-generated kernel wrapper. The triple-quoted source is
# compiled verbatim by async_compile.triton (runtime data) and is kept
# byte-for-byte unchanged; documentation is added only outside the string.
# Elementwise fp32 -> fp16 cast of a 5880-element tensor; per get_args the
# shape is (120, 1, 7, 7) — presumably a depthwise 7x7 conv weight.
triton_poi_fused__to_copy_161 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[8192], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_161(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 5880
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
    arg_0 = rand_strided((120, 1, 7, 7), (49, 49, 7, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((120, 1, 7, 7), (49, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_161.run(*args, 5880, grid=grid(5880), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_161.benchmark_all_configs(*args, 5880, grid=grid(5880))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/bn/cbnnjypiavc2wmccrce2li6mllqpp2n4naaetozpnboypgpizsqn.py
# Original ATen: aten.split_with_sizes
# aten.split_with_sizes => getitem_252
# NOTE(review): Inductor-generated kernel wrapper. The triple-quoted source is
# compiled verbatim by async_compile.triton (runtime data) and is kept
# byte-for-byte unchanged; documentation is added only outside the string.
# Same as kernel 158 but for the third 120-channel slice: reads at per-batch
# offset 47040 of the (128, 480, 14, 14) fp16 input and applies SiLU
# (x * sigmoid(x)) into a contiguous (128, 120, 14, 14) output.
triton_poi_fused_split_with_sizes_162 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_split_with_sizes_162(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 3010560
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex % 23520
    x1 = (xindex // 23520)
    x2 = xindex
    tmp0 = tl.load(in_ptr0 + (47040 + x0 + (94080*x1)), None).to(tl.float32)
    tmp1 = tmp0.to(tl.float32)
    tmp2 = tl.sigmoid(tmp1)
    tmp3 = tmp1 * tmp2
    tmp4 = tmp3.to(tl.float32)
    tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp4, None)
def get_args():
    arg_0 = rand_strided((128, 480, 14, 14), (94080, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 120, 14, 14), (23520, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused_split_with_sizes_162.run(*args, 3010560, grid=grid(3010560), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused_split_with_sizes_162.benchmark_all_configs(*args, 3010560, grid=grid(3010560))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/4m/c4mgwvuur32h35cj53cazh6flbatygb5baks32qmyo4zqc7drw5k.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_243
# NOTE(review): Inductor-generated kernel wrapper. The triple-quoted source is
# compiled verbatim by async_compile.triton (runtime data) and is kept
# byte-for-byte unchanged; documentation is added only outside the string.
# Elementwise fp32 -> fp16 cast of a 9720-element tensor; per get_args the
# shape is (120, 1, 9, 9) — presumably a depthwise 9x9 conv weight.
triton_poi_fused__to_copy_163 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[16384], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_163(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 9720
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
    arg_0 = rand_strided((120, 1, 9, 9), (81, 81, 9, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((120, 1, 9, 9), (81, 81, 9, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_163.run(*args, 9720, grid=grid(9720), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_163.benchmark_all_configs(*args, 9720, grid=grid(9720))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/4m/c4mepa4uirogjqorspxpl4t5fx2rzmjdzatpwzpg64qrcyuvg7m4.py
# Original ATen: aten.split_with_sizes
# aten.split_with_sizes => getitem_257
# NOTE(review): Inductor-generated kernel wrapper. The triple-quoted source is
# compiled verbatim by async_compile.triton (runtime data) and is kept
# byte-for-byte unchanged; documentation is added only outside the string.
# Same as kernel 158 but for the fourth/last 120-channel slice: reads at
# per-batch offset 70560 of the (128, 480, 14, 14) fp16 input and applies
# SiLU (x * sigmoid(x)) into a contiguous (128, 120, 14, 14) output.
triton_poi_fused_split_with_sizes_164 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_split_with_sizes_164(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 3010560
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex % 23520
    x1 = (xindex // 23520)
    x2 = xindex
    tmp0 = tl.load(in_ptr0 + (70560 + x0 + (94080*x1)), None).to(tl.float32)
    tmp1 = tmp0.to(tl.float32)
    tmp2 = tl.sigmoid(tmp1)
    tmp3 = tmp1 * tmp2
    tmp4 = tmp3.to(tl.float32)
    tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp4, None)
def get_args():
    arg_0 = rand_strided((128, 480, 14, 14), (94080, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 120, 14, 14), (23520, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused_split_with_sizes_164.run(*args, 3010560, grid=grid(3010560), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused_split_with_sizes_164.benchmark_all_configs(*args, 3010560, grid=grid(3010560))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/qv/cqvo24surldsdqwccrhisp4chcnkg2otiq5qykrtymzijwmytnb7.py
# Original ATen: aten.cat
# aten.cat => cat_26
# NOTE(review): Inductor-generated kernel wrapper. The triple-quoted source is
# compiled verbatim by async_compile.triton (runtime data) and is kept
# byte-for-byte unchanged; documentation is added only outside the string.
# Copies a contiguous (128, 120, 14, 14) fp16 tensor (23520 elements per
# batch) into an output whose batch stride is 94080, i.e. it fills one
# 120-channel quarter of a 480-channel concatenation; presumably siblings
# fill the remaining quarters — confirm against the caller.
triton_poi_fused_cat_165 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_cat_165(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 3010560
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x2 = xindex
    x0 = xindex % 23520
    x1 = (xindex // 23520)
    tmp0 = tl.load(in_ptr0 + (x2), None).to(tl.float32)
    tl.store(out_ptr0 + (x0 + (94080*x1) + tl.zeros([XBLOCK], tl.int32)), tmp0, None)
def get_args():
    arg_0 = rand_strided((128, 120, 14, 14), (23520, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 120, 14, 14), (94080, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused_cat_165.run(*args, 3010560, grid=grid(3010560), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused_cat_165.benchmark_all_configs(*args, 3010560, grid=grid(3010560))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
| # kernel path: /tmp/torchinductor_shunting/25/c25wuycbavajx7kwcvwpcba6kxp6j5oofnjkbplrrxckdmxy3shz.py | |
| # Original ATen: aten._native_batch_norm_legit_functional, aten.mean, aten.silu | |
| # aten._native_batch_norm_legit_functional => add_194, add_197, convert_element_type_244, convert_element_type_245, mul_296, mul_302, rsqrt_37, sub_37, var_mean_37 | |
| # aten.mean => mean_9 | |
| # aten.silu => convert_element_type_246, convert_element_type_247, mul_303, sigmoid_37 | |
# Persistent-reduction kernel: applies the batch-norm affine transform and SiLU
# to a (128, 480, 14, 14) fp16 activation (written to out_ptr0), while also
# reducing the 196 spatial positions of each (n, c) row to their mean
# (out_ptr2) — the squeeze-excite pooling input.  in_ptr1 holds the per-channel
# mean, in_ptr2 a variance accumulator divided here by 25088.0 (= 128 * 196
# elements per channel), in_ptr3/in_ptr4 the bn weight/bias; arithmetic runs in
# fp32 and results are cast back to fp16.
# NOTE(review): the triton source below is a runtime string literal compiled by
# async_compile.triton — its text is part of program behavior (and of the
# compile-cache key), so it is left unchanged here.
triton_per_fused__native_batch_norm_legit_functional_mean_silu_166 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import persistent_reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@persistent_reduction(
    size_hints=[65536, 256],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*fp16', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp16', 6: '*fp16', 7: 'i32', 8: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7), equal_to_1=())]}
)
@triton.jit
def triton_per_fused__native_batch_norm_legit_functional_mean_silu_166(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, out_ptr0, out_ptr2, xnumel, rnumel, XBLOCK : tl.constexpr):
    xnumel = 61440
    rnumel = 196
    RBLOCK: tl.constexpr = 256
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rindex = tl.arange(0, RBLOCK)[None, :]
    rmask = rindex < rnumel
    r2 = rindex
    x3 = xindex
    x0 = xindex % 480
    tmp0 = tl.load(in_ptr0 + (r2 + (196*x3)), rmask, other=0).to(tl.float32)
    tmp2 = tl.load(in_ptr1 + (x0), None)
    tmp4 = tl.load(in_ptr2 + (x0), None)
    tmp11 = tl.load(in_ptr3 + (x0), None)
    tmp13 = tl.load(in_ptr4 + (x0), None)
    tmp1 = tmp0.to(tl.float32)
    tmp3 = tmp1 - tmp2
    tmp5 = 25088.0
    tmp6 = tmp4 / tmp5
    tmp7 = 1e-05
    tmp8 = tmp6 + tmp7
    tmp9 = tl.math.rsqrt(tmp8)
    tmp10 = tmp3 * tmp9
    tmp12 = tmp10 * tmp11
    tmp14 = tmp12 + tmp13
    tmp15 = tmp14.to(tl.float32)
    tmp16 = tmp15.to(tl.float32)
    tmp17 = tl.sigmoid(tmp16)
    tmp18 = tmp16 * tmp17
    tmp19 = tmp18.to(tl.float32)
    tmp20 = tmp19.to(tl.float32)
    tmp22 = tl.where(rmask, tmp20, 0)
    tmp23 = tl.sum(tmp22, 1)[:, None]
    tmp24 = 196.0
    tmp25 = tmp23 / tmp24
    tmp26 = tmp25.to(tl.float32)
    tl.store(out_ptr0 + (r2 + (196*x3) + tl.zeros([XBLOCK, RBLOCK], tl.int32)), tmp15, rmask)
    tl.store(out_ptr2 + (x3 + tl.zeros([XBLOCK, 1], tl.int32)), tmp26, None)
def get_args():
    arg_0 = rand_strided((128, 480, 14, 14), (94080, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((1, 480, 1, 1), (480, 1, 480, 480), device='cuda:0', dtype=torch.float32)
    arg_2 = rand_strided((1, 480, 1, 1), (480, 1, 480, 480), device='cuda:0', dtype=torch.float32)
    arg_3 = rand_strided((480,), (1,), device='cuda:0', dtype=torch.float32)
    arg_4 = rand_strided((480,), (1,), device='cuda:0', dtype=torch.float32)
    arg_5 = rand_strided((128, 480, 14, 14), (94080, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_6 = rand_strided((128, 480, 1, 1), (480, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_per_fused__native_batch_norm_legit_functional_mean_silu_166.run(*args, 61440, 196, grid=grid(61440), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_per_fused__native_batch_norm_legit_functional_mean_silu_166.benchmark_all_configs(*args, 61440, 196, grid=grid(61440))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
| # kernel path: /tmp/torchinductor_shunting/vh/cvhn4ymz4o7zw5xdmhfhtnvjkm7lfirplpypnuvceox4nfhcy7wo.py | |
| # Original ATen: aten._to_copy | |
| # aten._to_copy => convert_element_type_249 | |
# Pointwise dtype-cast kernel: copies an (80, 480, 1, 1) fp32 convolution
# weight to fp16 (38400 contiguous elements), generated for aten._to_copy.
# NOTE(review): the triton source below is a runtime string literal compiled by
# async_compile.triton — left byte-identical so behavior/cache key are stable.
triton_poi_fused__to_copy_167 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[65536], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_167(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 38400
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
    arg_0 = rand_strided((80, 480, 1, 1), (480, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((80, 480, 1, 1), (480, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_167.run(*args, 38400, grid=grid(38400), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_167.benchmark_all_configs(*args, 38400, grid=grid(38400))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
| # kernel path: /tmp/torchinductor_shunting/uc/cuc5yc2wu6mlfc426p45ysrpjdvcf67n4fnoghr4zijaa4qc4jub.py | |
| # Original ATen: aten._to_copy, aten.convolution | |
| # aten._to_copy => convert_element_type_248 | |
| # aten.convolution => convolution_95 | |
# Pointwise dtype-cast kernel: copies an 80-element fp32 convolution bias to
# fp16 ahead of convolution_95 (aten._to_copy + aten.convolution prologue).
# NOTE(review): the triton source below is a runtime string literal compiled by
# async_compile.triton — left byte-identical so behavior/cache key are stable.
triton_poi_fused__to_copy_convolution_168 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[128], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_convolution_168(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 80
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
    arg_0 = rand_strided((80,), (1,), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((80,), (1,), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_convolution_168.run(*args, 80, grid=grid(80), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_convolution_168.benchmark_all_configs(*args, 80, grid=grid(80))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
| # kernel path: /tmp/torchinductor_shunting/w4/cw4m6y7egnkzpotuw7yanoiegenydq7w76ehn3uqkz73dfsxrsvq.py | |
| # Original ATen: aten._to_copy, aten.convolution, aten.silu | |
| # aten._to_copy => convert_element_type_248 | |
| # aten.convolution => convolution_95 | |
| # aten.silu => convert_element_type_250, convert_element_type_251, mul_304, sigmoid_38 | |
# Fused bias-add + SiLU epilogue for the squeeze-excite reduce conv
# (convolution_95): in_out_ptr0 holds the (128, 80, 1, 1) fp16 conv result and
# is updated IN PLACE with the per-channel bias (in_ptr0); the SiLU of that sum
# is written separately to out_ptr0.  SiLU is computed in fp32 then cast back.
# NOTE(review): the triton source below is a runtime string literal compiled by
# async_compile.triton — left byte-identical so behavior/cache key are stable.
triton_poi_fused__to_copy_convolution_silu_169 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[16384], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_convolution_silu_169(in_out_ptr0, in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 10240
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x2 = xindex
    x0 = xindex % 80
    tmp0 = tl.load(in_out_ptr0 + (x2), None).to(tl.float32)
    tmp1 = tl.load(in_ptr0 + (x0), None).to(tl.float32)
    tmp2 = tmp0 + tmp1
    tmp3 = tmp2.to(tl.float32)
    tmp4 = tl.sigmoid(tmp3)
    tmp5 = tmp3 * tmp4
    tmp6 = tmp5.to(tl.float32)
    tl.store(in_out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp2, None)
    tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp6, None)
def get_args():
    arg_0 = rand_strided((128, 80, 1, 1), (80, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((80,), (1,), device='cuda:0', dtype=torch.float16)
    arg_2 = rand_strided((128, 80, 1, 1), (80, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1, arg_2,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_convolution_silu_169.run(*args, 10240, grid=grid(10240), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_convolution_silu_169.benchmark_all_configs(*args, 10240, grid=grid(10240))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
| # kernel path: /tmp/torchinductor_shunting/5w/c5wu2byqoqpdpilokyxdylwqvzl3vlutvi3qed6ouxeny7v3ma4k.py | |
| # Original ATen: aten._to_copy | |
| # aten._to_copy => convert_element_type_253 | |
# Pointwise dtype-cast kernel: copies the (480, 80, 1, 1) fp32 squeeze-excite
# expand-conv weight to fp16 (38400 contiguous elements; same pattern as
# kernel 167 with a different weight shape).
# NOTE(review): the triton source below is a runtime string literal compiled by
# async_compile.triton — left byte-identical so behavior/cache key are stable.
triton_poi_fused__to_copy_170 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[65536], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_170(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 38400
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
    arg_0 = rand_strided((480, 80, 1, 1), (80, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((480, 80, 1, 1), (80, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_170.run(*args, 38400, grid=grid(38400), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_170.benchmark_all_configs(*args, 38400, grid=grid(38400))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
| # kernel path: /tmp/torchinductor_shunting/t6/ct65ncuugfm3kp7u2z2xcftkozxdb7egjj3feed7rrbxlnc25mxn.py | |
| # Original ATen: aten._to_copy, aten.convolution | |
| # aten._to_copy => convert_element_type_252 | |
| # aten.convolution => convolution_96 | |
# Pointwise dtype-cast kernel: copies the 480-element fp32 bias of
# convolution_96 to fp16 (same pattern as kernel 168, different length).
# NOTE(review): the triton source below is a runtime string literal compiled by
# async_compile.triton — left byte-identical so behavior/cache key are stable.
triton_poi_fused__to_copy_convolution_171 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[512], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_convolution_171(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 480
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
    arg_0 = rand_strided((480,), (1,), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((480,), (1,), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_convolution_171.run(*args, 480, grid=grid(480), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_convolution_171.benchmark_all_configs(*args, 480, grid=grid(480))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
| # kernel path: /tmp/torchinductor_shunting/q3/cq36nnpdlb4xwdvwsiktoah4r62ugwqlfeikgt4twhaobbsdqnku.py | |
| # Original ATen: aten._to_copy, aten.convolution | |
| # aten._to_copy => convert_element_type_252 | |
| # aten.convolution => convolution_96 | |
# In-place bias-add epilogue for convolution_96: adds the 480-element fp16 bias
# (in_ptr0, broadcast per channel) to the (128, 480, 1, 1) fp16 conv output
# held in in_out_ptr0 (61440 = 128 * 480 elements).
# NOTE(review): the triton source below is a runtime string literal compiled by
# async_compile.triton — left byte-identical so behavior/cache key are stable.
triton_poi_fused__to_copy_convolution_172 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[65536], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_convolution_172(in_out_ptr0, in_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 61440
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x2 = xindex
    x0 = xindex % 480
    tmp0 = tl.load(in_out_ptr0 + (x2), None).to(tl.float32)
    tmp1 = tl.load(in_ptr0 + (x0), None).to(tl.float32)
    tmp2 = tmp0 + tmp1
    tl.store(in_out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp2, None)
def get_args():
    arg_0 = rand_strided((128, 480, 1, 1), (480, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((480,), (1,), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_convolution_172.run(*args, 61440, grid=grid(61440), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_convolution_172.benchmark_all_configs(*args, 61440, grid=grid(61440))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
| # kernel path: /tmp/torchinductor_shunting/ce/cce2z44226346eg4knvdzt24sfsujmrpjne4vcofj37miaugwnek.py | |
| # Original ATen: aten.mul, aten.sigmoid, aten.silu | |
| # aten.mul => mul_305 | |
| # aten.sigmoid => sigmoid_39 | |
| # aten.silu => convert_element_type_246, convert_element_type_247, mul_303, sigmoid_37 | |
# Squeeze-excite gating kernel: computes SiLU of the (128, 480, 14, 14) fp16
# activation (in_ptr0) and multiplies it by sigmoid of the per-(n, c) gate
# value (in_ptr1, broadcast over the 196 spatial positions via x1 = x // 196);
# result written to out_ptr0.  12042240 = 128 * 480 * 196 elements.
# NOTE(review): the triton source below is a runtime string literal compiled by
# async_compile.triton — left byte-identical so behavior/cache key are stable.
triton_poi_fused_mul_sigmoid_silu_173 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[16777216], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3), equal_to_1=())]})
@triton.jit
def triton_poi_fused_mul_sigmoid_silu_173(in_ptr0, in_ptr1, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 12042240
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x2 = xindex
    x1 = (xindex // 196)
    tmp0 = tl.load(in_ptr0 + (x2), None).to(tl.float32)
    tmp5 = tl.load(in_ptr1 + (x1), None).to(tl.float32)
    tmp1 = tmp0.to(tl.float32)
    tmp2 = tl.sigmoid(tmp1)
    tmp3 = tmp1 * tmp2
    tmp4 = tmp3.to(tl.float32)
    tmp6 = tl.sigmoid(tmp5)
    tmp7 = tmp4 * tmp6
    tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp7, None)
def get_args():
    arg_0 = rand_strided((128, 480, 14, 14), (94080, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 480, 1, 1), (480, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    arg_2 = rand_strided((128, 480, 14, 14), (94080, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1, arg_2,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused_mul_sigmoid_silu_173.run(*args, 12042240, grid=grid(12042240), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused_mul_sigmoid_silu_173.benchmark_all_configs(*args, 12042240, grid=grid(12042240))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
| # kernel path: /tmp/torchinductor_shunting/jt/cjtarvyynr3tkfvm37qgtcwaksnjyb62otlxal5l5thrlfpz4yoz.py | |
| # Original ATen: aten.split_with_sizes | |
| # aten.split_with_sizes => getitem_260 | |
# split_with_sizes kernel (first half): copies channels 0-239 of a
# (128, 480, 14, 14) fp16 tensor into a contiguous (128, 240, 14, 14) buffer.
# Per batch element it reads 47040 = 240 * 196 values at batch stride 94080.
# NOTE(review): the triton source below is a runtime string literal compiled by
# async_compile.triton — left byte-identical so behavior/cache key are stable.
triton_poi_fused_split_with_sizes_174 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[8388608], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_split_with_sizes_174(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 6021120
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex % 47040
    x1 = (xindex // 47040)
    x2 = xindex
    tmp0 = tl.load(in_ptr0 + (x0 + (94080*x1)), None).to(tl.float32)
    tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp0, None)
def get_args():
    arg_0 = rand_strided((128, 480, 14, 14), (94080, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 240, 14, 14), (47040, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused_split_with_sizes_174.run(*args, 6021120, grid=grid(6021120), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused_split_with_sizes_174.benchmark_all_configs(*args, 6021120, grid=grid(6021120))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
| # kernel path: /tmp/torchinductor_shunting/zb/czbuvo3mdbqgu4vhbh2rqqgxj7l4q2oef5prb5xafk4aclfa4wgv.py | |
| # Original ATen: aten.split_with_sizes | |
| # aten.split_with_sizes => getitem_261 | |
# split_with_sizes kernel (second half): same as kernel 174 but reads at base
# offset 47040, i.e. copies channels 240-479 of the (128, 480, 14, 14) fp16
# tensor into a contiguous (128, 240, 14, 14) buffer.
# NOTE(review): the triton source below is a runtime string literal compiled by
# async_compile.triton — left byte-identical so behavior/cache key are stable.
triton_poi_fused_split_with_sizes_175 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[8388608], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_split_with_sizes_175(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 6021120
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex % 47040
    x1 = (xindex // 47040)
    x2 = xindex
    tmp0 = tl.load(in_ptr0 + (47040 + x0 + (94080*x1)), None).to(tl.float32)
    tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp0, None)
def get_args():
    arg_0 = rand_strided((128, 480, 14, 14), (94080, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 240, 14, 14), (47040, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused_split_with_sizes_175.run(*args, 6021120, grid=grid(6021120), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused_split_with_sizes_175.benchmark_all_configs(*args, 6021120, grid=grid(6021120))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
| # kernel path: /tmp/torchinductor_shunting/fo/cfobovxlpvp6vyrqqsfdz6jlgrpfwpemctlxrrgsykw3zih3en3k.py | |
| # Original ATen: aten._to_copy | |
| # aten._to_copy => convert_element_type_254 | |
# Pointwise dtype-cast kernel: copies an (80, 240, 1, 1) fp32 convolution
# weight to fp16 (19200 contiguous elements).
# NOTE(review): the triton source below is a runtime string literal compiled by
# async_compile.triton — left byte-identical so behavior/cache key are stable.
triton_poi_fused__to_copy_176 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[32768], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_176(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 19200
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
    arg_0 = rand_strided((80, 240, 1, 1), (240, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((80, 240, 1, 1), (240, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_176.run(*args, 19200, grid=grid(19200), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_176.benchmark_all_configs(*args, 19200, grid=grid(19200))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
| # kernel path: /tmp/torchinductor_shunting/66/c66wd4retmi6azmfzfxmll5pmv3btijpcst56cykpuynmmhzhq23.py | |
| # Original ATen: aten.cat | |
| # aten.cat => cat_27 | |
# aten.cat kernel: copies a contiguous (128, 80, 14, 14) fp16 tensor
# (15680 = 80 * 196 values per batch element) into the first channel slice of
# a larger buffer whose batch stride is 31360, i.e. one half of a
# channel-concatenated (128, 160, 14, 14) result.
# NOTE(review): the triton source below is a runtime string literal compiled by
# async_compile.triton — left byte-identical so behavior/cache key are stable.
triton_poi_fused_cat_177 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[2097152], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_cat_177(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 2007040
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x2 = xindex
    x0 = xindex % 15680
    x1 = (xindex // 15680)
    tmp0 = tl.load(in_ptr0 + (x2), None).to(tl.float32)
    tl.store(out_ptr0 + (x0 + (31360*x1) + tl.zeros([XBLOCK], tl.int32)), tmp0, None)
def get_args():
    arg_0 = rand_strided((128, 80, 14, 14), (15680, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 80, 14, 14), (31360, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused_cat_177.run(*args, 2007040, grid=grid(2007040), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused_cat_177.benchmark_all_configs(*args, 2007040, grid=grid(2007040))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/4x/c4xzenlwjgchxhkagftgwbnakluheaf3fbwuqeozpqdbo7czlfp4.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.add
# aten._native_batch_norm_legit_functional => add_199, add_202, convert_element_type_256, convert_element_type_257, mul_306, mul_312, rsqrt_38, sub_38, var_mean_38
# aten.add => add_203
#
# Applies batch-norm normalization in fp32 to an fp16 (128, 160, 14, 14)
# input — (x - mean) * rsqrt(var_sum/25088 + 1e-5) * weight + bias, where
# 25088 = 128 * 14 * 14 samples per channel — then casts back to fp16 and adds
# a residual fp16 tensor of the same shape. Per-channel stats/affine params
# are indexed by x1 = (xindex // 196) % 160.
triton_poi_fused__native_batch_norm_legit_functional_add_178 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp16', 6: '*fp16', 7: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7), equal_to_1=())]})
@triton.jit
def triton_poi_fused__native_batch_norm_legit_functional_add_178(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 4014080
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x3 = xindex
    x1 = (xindex // 196) % 160
    tmp0 = tl.load(in_ptr0 + (x3), None).to(tl.float32)
    tmp2 = tl.load(in_ptr1 + (x1), None)
    tmp4 = tl.load(in_ptr2 + (x1), None)
    tmp11 = tl.load(in_ptr3 + (x1), None)
    tmp13 = tl.load(in_ptr4 + (x1), None)
    tmp16 = tl.load(in_ptr5 + (x3), None).to(tl.float32)
    tmp1 = tmp0.to(tl.float32)
    tmp3 = tmp1 - tmp2
    tmp5 = 25088.0
    tmp6 = tmp4 / tmp5
    tmp7 = 1e-05
    tmp8 = tmp6 + tmp7
    tmp9 = tl.math.rsqrt(tmp8)
    tmp10 = tmp3 * tmp9
    tmp12 = tmp10 * tmp11
    tmp14 = tmp12 + tmp13
    tmp15 = tmp14.to(tl.float32)
    tmp17 = tmp15 + tmp16
    tl.store(out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp17, None)
def get_args():
    arg_0 = rand_strided((128, 160, 14, 14), (31360, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((1, 160, 1, 1), (160, 1, 160, 160), device='cuda:0', dtype=torch.float32)
    arg_2 = rand_strided((1, 160, 1, 1), (160, 1, 160, 160), device='cuda:0', dtype=torch.float32)
    arg_3 = rand_strided((160,), (1,), device='cuda:0', dtype=torch.float32)
    arg_4 = rand_strided((160,), (1,), device='cuda:0', dtype=torch.float32)
    arg_5 = rand_strided((128, 160, 14, 14), (31360, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_6 = rand_strided((128, 160, 14, 14), (31360, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__native_batch_norm_legit_functional_add_178.run(*args, 4014080, grid=grid(4014080), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__native_batch_norm_legit_functional_add_178.benchmark_all_configs(*args, 4014080, grid=grid(4014080))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/xt/cxt5mn4i2rohwchtz75fbvbykzhjdnjrf3qopbwsyyqyu5jqlzr6.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_306
#
# Elementwise fp32 -> fp16 cast of a 153600-element tensor; get_args shows a
# (960, 160, 1, 1) weight (likely a 1x1 conv weight — inferred from shape only).
triton_poi_fused__to_copy_179 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[262144], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_179(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 153600
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), None)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, None)
def get_args():
    arg_0 = rand_strided((960, 160, 1, 1), (160, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((960, 160, 1, 1), (160, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_179.run(*args, 153600, grid=grid(153600), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_179.benchmark_all_configs(*args, 153600, grid=grid(153600))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/g5/cg527ugqaap3ndd26uaeekojscsqxex2xmpgnu55nr6zwyzt6pfr.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => add_237, add_238, add_239, convert_element_type_307, mul_364, mul_365, mul_366, mul_367, mul_368, rsqrt_45, squeeze_136, var_mean_45
#
# Two-pass batch-norm statistics reduction for 960 channels over
# rnumel = 25088 = 128 * 196 elements per channel (fp16 input accumulated in
# fp32). Pass 1 sums x to get the batch mean (written to in_out_ptr0) and the
# momentum-0.1 running-mean update (out_ptr0 = 0.1*mean + 0.9*old). Pass 2
# re-reads the input to accumulate sum((x - mean)^2) (out_ptr1), then derives
# rsqrt(var + 1e-5) (out_ptr2) and the running-var update using the unbiased
# correction 1.0000398612827361 = 25088/25087 (out_ptr3).
triton_red_fused__native_batch_norm_legit_functional_180 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@reduction(
    size_hints=[1024, 32768],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*fp32', 1: '*fp16', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: 'i32', 9: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_180(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, out_ptr0, out_ptr1, out_ptr2, out_ptr3, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
    xnumel = 960
    rnumel = 25088
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rbase = tl.arange(0, RBLOCK)[None, :]
    x0 = xindex
    _tmp2 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
    for roffset in range(0, rnumel, RBLOCK):
        rindex = roffset + rbase
        rmask = rindex < rnumel
        r1 = rindex % 196
        r2 = (rindex // 196)
        tmp0 = tl.load(in_ptr0 + (r1 + (196*x0) + (188160*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp1 = tmp0.to(tl.float32)
        _tmp2 = tl.where(rmask & xmask, _tmp2 + tmp1, _tmp2)
    tmp2 = tl.sum(_tmp2, 1)[:, None]
    tmp7 = tl.load(in_ptr1 + (x0), xmask)
    tmp3 = 25088.0
    tmp4 = tmp2 / tmp3
    tmp5 = 0.1
    tmp6 = tmp4 * tmp5
    tmp8 = 0.9
    tmp9 = tmp7 * tmp8
    tmp10 = tmp6 + tmp9
    tl.store(in_out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp4, xmask)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp10, xmask)
    _tmp15 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
    for roffset in range(0, rnumel, RBLOCK):
        rindex = roffset + rbase
        rmask = rindex < rnumel
        r1 = rindex % 196
        r2 = (rindex // 196)
        tmp11 = tl.load(in_ptr0 + (r1 + (196*x0) + (188160*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp12 = tmp11.to(tl.float32)
        tmp13 = tmp12 - tmp4
        tmp14 = tmp13 * tmp13
        _tmp15 = tl.where(rmask & xmask, _tmp15 + tmp14, _tmp15)
    tmp15 = tl.sum(_tmp15, 1)[:, None]
    tl.store(out_ptr1 + x0, tmp15, xmask)
    tmp25 = tl.load(in_ptr2 + (x0), xmask)
    tmp16 = 25088.0
    tmp17 = tmp15 / tmp16
    tmp18 = 1e-05
    tmp19 = tmp17 + tmp18
    tmp20 = tl.math.rsqrt(tmp19)
    tmp21 = 1.0000398612827361
    tmp22 = tmp17 * tmp21
    tmp23 = 0.1
    tmp24 = tmp22 * tmp23
    tmp26 = 0.9
    tmp27 = tmp25 * tmp26
    tmp28 = tmp24 + tmp27
    tl.store(out_ptr2 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp20, xmask)
    tl.store(out_ptr3 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp28, xmask)
def get_args():
    arg_0 = rand_strided((1, 960, 1, 1), (960, 1, 960, 960), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((128, 960, 14, 14), (188160, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_2 = rand_strided((960,), (1,), device='cuda:0', dtype=torch.float32)
    arg_3 = rand_strided((960,), (1,), device='cuda:0', dtype=torch.float32)
    arg_4 = rand_strided((960,), (1,), device='cuda:0', dtype=torch.float32)
    arg_5 = rand_strided((1, 960, 1, 1), (960, 1, 960, 960), device='cuda:0', dtype=torch.float32)
    arg_6 = rand_strided((960,), (1,), device='cuda:0', dtype=torch.float32)
    arg_7 = rand_strided((960,), (1,), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_red_fused__native_batch_norm_legit_functional_180.run(*args, 960, 25088, grid=grid(960), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_red_fused__native_batch_norm_legit_functional_180.benchmark_all_configs(*args, 960, 25088, grid=grid(960))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/dy/cdyo4fykv4n564s3r3eahdxt2kdz67oeovon5bt5bw2lnq6iiowx.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.add, aten.clone, aten.fill, aten.mul, aten.sigmoid, aten.sub
# aten._native_batch_norm_legit_functional => add_237, add_240, convert_element_type_307, convert_element_type_308, mul_363, mul_369, rsqrt_45, sub_45, var_mean_45
# aten.add => add_322
# aten.clone => clone_36
# aten.fill => full_like_11
# aten.mul => mul_627, mul_628
# aten.sigmoid => sigmoid_75
# aten.sub => sub_121
#
# Applies batch-norm normalization (fp32 math, eps 1e-5, 25088 samples per
# channel) to an fp16 (128, 960, 14, 14) input and stores the fp16 result y
# (out_ptr0). Also stores sigmoid(y) * (1 + y * (1 - sigmoid(y))) (out_ptr1),
# which equals the derivative of SiLU, y*sigmoid(y) — presumably saved for the
# backward pass given the fused aten.fill/sub/mul ops above.
triton_poi_fused__native_batch_norm_legit_functional_add_clone_fill_mul_sigmoid_sub_181 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[33554432], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp16', 6: '*fp16', 7: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7), equal_to_1=())]})
@triton.jit
def triton_poi_fused__native_batch_norm_legit_functional_add_clone_fill_mul_sigmoid_sub_181(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, out_ptr0, out_ptr1, xnumel, XBLOCK : tl.constexpr):
    xnumel = 24084480
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x3 = xindex
    x1 = (xindex // 196) % 960
    tmp0 = tl.load(in_ptr0 + (x3), None).to(tl.float32)
    tmp2 = tl.load(in_ptr1 + (x1), None)
    tmp4 = tl.load(in_ptr2 + (x1), None)
    tmp11 = tl.load(in_ptr3 + (x1), None)
    tmp13 = tl.load(in_ptr4 + (x1), None)
    tmp1 = tmp0.to(tl.float32)
    tmp3 = tmp1 - tmp2
    tmp5 = 25088.0
    tmp6 = tmp4 / tmp5
    tmp7 = 1e-05
    tmp8 = tmp6 + tmp7
    tmp9 = tl.math.rsqrt(tmp8)
    tmp10 = tmp3 * tmp9
    tmp12 = tmp10 * tmp11
    tmp14 = tmp12 + tmp13
    tmp15 = tmp14.to(tl.float32)
    tmp16 = tl.sigmoid(tmp15)
    tmp17 = 1.0
    tmp18 = tmp17 - tmp16
    tmp19 = tmp15 * tmp18
    tmp20 = tmp19 + tmp17
    tmp21 = tmp16 * tmp20
    tl.store(out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp15, None)
    tl.store(out_ptr1 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp21, None)
def get_args():
    arg_0 = rand_strided((128, 960, 14, 14), (188160, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((1, 960, 1, 1), (960, 1, 960, 960), device='cuda:0', dtype=torch.float32)
    arg_2 = rand_strided((1, 960, 1, 1), (960, 1, 960, 960), device='cuda:0', dtype=torch.float32)
    arg_3 = rand_strided((960,), (1,), device='cuda:0', dtype=torch.float32)
    arg_4 = rand_strided((960,), (1,), device='cuda:0', dtype=torch.float32)
    arg_5 = rand_strided((128, 960, 14, 14), (188160, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_6 = rand_strided((128, 960, 14, 14), (188160, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__native_batch_norm_legit_functional_add_clone_fill_mul_sigmoid_sub_181.run(*args, 24084480, grid=grid(24084480), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__native_batch_norm_legit_functional_add_clone_fill_mul_sigmoid_sub_181.benchmark_all_configs(*args, 24084480, grid=grid(24084480))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/3n/c3n35ckjp4qebx5kiwvfxztqxmlk7uwm2vsaphijnyencaajb4gh.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_311
#
# Elementwise fp32 -> fp16 cast of a 2160-element tensor; get_args shows a
# (240, 1, 3, 3) weight (shape consistent with a depthwise 3x3 conv weight).
triton_poi_fused__to_copy_182 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4096], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_182(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 2160
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
    arg_0 = rand_strided((240, 1, 3, 3), (9, 9, 3, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((240, 1, 3, 3), (9, 9, 3, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_182.run(*args, 2160, grid=grid(2160), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_182.benchmark_all_configs(*args, 2160, grid=grid(2160))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/ow/cowqeeeuzgoqgwjjram7xczyaz465plzun33c7elszbys2k2zrzn.py
# Original ATen: aten.split_with_sizes
# aten.split_with_sizes => getitem_330
#
# First branch of a channel split: reads channels [0, 240) of a
# (128, 960, 14, 14) fp16 tensor (offset 0, batch stride 188160) and applies
# SiLU — x * sigmoid(x) — in fp32 before storing the fp16 (128, 240, 14, 14)
# result. Sibling kernels _185/_187 handle the subsequent 240-channel slices.
triton_poi_fused_split_with_sizes_183 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[8388608], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_split_with_sizes_183(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 6021120
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex % 47040
    x1 = (xindex // 47040)
    x2 = xindex
    tmp0 = tl.load(in_ptr0 + (x0 + (188160*x1)), None).to(tl.float32)
    tmp1 = tmp0.to(tl.float32)
    tmp2 = tl.sigmoid(tmp1)
    tmp3 = tmp1 * tmp2
    tmp4 = tmp3.to(tl.float32)
    tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp4, None)
def get_args():
    arg_0 = rand_strided((128, 960, 14, 14), (188160, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 240, 14, 14), (47040, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused_split_with_sizes_183.run(*args, 6021120, grid=grid(6021120), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused_split_with_sizes_183.benchmark_all_configs(*args, 6021120, grid=grid(6021120))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/qx/cqxoadvgndjsqu4bltg3sq7l3fij2isfiihadwqnkb3ibmloxfyz.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_312
#
# Elementwise fp32 -> fp16 cast of a 6000-element tensor; get_args shows a
# (240, 1, 5, 5) weight (shape consistent with a depthwise 5x5 conv weight).
triton_poi_fused__to_copy_184 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[8192], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_184(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 6000
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
    arg_0 = rand_strided((240, 1, 5, 5), (25, 25, 5, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((240, 1, 5, 5), (25, 25, 5, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_184.run(*args, 6000, grid=grid(6000), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_184.benchmark_all_configs(*args, 6000, grid=grid(6000))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/vk/cvkedu5ynhqgwwdblwjx7rst7rz42xs7fl3vaof4iorofkc5cxam.py
# Original ATen: aten.split_with_sizes
# aten.split_with_sizes => getitem_335
#
# Second branch of the channel split (cf. _183): reads channels [240, 480) of
# the (128, 960, 14, 14) fp16 tensor — element offset 47040 = 240 * 196 — and
# applies SiLU (x * sigmoid(x)) in fp32 before storing the fp16
# (128, 240, 14, 14) result.
triton_poi_fused_split_with_sizes_185 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[8388608], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_split_with_sizes_185(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 6021120
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex % 47040
    x1 = (xindex // 47040)
    x2 = xindex
    tmp0 = tl.load(in_ptr0 + (47040 + x0 + (188160*x1)), None).to(tl.float32)
    tmp1 = tmp0.to(tl.float32)
    tmp2 = tl.sigmoid(tmp1)
    tmp3 = tmp1 * tmp2
    tmp4 = tmp3.to(tl.float32)
    tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp4, None)
def get_args():
    arg_0 = rand_strided((128, 960, 14, 14), (188160, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 240, 14, 14), (47040, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused_split_with_sizes_185.run(*args, 6021120, grid=grid(6021120), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused_split_with_sizes_185.benchmark_all_configs(*args, 6021120, grid=grid(6021120))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/nv/cnv3es7i7p4naxsxsc56c2w7slsaxha5zh6gcrrgizefpo7zps5s.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_313
#
# Elementwise fp32 -> fp16 cast of an 11760-element tensor; get_args shows a
# (240, 1, 7, 7) weight (shape consistent with a depthwise 7x7 conv weight).
triton_poi_fused__to_copy_186 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[16384], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_186(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 11760
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
    arg_0 = rand_strided((240, 1, 7, 7), (49, 49, 7, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((240, 1, 7, 7), (49, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_186.run(*args, 11760, grid=grid(11760), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_186.benchmark_all_configs(*args, 11760, grid=grid(11760))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/3q/c3qkjjd4hxw6ylyfib2kl5k6u5atp2kcpplda3gzr6acmswhzitj.py
# Original ATen: aten.split_with_sizes
# aten.split_with_sizes => getitem_340
#
# Third branch of the channel split (cf. _183/_185): reads channels [480, 720)
# of the (128, 960, 14, 14) fp16 tensor — element offset 94080 = 480 * 196 —
# and applies SiLU (x * sigmoid(x)) in fp32 before storing the fp16
# (128, 240, 14, 14) result.
triton_poi_fused_split_with_sizes_187 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[8388608], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_split_with_sizes_187(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 6021120
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex % 47040
    x1 = (xindex // 47040)
    x2 = xindex
    tmp0 = tl.load(in_ptr0 + (94080 + x0 + (188160*x1)), None).to(tl.float32)
    tmp1 = tmp0.to(tl.float32)
    tmp2 = tl.sigmoid(tmp1)
    tmp3 = tmp1 * tmp2
    tmp4 = tmp3.to(tl.float32)
    tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp4, None)
def get_args():
    arg_0 = rand_strided((128, 960, 14, 14), (188160, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 240, 14, 14), (47040, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused_split_with_sizes_187.run(*args, 6021120, grid=grid(6021120), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused_split_with_sizes_187.benchmark_all_configs(*args, 6021120, grid=grid(6021120))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
| # kernel path: /tmp/torchinductor_shunting/l7/cl7v5hh5jck72nsc2u3t2ieraqxarpadxdpy5nt2hkbbqrgcinn7.py | |
| # Original ATen: aten._to_copy | |
| # aten._to_copy => convert_element_type_314 | |
# Inductor-generated pointwise kernel for aten._to_copy: casts a 19440-element
# fp32 tensor (a (240, 1, 9, 9) depthwise-conv-weight-shaped buffer per
# get_args) to fp16, element for element.
triton_poi_fused__to_copy_188 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[32768], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_188(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 19440  # 240 * 81 elements
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask)
    tmp1 = tmp0.to(tl.float32)  # down-cast; output pointer is fp16 per the signature
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)


def get_args():
    # Representative input/output buffers for standalone benchmarking.
    arg_0 = rand_strided((240, 1, 9, 9), (81, 81, 9, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((240, 1, 9, 9), (81, 81, 9, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_188.run(*args, 19440, grid=grid(19440), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_188.benchmark_all_configs(*args, 19440, grid=grid(19440))


if __name__ == '__main__':
    # Standalone benchmark: report latency and achieved memory bandwidth.
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
| # kernel path: /tmp/torchinductor_shunting/xm/cxm7drxzaido67j4aqf4vqqzkyw7ffgx32ygj3ddi3kmzigoauhg.py | |
| # Original ATen: aten.split_with_sizes | |
| # aten.split_with_sizes => getitem_345 | |
# Inductor-generated pointwise kernel for aten.split_with_sizes fused with a
# SiLU activation: reads the 240-channel slice starting at channel 720
# (element offset 141120 = 720*196) of a (128, 960, 14, 14) fp16 input and
# writes x * sigmoid(x) into a contiguous (128, 240, 14, 14) fp16 output.
# Identical to triton_poi_fused_split_with_sizes_187 except for the slice offset.
triton_poi_fused_split_with_sizes_189 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[8388608], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_split_with_sizes_189(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 6021120  # 128 * 240 * 14 * 14 output elements
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel  # computed but unused: loads/stores below pass mask=None (generated code)
    x0 = xindex % 47040   # offset within one batch's 240*196-element slice
    x1 = (xindex // 47040)  # batch index
    x2 = xindex
    # 141120 = slice start (channel 720 * 196); 188160 = input batch stride (960*196)
    tmp0 = tl.load(in_ptr0 + (141120 + x0 + (188160*x1)), None).to(tl.float32)
    tmp1 = tmp0.to(tl.float32)
    tmp2 = tl.sigmoid(tmp1)
    tmp3 = tmp1 * tmp2  # SiLU: x * sigmoid(x)
    tmp4 = tmp3.to(tl.float32)
    tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp4, None)


def get_args():
    # Representative input/output buffers for standalone benchmarking.
    arg_0 = rand_strided((128, 960, 14, 14), (188160, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 240, 14, 14), (47040, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused_split_with_sizes_189.run(*args, 6021120, grid=grid(6021120), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused_split_with_sizes_189.benchmark_all_configs(*args, 6021120, grid=grid(6021120))


if __name__ == '__main__':
    # Standalone benchmark: report latency and achieved memory bandwidth.
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
| # kernel path: /tmp/torchinductor_shunting/ny/cnyr36xeex2s2grdckurzoliwfqkdakrsvrmgx5fd6rzjncjxedb.py | |
| # Original ATen: aten.cat | |
| # aten.cat => cat_34 | |
# Inductor-generated pointwise kernel implementing one branch of aten.cat:
# copies a contiguous (128, 240, 7, 7) fp16 tensor into the first
# 240-channel region of each batch of a (128, 960, 7, 7) fp16 concat output
# (input batch stride 11760 = 240*49; output batch stride 47040 = 960*49,
# channel offset 0).
triton_poi_fused_cat_190 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[2097152], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_cat_190(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 1505280  # 128 * 240 * 7 * 7 elements copied
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel  # computed but unused: load/store below pass mask=None (generated code)
    x2 = xindex
    x0 = xindex % 11760   # offset within one input batch slice (240*49)
    x1 = (xindex // 11760)  # batch index
    tmp0 = tl.load(in_ptr0 + (x2), None).to(tl.float32)
    # Scatter into the larger concat buffer: 47040 is the output batch stride.
    tl.store(out_ptr0 + (x0 + (47040*x1) + tl.zeros([XBLOCK], tl.int32)), tmp0, None)


def get_args():
    # Representative input/output buffers for standalone benchmarking.
    arg_0 = rand_strided((128, 240, 7, 7), (11760, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 240, 7, 7), (47040, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused_cat_190.run(*args, 1505280, grid=grid(1505280), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused_cat_190.benchmark_all_configs(*args, 1505280, grid=grid(1505280))


if __name__ == '__main__':
    # Standalone benchmark: report latency and achieved memory bandwidth.
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
| # kernel path: /tmp/torchinductor_shunting/z6/cz6ieqk3sfzltam3yeijfwdqrtao7w4zqdsqj32cpu6zoxg3irdq.py | |
| # Original ATen: aten._native_batch_norm_legit_functional | |
| # aten._native_batch_norm_legit_functional => add_242, add_243, add_244, convert_element_type_315, mul_372, mul_373, mul_374, mul_375, mul_376, rsqrt_46, squeeze_139, var_mean_46 | |
# Inductor-generated reduction kernel computing batch-norm statistics for a
# (128, 960, 7, 7) fp16 activation: one program per channel (xnumel=960),
# reducing over 6272 = 128*49 batch*spatial elements in two passes.
# Pass 1 accumulates the per-channel sum -> mean (stored via in_out_ptr0) and
# the momentum-updated running mean (out_ptr0).  Pass 2 accumulates the sum of
# squared deviations (out_ptr1), then derives rsqrt(var + 1e-5) (out_ptr2,
# the saved invstd) and the running variance updated with the
# Bessel-corrected variance (out_ptr3); 1.0001594642002871 = 6272/6271.
# Momentum is 0.1: new_running = 0.1 * batch_stat + 0.9 * old_running.
triton_red_fused__native_batch_norm_legit_functional_191 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@reduction(
    size_hints=[1024, 8192],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*fp32', 1: '*fp16', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: 'i32', 9: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_191(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, out_ptr0, out_ptr1, out_ptr2, out_ptr3, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
    xnumel = 960   # channels
    rnumel = 6272  # 128 batches * 49 spatial positions per channel
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rbase = tl.arange(0, RBLOCK)[None, :]
    x0 = xindex
    # --- Pass 1: per-channel sum -> batch mean ---
    _tmp2 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
    for roffset in range(0, rnumel, RBLOCK):
        rindex = roffset + rbase
        rmask = rindex < rnumel
        r1 = rindex % 49        # spatial index
        r2 = (rindex // 49)     # batch index
        # 49 = per-channel spatial block; 47040 = per-batch stride (960*49)
        tmp0 = tl.load(in_ptr0 + (r1 + (49*x0) + (47040*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp1 = tmp0.to(tl.float32)
        _tmp2 = tl.where(rmask & xmask, _tmp2 + tmp1, _tmp2)
    tmp2 = tl.sum(_tmp2, 1)[:, None]
    tmp7 = tl.load(in_ptr1 + (x0), xmask)  # old running mean
    tmp3 = 6272.0
    tmp4 = tmp2 / tmp3  # batch mean
    tmp5 = 0.1          # momentum
    tmp6 = tmp4 * tmp5
    tmp8 = 0.9
    tmp9 = tmp7 * tmp8
    tmp10 = tmp6 + tmp9  # updated running mean
    tl.store(in_out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp4, xmask)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp10, xmask)
    # --- Pass 2: per-channel sum of squared deviations -> variance ---
    _tmp15 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
    for roffset in range(0, rnumel, RBLOCK):
        rindex = roffset + rbase
        rmask = rindex < rnumel
        r1 = rindex % 49
        r2 = (rindex // 49)
        tmp11 = tl.load(in_ptr0 + (r1 + (49*x0) + (47040*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp12 = tmp11.to(tl.float32)
        tmp13 = tmp12 - tmp4
        tmp14 = tmp13 * tmp13
        _tmp15 = tl.where(rmask & xmask, _tmp15 + tmp14, _tmp15)
    tmp15 = tl.sum(_tmp15, 1)[:, None]
    tl.store(out_ptr1 + x0, tmp15, xmask)
    tmp25 = tl.load(in_ptr2 + (x0), xmask)  # old running variance
    tmp16 = 6272.0
    tmp17 = tmp15 / tmp16  # biased batch variance
    tmp18 = 1e-05          # eps
    tmp19 = tmp17 + tmp18
    tmp20 = tl.math.rsqrt(tmp19)  # saved invstd for the normalize step
    tmp21 = 1.0001594642002871    # Bessel correction 6272/6271
    tmp22 = tmp17 * tmp21         # unbiased variance
    tmp23 = 0.1                   # momentum
    tmp24 = tmp22 * tmp23
    tmp26 = 0.9
    tmp27 = tmp25 * tmp26
    tmp28 = tmp24 + tmp27  # updated running variance
    tl.store(out_ptr2 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp20, xmask)
    tl.store(out_ptr3 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp28, xmask)


def get_args():
    # Representative buffers for standalone benchmarking (arg_0 is the
    # mutated in/out mean buffer, arg_1 the activation being reduced).
    arg_0 = rand_strided((1, 960, 1, 1), (960, 1, 960, 960), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((128, 960, 7, 7), (47040, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    arg_2 = rand_strided((960,), (1,), device='cuda:0', dtype=torch.float32)
    arg_3 = rand_strided((960,), (1,), device='cuda:0', dtype=torch.float32)
    arg_4 = rand_strided((960,), (1,), device='cuda:0', dtype=torch.float32)
    arg_5 = rand_strided((1, 960, 1, 1), (960, 1, 960, 960), device='cuda:0', dtype=torch.float32)
    arg_6 = rand_strided((960,), (1,), device='cuda:0', dtype=torch.float32)
    arg_7 = rand_strided((960,), (1,), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_red_fused__native_batch_norm_legit_functional_191.run(*args, 960, 6272, grid=grid(960), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_red_fused__native_batch_norm_legit_functional_191.benchmark_all_configs(*args, 960, 6272, grid=grid(960))


if __name__ == '__main__':
    # Standalone benchmark: report latency and achieved memory bandwidth.
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
| # kernel path: /tmp/torchinductor_shunting/6y/c6ybh5xdfuwq6wdolxlmurgpxtgtn6q5shvsfwt5gf4rdpz7tas7.py | |
| # Original ATen: aten._native_batch_norm_legit_functional, aten.mean, aten.silu | |
| # aten._native_batch_norm_legit_functional => add_242, add_245, convert_element_type_315, convert_element_type_316, mul_371, mul_377, rsqrt_46, sub_46, var_mean_46 | |
| # aten.mean => mean_12 | |
| # aten.silu => convert_element_type_317, convert_element_type_318, mul_378, sigmoid_49 | |
| triton_per_fused__native_batch_norm_legit_functional_mean_silu_192 = async_compile.triton(''' | |
| import triton | |
| import triton.language as tl | |
| from torch._inductor.ir import ReductionHint | |
| from torch._inductor.ir import TileHint | |
| from torch._inductor.triton_heuristics import persistent_reduction | |
| from torch._inductor.utils import instance_descriptor | |
| from torch._dynamo.testing import rand_strided | |
| from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
| import torch | |
| from torch._inductor.triton_heuristics import grid | |