Created
April 10, 2023 20:17
-
-
Save shunting314/48efc83b12ec3ead950052e4a0220b10 to your computer and use it in GitHub Desktop.
This file has been truncated, but you can view the full file.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# TorchInductor-generated prelude for a standalone Triton kernel benchmark file.
# Sets up the async Triton compiler (`async_compile`) and the helpers that the
# per-kernel sources below rely on: aten op namespace, stride assertions used by
# dynamo guards, and the raw CUDA stream lookup for kernel launches.
from ctypes import c_void_p, c_long
import torch
import math
import random
import os
import tempfile
from torch._inductor.utils import maybe_profile
from torch import empty_strided, as_strided, device
from torch._inductor.codecache import AsyncCompile
from torch._inductor.select_algorithm import extern_kernels

# Shortcut to the ATen operator namespace used by generated call sites.
aten = torch.ops.aten
# Guard helper that validates tensor size/stride assumptions at runtime.
assert_size_stride = torch._C._dynamo.guards.assert_size_stride
# Compiles the Triton source strings below asynchronously / in parallel.
async_compile = AsyncCompile()

import triton
import triton.language as tl
from torch._inductor.triton_heuristics import grid, start_graph, end_graph
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
# kernel path: /tmp/torchinductor_shunting/qd/cqdbopslmbarnc23s33ybyghfqf2olis2okrueqdtwzk66npkoci.py
# Original ATen: aten._to_copy, aten.sum
# aten._to_copy => convert_element_type_403
# aten.sum => sum_1
#
# Reduction kernel: reads a (128, 1000) fp16 tensor (see get_args) and sums
# over the 128-row dimension (in_ptr0 + (x0 + 1000*r1)), writing a (1000,)
# fp32 result — an outer-dimension sum fused with an fp16->fp32 conversion.
# The source string below is compiled by Triton; its content is generated and
# left untouched.
triton_red_fused__to_copy_sum_0 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@reduction(
    size_hints=[1024, 128],
    reduction_hint=ReductionHint.OUTER,
    filename=__file__,
    meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32', 3: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 3), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__to_copy_sum_0(in_ptr0, out_ptr1, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
    xnumel = 1000
    rnumel = 128
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rbase = tl.arange(0, RBLOCK)[None, :]
    x0 = xindex
    _tmp1 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
    for roffset in range(0, rnumel, RBLOCK):
        rindex = roffset + rbase
        rmask = rindex < rnumel
        r1 = rindex
        tmp0 = tl.load(in_ptr0 + (x0 + (1000*r1)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        _tmp1 = tl.where(rmask & xmask, _tmp1 + tmp0, _tmp1)
    tmp1 = tl.sum(_tmp1, 1)[:, None]
    tmp2 = tmp1.to(tl.float32)
    tl.store(out_ptr1 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp2, xmask)
def get_args():
    arg_0 = rand_strided((128, 1000), (1000, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((1000,), (1,), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_red_fused__to_copy_sum_0.run(*args, 1000, 128, grid=grid(1000), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_red_fused__to_copy_sum_0.benchmark_all_configs(*args, 1000, 128, grid=grid(1000))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/v4/cv4lm4gdpfxwjwbfwkrkjjepglxb33ctw5byvwc2ujd3ynyqgfob.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_402
#
# Pointwise kernel: elementwise fp16 -> fp32 copy of a contiguous
# (1000, 1536) tensor (1536000 elements; see get_args). Loads use no mask
# because the element count is divisible by the block sizes tried.
triton_poi_fused__to_copy_1 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[2097152], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_1(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 1536000
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), None).to(tl.float32)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, None)
def get_args():
    arg_0 = rand_strided((1000, 1536), (1536, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((1000, 1536), (1536, 1), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_1.run(*args, 1536000, grid=grid(1536000), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_1.benchmark_all_configs(*args, 1536000, grid=grid(1536000))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/gw/cgw4eilm5jaevvb4jlwkfuyolkwuloinsrcuxsoiruu5pucw2qoo.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.div, aten.native_batch_norm_backward, aten.threshold_backward
# aten._native_batch_norm_legit_functional => convert_element_type_398
# aten.div => div
# aten.native_batch_norm_backward => convert_element_type_404, mul_470, mul_478, sub_58, sum_2, sum_3
# aten.threshold_backward => scalar_tensor, where
#
# Per-channel reduction over 6272 = 128 batch x 49 spatial positions of a
# (128, 1536, 7, 7) activation (see get_args / the r1, r2 index split).
# For each of the 1536 channels it:
#   - reconstructs the incoming gradient as where(relu_mask, 0, grad/49)
#     (threshold_backward fused with the avg-pool div by 49),
#   - accumulates its sum (out_ptr0) and its sum weighted by (x - in_ptr3)
#     (out_ptr1; in_ptr3 is broadcast per channel — presumably the saved
#     batch mean, TODO confirm against caller),
#   - writes out_ptr1 * in_ptr4 to out_ptr2 (in_ptr4 is per-channel; looks
#     like the saved invstd — verify against the wrapper).
triton_red_fused__native_batch_norm_legit_functional_div_native_batch_norm_backward_threshold_backward_2 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@reduction(
    size_hints=[2048, 8192],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*i1', 1: '*fp16', 2: '*fp16', 3: '*fp32', 4: '*fp32', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: 'i32', 9: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_div_native_batch_norm_backward_threshold_backward_2(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, out_ptr0, out_ptr1, out_ptr2, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
    xnumel = 1536
    rnumel = 6272
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rbase = tl.arange(0, RBLOCK)[None, :]
    x0 = xindex
    _tmp7 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
    tmp10 = tl.load(in_ptr3 + (x0), xmask)
    _tmp13 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
    for roffset in range(0, rnumel, RBLOCK):
        rindex = roffset + rbase
        rmask = rindex < rnumel
        r1 = rindex % 49
        r2 = (rindex // 49)
        tmp0 = tl.load(in_ptr0 + (r1 + (49*x0) + (75264*r2)), rmask & xmask, eviction_policy='evict_last')
        tmp2 = tl.load(in_ptr1 + (x0 + (1536*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp8 = tl.load(in_ptr2 + (r1 + (49*x0) + (75264*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp1 = 0.0
        tmp3 = 49.0
        tmp4 = tmp2 / tmp3
        tmp5 = tl.where(tmp0, tmp1, tmp4)
        tmp6 = tmp5.to(tl.float32)
        _tmp7 = tl.where(rmask & xmask, _tmp7 + tmp6, _tmp7)
        tmp9 = tmp8.to(tl.float32)
        tmp11 = tmp9 - tmp10
        tmp12 = tmp6 * tmp11
        _tmp13 = tl.where(rmask & xmask, _tmp13 + tmp12, _tmp13)
    tmp7 = tl.sum(_tmp7, 1)[:, None]
    tl.store(out_ptr0 + x0, tmp7, xmask)
    tmp13 = tl.sum(_tmp13, 1)[:, None]
    tl.store(out_ptr1 + x0, tmp13, xmask)
    tmp14 = tl.load(in_ptr4 + (x0), xmask)
    tmp15 = tmp13 * tmp14
    tl.store(out_ptr2 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp15, xmask)
def get_args():
    arg_0 = rand_strided((128, 1536, 7, 7), (75264, 49, 7, 1), device='cuda:0', dtype=torch.bool)
    arg_1 = rand_strided((128, 1536), (1536, 1), device='cuda:0', dtype=torch.float16)
    arg_2 = rand_strided((128, 1536, 7, 7), (75264, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    arg_3 = rand_strided((1, 1536, 1, 1), (1536, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    arg_4 = rand_strided((1536,), (1,), device='cuda:0', dtype=torch.float32)
    arg_5 = rand_strided((1536,), (1,), device='cuda:0', dtype=torch.float32)
    arg_6 = rand_strided((1536,), (1,), device='cuda:0', dtype=torch.float32)
    arg_7 = rand_strided((1536,), (1,), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_red_fused__native_batch_norm_legit_functional_div_native_batch_norm_backward_threshold_backward_2.run(*args, 1536, 6272, grid=grid(1536), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_red_fused__native_batch_norm_legit_functional_div_native_batch_norm_backward_threshold_backward_2.benchmark_all_configs(*args, 1536, 6272, grid=grid(1536))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/kf/ckfaohmrzufzdx2fsmml5navfg2efvvxodj5sf5pujefshnggoot.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.convolution_backward, aten.div, aten.native_batch_norm_backward, aten.threshold_backward
# aten._native_batch_norm_legit_functional => convert_element_type_398
# aten.convolution_backward => convolution_backward
# aten.div => div
# aten.native_batch_norm_backward => convert_element_type_404, convert_element_type_406, mul_476, mul_477, sub_58, sub_60, sub_61
# aten.threshold_backward => scalar_tensor, where
#
# Pointwise companion to kernel _2: for every element of the
# (128, 1536, 7, 7) tensor (9633792 elements) it recomputes
# where(relu_mask, 0, grad/49) and applies the batch-norm backward
# input-gradient formula using the per-channel sums produced upstream
# (in_ptr3..in_ptr7, indexed by x1 = channel). 0.00015943877551020407 is
# 1/6272, i.e. 1/(N*H*W) for this reduction. Output is fp16.
triton_poi_fused__native_batch_norm_legit_functional_convolution_backward_div_native_batch_norm_backward_threshold_backward_3 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[16777216], filename=__file__, meta={'signature': {0: '*i1', 1: '*fp16', 2: '*fp16', 3: '*fp32', 4: '*fp32', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: '*fp16', 9: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9), equal_to_1=())]})
@triton.jit
def triton_poi_fused__native_batch_norm_legit_functional_convolution_backward_div_native_batch_norm_backward_threshold_backward_3(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, in_ptr6, in_ptr7, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 9633792
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x3 = xindex
    x4 = (xindex // 49)
    x1 = (xindex // 49) % 1536
    tmp0 = tl.load(in_ptr0 + (x3), None)
    tmp2 = tl.load(in_ptr1 + (x4), None).to(tl.float32)
    tmp7 = tl.load(in_ptr2 + (x3), None).to(tl.float32)
    tmp9 = tl.load(in_ptr3 + (x1), None)
    tmp11 = tl.load(in_ptr4 + (x1), None)
    tmp14 = tl.load(in_ptr5 + (x1), None)
    tmp19 = tl.load(in_ptr6 + (x1), None)
    tmp22 = tl.load(in_ptr7 + (x1), None)
    tmp1 = 0.0
    tmp3 = 49.0
    tmp4 = tmp2 / tmp3
    tmp5 = tl.where(tmp0, tmp1, tmp4)
    tmp6 = tmp5.to(tl.float32)
    tmp8 = tmp7.to(tl.float32)
    tmp10 = tmp8 - tmp9
    tmp12 = 0.00015943877551020407
    tmp13 = tmp11 * tmp12
    tmp15 = tmp14 * tmp14
    tmp16 = tmp13 * tmp15
    tmp17 = tmp10 * tmp16
    tmp18 = tmp6 - tmp17
    tmp20 = tmp19 * tmp12
    tmp21 = tmp18 - tmp20
    tmp23 = tmp14 * tmp22
    tmp24 = tmp21 * tmp23
    tmp25 = tmp24.to(tl.float32)
    tl.store(out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp25, None)
def get_args():
    arg_0 = rand_strided((128, 1536, 7, 7), (75264, 49, 7, 1), device='cuda:0', dtype=torch.bool)
    arg_1 = rand_strided((128, 1536), (1536, 1), device='cuda:0', dtype=torch.float16)
    arg_2 = rand_strided((128, 1536, 7, 7), (75264, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    arg_3 = rand_strided((1, 1536, 1, 1), (1536, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    arg_4 = rand_strided((1536,), (1,), device='cuda:0', dtype=torch.float32)
    arg_5 = rand_strided((1536,), (1,), device='cuda:0', dtype=torch.float32)
    arg_6 = rand_strided((1536,), (1,), device='cuda:0', dtype=torch.float32)
    arg_7 = rand_strided((1536,), (1,), device='cuda:0', dtype=torch.float32)
    arg_8 = rand_strided((128, 1536, 7, 7), (75264, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7, arg_8,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__native_batch_norm_legit_functional_convolution_backward_div_native_batch_norm_backward_threshold_backward_3.run(*args, 9633792, grid=grid(9633792), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__native_batch_norm_legit_functional_convolution_backward_div_native_batch_norm_backward_threshold_backward_3.benchmark_all_configs(*args, 9633792, grid=grid(9633792))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/on/consb5x4mndq63g7yk4eommoi2zfnpkemgbkcrodivgrodwe4iek.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_407
#
# Pointwise kernel: fp16 -> fp32 copy of a contiguous (1536, 264, 1, 1)
# weight tensor (405504 elements; see get_args).
triton_poi_fused__to_copy_4 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[524288], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_4(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 405504
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), None).to(tl.float32)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, None)
def get_args():
    arg_0 = rand_strided((1536, 264, 1, 1), (264, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((1536, 264, 1, 1), (264, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_4.run(*args, 405504, grid=grid(405504), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_4.benchmark_all_configs(*args, 405504, grid=grid(405504))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/aw/cawn5o3lch25tbuhghexsvhkzdksvz4ofqrpjmzgs2y6ldb5vypm.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.native_batch_norm_backward
# aten._native_batch_norm_legit_functional => convert_element_type_395
# aten.native_batch_norm_backward => convert_element_type_408, mul_479, mul_487, sub_62, sum_4, sum_5
#
# Per-channel batch-norm-backward reduction over 6272 = 128 x 49 positions
# of a (128, 264, 7, 7) pair of fp16 tensors (grad in in_ptr0, activations
# in in_ptr1; see get_args). Writes grad-sum (out_ptr0),
# sum(grad * (x - in_ptr2)) (out_ptr1; in_ptr2 is per-channel — presumably
# the saved mean, TODO confirm), and out_ptr1 * in_ptr3 (out_ptr2).
triton_red_fused__native_batch_norm_legit_functional_native_batch_norm_backward_5 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@reduction(
    size_hints=[512, 8192],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp32', 6: '*fp32', 7: 'i32', 8: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 8), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_native_batch_norm_backward_5(in_ptr0, in_ptr1, in_ptr2, in_ptr3, out_ptr0, out_ptr1, out_ptr2, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
    xnumel = 264
    rnumel = 6272
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rbase = tl.arange(0, RBLOCK)[None, :]
    x0 = xindex
    _tmp2 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
    tmp5 = tl.load(in_ptr2 + (x0), xmask)
    _tmp8 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
    for roffset in range(0, rnumel, RBLOCK):
        rindex = roffset + rbase
        rmask = rindex < rnumel
        r1 = rindex % 49
        r2 = (rindex // 49)
        tmp0 = tl.load(in_ptr0 + (r1 + (49*x0) + (12936*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp3 = tl.load(in_ptr1 + (r1 + (49*x0) + (12936*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp1 = tmp0.to(tl.float32)
        _tmp2 = tl.where(rmask & xmask, _tmp2 + tmp1, _tmp2)
        tmp4 = tmp3.to(tl.float32)
        tmp6 = tmp4 - tmp5
        tmp7 = tmp1 * tmp6
        _tmp8 = tl.where(rmask & xmask, _tmp8 + tmp7, _tmp8)
    tmp2 = tl.sum(_tmp2, 1)[:, None]
    tl.store(out_ptr0 + x0, tmp2, xmask)
    tmp8 = tl.sum(_tmp8, 1)[:, None]
    tl.store(out_ptr1 + x0, tmp8, xmask)
    tmp9 = tl.load(in_ptr3 + (x0), xmask)
    tmp10 = tmp8 * tmp9
    tl.store(out_ptr2 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp10, xmask)
def get_args():
    arg_0 = rand_strided((128, 264, 7, 7), (12936, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 264, 7, 7), (12936, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    arg_2 = rand_strided((1, 264, 1, 1), (264, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    arg_3 = rand_strided((264,), (1,), device='cuda:0', dtype=torch.float32)
    arg_4 = rand_strided((264,), (1,), device='cuda:0', dtype=torch.float32)
    arg_5 = rand_strided((264,), (1,), device='cuda:0', dtype=torch.float32)
    arg_6 = rand_strided((264,), (1,), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_red_fused__native_batch_norm_legit_functional_native_batch_norm_backward_5.run(*args, 264, 6272, grid=grid(264), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_red_fused__native_batch_norm_legit_functional_native_batch_norm_backward_5.benchmark_all_configs(*args, 264, 6272, grid=grid(264))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/rk/crkzi7f3ev6knrr3kmx5lxzwin43giwhtaru4yx4tnkfj3r3ngqj.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.native_batch_norm_backward
# aten._native_batch_norm_legit_functional => convert_element_type_395
# aten.native_batch_norm_backward => convert_element_type_408, convert_element_type_410, mul_485, mul_486, sub_62, sub_64, sub_65
#
# Pointwise companion to kernel _5: applies the batch-norm backward
# input-gradient formula elementwise over the (128, 264, 7, 7) tensor
# (1655808 elements), using the per-channel statistics/sums in
# in_ptr2..in_ptr6 (indexed by x1 = channel). 0.00015943877551020407 is
# 1/6272 = 1/(N*H*W). Output is fp16.
triton_poi_fused__native_batch_norm_legit_functional_native_batch_norm_backward_6 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[2097152], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp32', 6: '*fp32', 7: '*fp16', 8: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8), equal_to_1=())]})
@triton.jit
def triton_poi_fused__native_batch_norm_legit_functional_native_batch_norm_backward_6(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, in_ptr6, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 1655808
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x3 = xindex
    x1 = (xindex // 49) % 264
    tmp0 = tl.load(in_ptr0 + (x3), xmask).to(tl.float32)
    tmp2 = tl.load(in_ptr1 + (x3), xmask).to(tl.float32)
    tmp4 = tl.load(in_ptr2 + (x1), xmask)
    tmp6 = tl.load(in_ptr3 + (x1), xmask)
    tmp9 = tl.load(in_ptr4 + (x1), xmask)
    tmp14 = tl.load(in_ptr5 + (x1), xmask)
    tmp17 = tl.load(in_ptr6 + (x1), xmask)
    tmp1 = tmp0.to(tl.float32)
    tmp3 = tmp2.to(tl.float32)
    tmp5 = tmp3 - tmp4
    tmp7 = 0.00015943877551020407
    tmp8 = tmp6 * tmp7
    tmp10 = tmp9 * tmp9
    tmp11 = tmp8 * tmp10
    tmp12 = tmp5 * tmp11
    tmp13 = tmp1 - tmp12
    tmp15 = tmp14 * tmp7
    tmp16 = tmp13 - tmp15
    tmp18 = tmp9 * tmp17
    tmp19 = tmp16 * tmp18
    tmp20 = tmp19.to(tl.float32)
    tl.store(out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp20, xmask)
def get_args():
    arg_0 = rand_strided((128, 264, 7, 7), (12936, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 264, 7, 7), (12936, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    arg_2 = rand_strided((1, 264, 1, 1), (264, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    arg_3 = rand_strided((264,), (1,), device='cuda:0', dtype=torch.float32)
    arg_4 = rand_strided((264,), (1,), device='cuda:0', dtype=torch.float32)
    arg_5 = rand_strided((264,), (1,), device='cuda:0', dtype=torch.float32)
    arg_6 = rand_strided((264,), (1,), device='cuda:0', dtype=torch.float32)
    arg_7 = rand_strided((128, 264, 7, 7), (12936, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__native_batch_norm_legit_functional_native_batch_norm_backward_6.run(*args, 1655808, grid=grid(1655808), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__native_batch_norm_legit_functional_native_batch_norm_backward_6.benchmark_all_configs(*args, 1655808, grid=grid(1655808))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/sf/csf7df3ryvp5mqalyhiqr2oqixspuwgupmdvdghhbdv4hoavgvib.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_411
#
# Pointwise kernel: fp16 -> fp32 copy of a contiguous (132, 792, 1, 1)
# weight tensor (104544 elements; masked loads/stores since the count is
# not divisible by the block size).
triton_poi_fused__to_copy_7 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[131072], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_7(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 104544
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
    arg_0 = rand_strided((132, 792, 1, 1), (792, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((132, 792, 1, 1), (792, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_7.run(*args, 104544, grid=grid(104544), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_7.benchmark_all_configs(*args, 104544, grid=grid(104544))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/ph/cphopjqs73kagdjs5yonh2gxou75xcoyv4vg5cyr3nkxcgkuz4lp.py
# Original ATen: aten.cat
# aten.cat => cat_41
#
# Pointwise cat kernel: copies a contiguous (128, 792, 7, 7) fp16 tensor
# (per-sample size 38808) into an output whose per-sample stride is
# 77616 = 2 * 38808 — i.e. this fills one half of a channel-concatenated
# buffer (the other half is written by a sibling kernel, cat_9 below).
triton_poi_fused_cat_8 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[8388608], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_cat_8(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 4967424
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x2 = xindex
    x0 = xindex % 38808
    x1 = (xindex // 38808)
    tmp0 = tl.load(in_ptr0 + (x2), xmask).to(tl.float32)
    tl.store(out_ptr0 + (x0 + (77616*x1) + tl.zeros([XBLOCK], tl.int32)), tmp0, xmask)
def get_args():
    arg_0 = rand_strided((128, 792, 7, 7), (38808, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 792, 7, 7), (77616, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused_cat_8.run(*args, 4967424, grid=grid(4967424), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused_cat_8.benchmark_all_configs(*args, 4967424, grid=grid(4967424))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/cd/ccdy3uu6zwicm5gp4nqvsu7scphal4x56e6djga5sd2lge4kiyge.py
# Original ATen: aten.cat
# aten.cat => cat_41
#
# Second branch of the same two-way concat (cat_41): the index math is
# identical to triton_poi_fused_cat_8, but the config omits out_ptr0 from
# divisible_by_16 — presumably the caller passes the output pointer already
# offset by 38808 fp16 elements so this fills the second half of each
# sample slot; confirm against the (not visible) wrapper call site.
triton_poi_fused_cat_9 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[8388608], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_cat_9(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 4967424
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x2 = xindex
    x0 = xindex % 38808
    x1 = (xindex // 38808)
    tmp0 = tl.load(in_ptr0 + (x2), xmask).to(tl.float32)
    tl.store(out_ptr0 + (x0 + (77616*x1) + tl.zeros([XBLOCK], tl.int32)), tmp0, xmask)
def get_args():
    arg_0 = rand_strided((128, 792, 7, 7), (38808, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 792, 7, 7), (77616, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused_cat_9.run(*args, 4967424, grid=grid(4967424), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused_cat_9.benchmark_all_configs(*args, 4967424, grid=grid(4967424))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/nt/cnt4hzkf7uueins7in5idjrq44bsdsgn6e67gfrc7gdtwfyotkdg.py
# Original ATen: aten.mul, aten.sigmoid, aten.sigmoid_backward, aten.silu, aten.sum
# aten.mul => mul_488
# aten.sigmoid => sigmoid_63
# aten.sigmoid_backward => convert_element_type_413, convert_element_type_414, convert_element_type_415, mul_490, mul_491, sub_66
# aten.silu => convert_element_type_385, convert_element_type_386, mul_453, sigmoid_61
# aten.sum => sum_6
#
# Persistent reduction over the 49 (= 7x7) spatial positions of each
# (sample, channel) pair: xnumel = 128 * 1584 = 202752, rnumel = 49
# (RBLOCK = 64 covers it in one pass).  Computes sum_r(grad * silu(x))
# in fp32, then multiplies by the sigmoid-backward factor s * (1 - s)
# of the per-channel gate read from in_ptr2 (shape (128, 1584, 1, 1));
# the fp16 result overwrites in_out_ptr0 in place.  This looks like the
# squeeze-and-excitation gate gradient — confirm against the caller.
triton_per_fused_mul_sigmoid_sigmoid_backward_silu_sum_10 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import persistent_reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@persistent_reduction(
    size_hints=[262144, 64],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp16', 4: 'i32', 5: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4), equal_to_1=())]}
)
@triton.jit
def triton_per_fused_mul_sigmoid_sigmoid_backward_silu_sum_10(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, xnumel, rnumel, XBLOCK : tl.constexpr):
    xnumel = 202752
    rnumel = 49
    RBLOCK: tl.constexpr = 64
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rindex = tl.arange(0, RBLOCK)[None, :]
    rmask = rindex < rnumel
    r1 = rindex
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (r1 + (49*x0)), rmask, other=0).to(tl.float32)
    tmp1 = tl.load(in_ptr1 + (r1 + (49*x0)), rmask, other=0).to(tl.float32)
    tmp11 = tl.load(in_ptr2 + (x0), None).to(tl.float32)
    tmp2 = tmp1.to(tl.float32)
    tmp3 = tl.sigmoid(tmp2)
    tmp4 = tmp2 * tmp3
    tmp5 = tmp4.to(tl.float32)
    tmp6 = tmp0 * tmp5
    tmp8 = tl.where(rmask, tmp6, 0)
    tmp9 = tl.sum(tmp8, 1)[:, None]
    tmp10 = tmp9.to(tl.float32)
    tmp12 = tl.sigmoid(tmp11)
    tmp13 = tmp12.to(tl.float32)
    tmp14 = 1.0
    tmp15 = tmp14 - tmp13
    tmp16 = tmp13 * tmp15
    tmp17 = tmp10 * tmp16
    tmp18 = tmp17.to(tl.float32)
    tl.store(in_out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp18, None)
def get_args():
    arg_0 = rand_strided((128, 1584, 1, 1), (1584, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 1584, 7, 7), (77616, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    arg_2 = rand_strided((128, 1584, 7, 7), (77616, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    arg_3 = rand_strided((128, 1584, 1, 1), (1584, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1, arg_2, arg_3,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_per_fused_mul_sigmoid_sigmoid_backward_silu_sum_10.run(*args, 202752, 49, grid=grid(202752), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_per_fused_mul_sigmoid_sigmoid_backward_silu_sum_10.benchmark_all_configs(*args, 202752, 49, grid=grid(202752))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/wr/cwr445l7k4eo6gcjw7l27usycvmf5jtqzbzk4btgcg2wqemciphj.py
# Original ATen: aten._to_copy, aten.convolution_backward
# aten._to_copy => convert_element_type_417
# aten.convolution_backward => sum_7
#
# Bias-gradient style reduction for convolution_backward: sums a
# (128, 1584, 1, 1) fp16 tensor over the batch dimension (rnumel = 128,
# one persistent pass) and stores the 1584 per-channel totals as fp32.
triton_per_fused__to_copy_convolution_backward_11 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import persistent_reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@persistent_reduction(
    size_hints=[2048, 128],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32', 3: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3), equal_to_1=())]}
)
@triton.jit
def triton_per_fused__to_copy_convolution_backward_11(in_ptr0, out_ptr1, xnumel, rnumel, XBLOCK : tl.constexpr):
    xnumel = 1584
    rnumel = 128
    RBLOCK: tl.constexpr = 128
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rindex = tl.arange(0, RBLOCK)[None, :]
    rmask = rindex < rnumel
    r1 = rindex
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0 + (1584*r1)), rmask & xmask, other=0).to(tl.float32)
    tmp2 = tl.where(rmask & xmask, tmp0, 0)
    tmp3 = tl.sum(tmp2, 1)[:, None]
    tmp4 = tmp3.to(tl.float32)
    tl.store(out_ptr1 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp4, xmask)
def get_args():
    arg_0 = rand_strided((128, 1584, 1, 1), (1584, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((1584,), (1,), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_per_fused__to_copy_convolution_backward_11.run(*args, 1584, 128, grid=grid(1584), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_per_fused__to_copy_convolution_backward_11.benchmark_all_configs(*args, 1584, 128, grid=grid(1584))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/vy/cvylenxoapd6e7xfcegwowbaxf7tmjjits4rifowpwoo5xytchjc.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_416
#
# Plain dtype upcast (aten._to_copy): element-wise copy of
# 209088 = 1584 * 132 fp16 values into an fp32 buffer of identical
# (1584, 132, 1, 1) layout — no index remapping.
triton_poi_fused__to_copy_12 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[262144], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_12(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 209088
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
    arg_0 = rand_strided((1584, 132, 1, 1), (132, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((1584, 132, 1, 1), (132, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_12.run(*args, 209088, grid=grid(209088), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_12.benchmark_all_configs(*args, 209088, grid=grid(209088))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/ku/ckublkguiu5gw5vxkulgbwk62mp5fhr6syry2wrk56p3zdpzupzp.py
# Original ATen: aten.add, aten.clone, aten.fill, aten.mul, aten.sigmoid, aten.sub
# aten.add => add_304
# aten.clone => clone_47
# aten.fill => full_like
# aten.mul => mul_492, mul_493, mul_494
# aten.sigmoid => sigmoid_64
# aten.sub => sub_67
#
# In-place SiLU (x * sigmoid(x)) backward factor over 16896 = 128 * 132
# elements: multiplies in_out_ptr0 by sigmoid(x) * (1 + x * (1 - sigmoid(x))),
# the derivative of x*sigmoid(x), where x is read from in_ptr0.
triton_poi_fused_add_clone_fill_mul_sigmoid_sub_13 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[32768], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_add_clone_fill_mul_sigmoid_sub_13(in_out_ptr0, in_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 16896
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_out_ptr0 + (x0), xmask).to(tl.float32)
    tmp1 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
    tmp2 = tl.sigmoid(tmp1)
    tmp3 = 1.0
    tmp4 = tmp3 - tmp2
    tmp5 = tmp1 * tmp4
    tmp6 = tmp5 + tmp3
    tmp7 = tmp2 * tmp6
    tmp8 = tmp0 * tmp7
    tl.store(in_out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp8, xmask)
def get_args():
    arg_0 = rand_strided((128, 132, 1, 1), (132, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 132, 1, 1), (132, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused_add_clone_fill_mul_sigmoid_sub_13.run(*args, 16896, grid=grid(16896), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused_add_clone_fill_mul_sigmoid_sub_13.benchmark_all_configs(*args, 16896, grid=grid(16896))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/4b/c4bhcaxi2hruo5hil4xfrkdmmucojrb5nsitzxo2oy2a6qcdg3ti.py
# Original ATen: aten._to_copy, aten.convolution_backward
# aten._to_copy => convert_element_type_419
# aten.convolution_backward => sum_8
#
# Same pattern as triton_per_fused__to_copy_convolution_backward_11 but for
# 132 channels: sums a (128, 132, 1, 1) fp16 tensor over the batch
# dimension (rnumel = 128) and stores the per-channel totals as fp32.
triton_per_fused__to_copy_convolution_backward_14 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import persistent_reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@persistent_reduction(
    size_hints=[256, 128],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32', 3: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 3), equal_to_1=())]}
)
@triton.jit
def triton_per_fused__to_copy_convolution_backward_14(in_ptr0, out_ptr1, xnumel, rnumel, XBLOCK : tl.constexpr):
    xnumel = 132
    rnumel = 128
    RBLOCK: tl.constexpr = 128
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rindex = tl.arange(0, RBLOCK)[None, :]
    rmask = rindex < rnumel
    r1 = rindex
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0 + (132*r1)), rmask & xmask, other=0).to(tl.float32)
    tmp2 = tl.where(rmask & xmask, tmp0, 0)
    tmp3 = tl.sum(tmp2, 1)[:, None]
    tmp4 = tmp3.to(tl.float32)
    tl.store(out_ptr1 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp4, xmask)
def get_args():
    arg_0 = rand_strided((128, 132, 1, 1), (132, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((132,), (1,), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_per_fused__to_copy_convolution_backward_14.run(*args, 132, 128, grid=grid(132), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_per_fused__to_copy_convolution_backward_14.benchmark_all_configs(*args, 132, 128, grid=grid(132))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/mo/cmoc7y64jtpxuncjb73voe54cmkijbltrpgf5p7stvr4mny7avka.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_418
#
# Plain dtype upcast (aten._to_copy): element-wise copy of
# 209088 = 132 * 1584 fp16 values into an fp32 buffer of identical
# (132, 1584, 1, 1) layout — the transposed counterpart of kernel _12.
triton_poi_fused__to_copy_15 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[262144], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_15(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 209088
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
    arg_0 = rand_strided((132, 1584, 1, 1), (1584, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((132, 1584, 1, 1), (1584, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_15.run(*args, 209088, grid=grid(209088), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_15.benchmark_all_configs(*args, 209088, grid=grid(209088))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/et/cetgnuzvhwgtkuxuh7tdewyrx6qabbxsldzd2qnxluqsqq7zwwfh.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.add, aten.clone, aten.div, aten.fill, aten.mul, aten.native_batch_norm_backward, aten.sigmoid, aten.sub
# aten._native_batch_norm_legit_functional => convert_element_type_383
# aten.add => add_305, add_306
# aten.clone => clone_46
# aten.div => div_1
# aten.fill => full_like_1
# aten.mul => mul_489, mul_495, mul_496, mul_497
# aten.native_batch_norm_backward => convert_element_type_420, mul_498, mul_506, sub_69, sum_9, sum_10
# aten.sigmoid => sigmoid_63, sigmoid_65
# aten.sub => sub_68
#
# Looped reduction (RBLOCK chunks over rnumel = 6272 = 128 batch * 49
# spatial) producing the per-channel batch-norm-backward statistics for
# 1584 channels.  Inside the loop it first reconstructs the upstream
# gradient: grad * sigmoid(gate) + pooled_grad / 49, times the SiLU
# derivative s*(x*(1-s)+1) of in_ptr3; then accumulates
#   out_ptr0[c] = sum(grad)            and
#   out_ptr1[c] = sum(grad * (x - mean))   (mean read from in_ptr5).
# out_ptr2 = out_ptr1 * in_ptr6 — in_ptr6 is presumably the saved invstd
# (grad_weight scaling); confirm against the wrapper that allocates it.
triton_red_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_16 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@reduction(
    size_hints=[2048, 8192],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp16', 4: '*fp16', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: '*fp32', 9: '*fp32', 10: 'i32', 11: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_16(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, in_ptr6, out_ptr0, out_ptr1, out_ptr2, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
    xnumel = 1584
    rnumel = 6272
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rbase = tl.arange(0, RBLOCK)[None, :]
    x0 = xindex
    _tmp17 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
    tmp20 = tl.load(in_ptr5 + (x0), xmask)
    _tmp23 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
    for roffset in range(0, rnumel, RBLOCK):
        rindex = roffset + rbase
        rmask = rindex < rnumel
        r1 = rindex % 49
        r2 = (rindex // 49)
        tmp0 = tl.load(in_ptr0 + (r1 + (49*x0) + (77616*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp1 = tl.load(in_ptr1 + (x0 + (1584*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp4 = tl.load(in_ptr2 + (x0 + (1584*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp8 = tl.load(in_ptr3 + (r1 + (49*x0) + (77616*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp18 = tl.load(in_ptr4 + (r1 + (49*x0) + (77616*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp2 = tl.sigmoid(tmp1)
        tmp3 = tmp0 * tmp2
        tmp5 = 49.0
        tmp6 = tmp4 / tmp5
        tmp7 = tmp3 + tmp6
        tmp9 = tl.sigmoid(tmp8)
        tmp10 = 1.0
        tmp11 = tmp10 - tmp9
        tmp12 = tmp8 * tmp11
        tmp13 = tmp12 + tmp10
        tmp14 = tmp9 * tmp13
        tmp15 = tmp7 * tmp14
        tmp16 = tmp15.to(tl.float32)
        _tmp17 = tl.where(rmask & xmask, _tmp17 + tmp16, _tmp17)
        tmp19 = tmp18.to(tl.float32)
        tmp21 = tmp19 - tmp20
        tmp22 = tmp16 * tmp21
        _tmp23 = tl.where(rmask & xmask, _tmp23 + tmp22, _tmp23)
    tmp17 = tl.sum(_tmp17, 1)[:, None]
    tl.store(out_ptr0 + x0, tmp17, xmask)
    tmp23 = tl.sum(_tmp23, 1)[:, None]
    tl.store(out_ptr1 + x0, tmp23, xmask)
    tmp24 = tl.load(in_ptr6 + (x0), xmask)
    tmp25 = tmp23 * tmp24
    tl.store(out_ptr2 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp25, xmask)
def get_args():
    arg_0 = rand_strided((128, 1584, 7, 7), (77616, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 1584, 1, 1), (1584, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    arg_2 = rand_strided((128, 1584, 1, 1), (1584, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    arg_3 = rand_strided((128, 1584, 7, 7), (77616, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    arg_4 = rand_strided((128, 1584, 7, 7), (77616, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    arg_5 = rand_strided((1, 1584, 1, 1), (1584, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    arg_6 = rand_strided((1584,), (1,), device='cuda:0', dtype=torch.float32)
    arg_7 = rand_strided((1584,), (1,), device='cuda:0', dtype=torch.float32)
    arg_8 = rand_strided((1584,), (1,), device='cuda:0', dtype=torch.float32)
    arg_9 = rand_strided((1584,), (1,), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7, arg_8, arg_9,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_red_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_16.run(*args, 1584, 6272, grid=grid(1584), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_red_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_16.benchmark_all_configs(*args, 1584, 6272, grid=grid(1584))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/vq/cvqzgmyak74w6zf55f32ognvdbwijbc4xvxna4rdk6flp32c4rx3.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.add, aten.clone, aten.div, aten.fill, aten.mul, aten.native_batch_norm_backward, aten.sigmoid, aten.sub
# aten._native_batch_norm_legit_functional => convert_element_type_383
# aten.add => add_305, add_306
# aten.clone => clone_46
# aten.div => div_1
# aten.fill => full_like_1
# aten.mul => mul_489, mul_495, mul_496, mul_497
# aten.native_batch_norm_backward => convert_element_type_420, mul_504, sub_69, sub_71, sub_72
# aten.sigmoid => sigmoid_63, sigmoid_65
# aten.sub => sub_68
#
# Pointwise finish of the batch-norm backward started by the _16 reduction
# kernel: recomputes the same upstream gradient (SE-gate mul + /49 pooled
# term + SiLU derivative) per element, then applies
#   dx = g - (x - mean) * (sum(g*(x-mean))/N) * invstd^2 - sum(g)/N
# with N = 6272 (0.00015943877551020407 == 1/6272), writing an fp32
# (128, 1584, 7, 7) output.  in_ptr6/in_ptr7/in_ptr8 are presumably the
# per-channel sum(g*(x-mean)), invstd, and sum(g) buffers produced by the
# _16 kernel — confirm against the wrapper code.
triton_poi_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_17 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[16777216], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp16', 4: '*fp16', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: '*fp32', 9: '*fp32', 10: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10), equal_to_1=())]})
@triton.jit
def triton_poi_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_17(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, in_ptr6, in_ptr7, in_ptr8, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 9934848
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x3 = xindex
    x4 = (xindex // 49)
    x1 = (xindex // 49) % 1584
    tmp0 = tl.load(in_ptr0 + (x3), None).to(tl.float32)
    tmp1 = tl.load(in_ptr1 + (x4), None).to(tl.float32)
    tmp4 = tl.load(in_ptr2 + (x4), None).to(tl.float32)
    tmp8 = tl.load(in_ptr3 + (x3), None).to(tl.float32)
    tmp17 = tl.load(in_ptr4 + (x3), None).to(tl.float32)
    tmp19 = tl.load(in_ptr5 + (x1), None)
    tmp21 = tl.load(in_ptr6 + (x1), None)
    tmp24 = tl.load(in_ptr7 + (x1), None)
    tmp29 = tl.load(in_ptr8 + (x1), None)
    tmp2 = tl.sigmoid(tmp1)
    tmp3 = tmp0 * tmp2
    tmp5 = 49.0
    tmp6 = tmp4 / tmp5
    tmp7 = tmp3 + tmp6
    tmp9 = tl.sigmoid(tmp8)
    tmp10 = 1.0
    tmp11 = tmp10 - tmp9
    tmp12 = tmp8 * tmp11
    tmp13 = tmp12 + tmp10
    tmp14 = tmp9 * tmp13
    tmp15 = tmp7 * tmp14
    tmp16 = tmp15.to(tl.float32)
    tmp18 = tmp17.to(tl.float32)
    tmp20 = tmp18 - tmp19
    tmp22 = 0.00015943877551020407
    tmp23 = tmp21 * tmp22
    tmp25 = tmp24 * tmp24
    tmp26 = tmp23 * tmp25
    tmp27 = tmp20 * tmp26
    tmp28 = tmp16 - tmp27
    tmp30 = tmp29 * tmp22
    tmp31 = tmp28 - tmp30
    tl.store(out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp31, None)
def get_args():
    arg_0 = rand_strided((128, 1584, 7, 7), (77616, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 1584, 1, 1), (1584, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    arg_2 = rand_strided((128, 1584, 1, 1), (1584, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    arg_3 = rand_strided((128, 1584, 7, 7), (77616, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    arg_4 = rand_strided((128, 1584, 7, 7), (77616, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    arg_5 = rand_strided((1, 1584, 1, 1), (1584, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    arg_6 = rand_strided((1584,), (1,), device='cuda:0', dtype=torch.float32)
    arg_7 = rand_strided((1584,), (1,), device='cuda:0', dtype=torch.float32)
    arg_8 = rand_strided((1584,), (1,), device='cuda:0', dtype=torch.float32)
    arg_9 = rand_strided((128, 1584, 7, 7), (77616, 49, 7, 1), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7, arg_8, arg_9,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_17.run(*args, 9934848, grid=grid(9934848), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_17.benchmark_all_configs(*args, 9934848, grid=grid(9934848))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/4k/c4k5hrio7ng5ql4zfsckty7aijjdguidt3u7uu2czayedkp6afcd.py
# Original ATen: aten.convolution_backward
# aten.convolution_backward => convolution_backward_5
#
# NOTE(review): machine-generated TorchInductor kernel.  The string handed to
# async_compile.triton() is compiled verbatim at load time, so it is documented
# here instead of being edited.
# Pointwise kernel over 2483712 = 128 * 396 * 49 elements: reads channel slice
# [1188, 1584) of a (128, 1584, 7, 7) fp32 gradient (base offset
# 58212 = 1188 * 49 inside each batch of stride 77616), multiplies each element
# by the per-channel product in_ptr1[1188+c] * in_ptr2[1188+c], and stores the
# result contiguously into a (128, 396, 7, 7) fp16 buffer -- splitting one
# group's gradient out of the concatenated tensor for convolution backward.
triton_poi_fused_convolution_backward_18 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp32', 2: '*fp32', 3: '*fp16', 4: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4), equal_to_1=())]})
@triton.jit
def triton_poi_fused_convolution_backward_18(in_ptr0, in_ptr1, in_ptr2, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 2483712
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x2 = (xindex // 19404)
    x3 = xindex % 19404
    x1 = (xindex // 49) % 396
    x4 = xindex
    tmp0 = tl.load(in_ptr0 + (58212 + x3 + (77616*x2)), xmask)
    tmp1 = tl.load(in_ptr1 + (1188 + x1), xmask)
    tmp2 = tl.load(in_ptr2 + (1188 + x1), xmask)
    tmp3 = tmp1 * tmp2
    tmp4 = tmp0 * tmp3
    tmp5 = tmp4.to(tl.float32)
    tl.store(out_ptr0 + (x4 + tl.zeros([XBLOCK], tl.int32)), tmp5, xmask)
def get_args():
    arg_0 = rand_strided((128, 1584, 7, 7), (77616, 49, 7, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((1584,), (1,), device='cuda:0', dtype=torch.float32)
    arg_2 = rand_strided((1584,), (1,), device='cuda:0', dtype=torch.float32)
    arg_3 = rand_strided((128, 396, 7, 7), (19404, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1, arg_2, arg_3,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused_convolution_backward_18.run(*args, 2483712, grid=grid(2483712), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused_convolution_backward_18.benchmark_all_configs(*args, 2483712, grid=grid(2483712))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/le/clevrgmaajodoycxcxjtmbwkkhfu2zyqxjmzkxois2uwwej5n47c.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_423
#
# NOTE(review): machine-generated TorchInductor kernel; string is compiled
# verbatim, do not hand-edit.
# Elementwise dtype-upcast copy of 32076 = 396 * 81 elements: loads a
# (396, 1, 9, 9) fp16 tensor (a depthwise 9x9 weight, per get_args) and writes
# it as fp32.
triton_poi_fused__to_copy_19 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[32768], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_19(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 32076
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
    arg_0 = rand_strided((396, 1, 9, 9), (81, 81, 9, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((396, 1, 9, 9), (81, 81, 9, 1), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_19.run(*args, 32076, grid=grid(32076), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_19.benchmark_all_configs(*args, 32076, grid=grid(32076))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/yx/cyxchtoyxooc3k4ah4othvbsdd77dfkbeykdlyzawbwqv4zeq227.py
# Original ATen: aten.convolution_backward
# aten.convolution_backward => convolution_backward_6
#
# NOTE(review): machine-generated TorchInductor kernel; string is compiled
# verbatim, do not hand-edit.
# Same pattern as the other convolution_backward slicing kernels in this file,
# but for channel slice [792, 1188) (base offset 38808 = 792 * 49): gradient
# element * (in_ptr1[792+c] * in_ptr2[792+c]), written densely to a
# (128, 396, 7, 7) fp16 buffer.
triton_poi_fused_convolution_backward_20 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp32', 2: '*fp32', 3: '*fp16', 4: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4), equal_to_1=())]})
@triton.jit
def triton_poi_fused_convolution_backward_20(in_ptr0, in_ptr1, in_ptr2, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 2483712
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x2 = (xindex // 19404)
    x3 = xindex % 19404
    x1 = (xindex // 49) % 396
    x4 = xindex
    tmp0 = tl.load(in_ptr0 + (38808 + x3 + (77616*x2)), xmask)
    tmp1 = tl.load(in_ptr1 + (792 + x1), xmask)
    tmp2 = tl.load(in_ptr2 + (792 + x1), xmask)
    tmp3 = tmp1 * tmp2
    tmp4 = tmp0 * tmp3
    tmp5 = tmp4.to(tl.float32)
    tl.store(out_ptr0 + (x4 + tl.zeros([XBLOCK], tl.int32)), tmp5, xmask)
def get_args():
    arg_0 = rand_strided((128, 1584, 7, 7), (77616, 49, 7, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((1584,), (1,), device='cuda:0', dtype=torch.float32)
    arg_2 = rand_strided((1584,), (1,), device='cuda:0', dtype=torch.float32)
    arg_3 = rand_strided((128, 396, 7, 7), (19404, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1, arg_2, arg_3,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused_convolution_backward_20.run(*args, 2483712, grid=grid(2483712), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused_convolution_backward_20.benchmark_all_configs(*args, 2483712, grid=grid(2483712))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/eb/cebicyjdxyev2ctnv4eqoljt3ynagd7ac2xafkomzznzw2akhxue.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_424
#
# NOTE(review): machine-generated TorchInductor kernel; string is compiled
# verbatim, do not hand-edit.
# Elementwise fp16 -> fp32 upcast copy of 19404 = 396 * 49 elements
# (a (396, 1, 7, 7) depthwise 7x7 weight, per get_args).
triton_poi_fused__to_copy_21 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[32768], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_21(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 19404
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
    arg_0 = rand_strided((396, 1, 7, 7), (49, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((396, 1, 7, 7), (49, 49, 7, 1), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_21.run(*args, 19404, grid=grid(19404), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_21.benchmark_all_configs(*args, 19404, grid=grid(19404))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/n6/cn6zlzicefkme4u5z2u7vgzu7zs36znck62asy3a53dhtgwtnfey.py
# Original ATen: aten.convolution_backward
# aten.convolution_backward => convolution_backward_7
#
# NOTE(review): machine-generated TorchInductor kernel; string is compiled
# verbatim, do not hand-edit.
# Same slicing pattern as the sibling convolution_backward kernels, here for
# channel slice [396, 792) (base offset 19404 = 396 * 49): gradient element *
# (in_ptr1[396+c] * in_ptr2[396+c]), written densely to a (128, 396, 7, 7)
# fp16 buffer.
triton_poi_fused_convolution_backward_22 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp32', 2: '*fp32', 3: '*fp16', 4: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4), equal_to_1=())]})
@triton.jit
def triton_poi_fused_convolution_backward_22(in_ptr0, in_ptr1, in_ptr2, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 2483712
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x2 = (xindex // 19404)
    x3 = xindex % 19404
    x1 = (xindex // 49) % 396
    x4 = xindex
    tmp0 = tl.load(in_ptr0 + (19404 + x3 + (77616*x2)), xmask)
    tmp1 = tl.load(in_ptr1 + (396 + x1), xmask)
    tmp2 = tl.load(in_ptr2 + (396 + x1), xmask)
    tmp3 = tmp1 * tmp2
    tmp4 = tmp0 * tmp3
    tmp5 = tmp4.to(tl.float32)
    tl.store(out_ptr0 + (x4 + tl.zeros([XBLOCK], tl.int32)), tmp5, xmask)
def get_args():
    arg_0 = rand_strided((128, 1584, 7, 7), (77616, 49, 7, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((1584,), (1,), device='cuda:0', dtype=torch.float32)
    arg_2 = rand_strided((1584,), (1,), device='cuda:0', dtype=torch.float32)
    arg_3 = rand_strided((128, 396, 7, 7), (19404, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1, arg_2, arg_3,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused_convolution_backward_22.run(*args, 2483712, grid=grid(2483712), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused_convolution_backward_22.benchmark_all_configs(*args, 2483712, grid=grid(2483712))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/5j/c5jiqoftvlcj3ob2vawgml2t4mfu74vouj3eu6ozelrenm5a7frc.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_425
#
# NOTE(review): machine-generated TorchInductor kernel; string is compiled
# verbatim, do not hand-edit.
# Elementwise fp16 -> fp32 upcast copy of 9900 = 396 * 25 elements
# (a (396, 1, 5, 5) depthwise 5x5 weight, per get_args).
triton_poi_fused__to_copy_23 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[16384], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_23(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 9900
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
    arg_0 = rand_strided((396, 1, 5, 5), (25, 25, 5, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((396, 1, 5, 5), (25, 25, 5, 1), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_23.run(*args, 9900, grid=grid(9900), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_23.benchmark_all_configs(*args, 9900, grid=grid(9900))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/wd/cwd5qp57lsltp4fq4manegrstgskzimmaukeha4dh4fnqhtwaydf.py
# Original ATen: aten.convolution_backward
# aten.convolution_backward => convolution_backward_8
#
# NOTE(review): machine-generated TorchInductor kernel; string is compiled
# verbatim, do not hand-edit.
# Same slicing pattern as the sibling convolution_backward kernels, here for
# the first channel slice [0, 396) (no base offset): gradient element *
# (in_ptr1[c] * in_ptr2[c]), written densely to a (128, 396, 7, 7) fp16 buffer.
triton_poi_fused_convolution_backward_24 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp32', 2: '*fp32', 3: '*fp16', 4: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4), equal_to_1=())]})
@triton.jit
def triton_poi_fused_convolution_backward_24(in_ptr0, in_ptr1, in_ptr2, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 2483712
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x2 = (xindex // 19404)
    x3 = xindex % 19404
    x1 = (xindex // 49) % 396
    x4 = xindex
    tmp0 = tl.load(in_ptr0 + (x3 + (77616*x2)), xmask)
    tmp1 = tl.load(in_ptr1 + (x1), xmask)
    tmp2 = tl.load(in_ptr2 + (x1), xmask)
    tmp3 = tmp1 * tmp2
    tmp4 = tmp0 * tmp3
    tmp5 = tmp4.to(tl.float32)
    tl.store(out_ptr0 + (x4 + tl.zeros([XBLOCK], tl.int32)), tmp5, xmask)
def get_args():
    arg_0 = rand_strided((128, 1584, 7, 7), (77616, 49, 7, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((1584,), (1,), device='cuda:0', dtype=torch.float32)
    arg_2 = rand_strided((1584,), (1,), device='cuda:0', dtype=torch.float32)
    arg_3 = rand_strided((128, 396, 7, 7), (19404, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1, arg_2, arg_3,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused_convolution_backward_24.run(*args, 2483712, grid=grid(2483712), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused_convolution_backward_24.benchmark_all_configs(*args, 2483712, grid=grid(2483712))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/wc/cwc7i4csdkpcwn5beyvhsemwhtbq7gxbezkowlgoa2xhw4y7krz2.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_426
#
# NOTE(review): machine-generated TorchInductor kernel; string is compiled
# verbatim, do not hand-edit.
# Elementwise fp16 -> fp32 upcast copy of 3564 = 396 * 9 elements
# (a (396, 1, 3, 3) depthwise 3x3 weight, per get_args).
triton_poi_fused__to_copy_25 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4096], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_25(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 3564
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
    arg_0 = rand_strided((396, 1, 3, 3), (9, 9, 3, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((396, 1, 3, 3), (9, 9, 3, 1), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_25.run(*args, 3564, grid=grid(3564), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_25.benchmark_all_configs(*args, 3564, grid=grid(3564))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/cp/ccpi5jja3x5efmayn24o6k4nqs6v2zc5jff75of6woeln6emtny2.py
# Original ATen: aten.cat
# aten.cat => cat_42
#
# NOTE(review): machine-generated TorchInductor kernel; string is compiled
# verbatim, do not hand-edit.
# Part of a channel concatenation (cat_42): copies a contiguous
# (128, 396, 7, 7) fp16 tensor into a batch-strided view of the concat output
# (batch stride 77616 = 1584 * 49); the destination base pointer selects which
# channel slice of the concatenated result is written.
triton_poi_fused_cat_26 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_cat_26(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 2483712
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x2 = xindex
    x0 = xindex % 19404
    x1 = (xindex // 19404)
    tmp0 = tl.load(in_ptr0 + (x2), xmask).to(tl.float32)
    tl.store(out_ptr0 + (x0 + (77616*x1) + tl.zeros([XBLOCK], tl.int32)), tmp0, xmask)
def get_args():
    arg_0 = rand_strided((128, 396, 7, 7), (19404, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 396, 7, 7), (77616, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused_cat_26.run(*args, 2483712, grid=grid(2483712), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused_cat_26.benchmark_all_configs(*args, 2483712, grid=grid(2483712))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/rg/crgc5ihkw5zltfd3tfchnahu2yx2wga4mambpixjsb7mmeynkmrh.py
# Original ATen: aten.cat
# aten.cat => cat_42
#
# NOTE(review): machine-generated TorchInductor kernel; string is compiled
# verbatim, do not hand-edit.
# Another slice-copy of the same concatenation (cat_42): identical body to
# triton_poi_fused_cat_26, compiled separately because its instance_descriptor
# differs (the output base pointer is not 16-byte aligned here, so index 1 is
# absent from divisible_by_16).
triton_poi_fused_cat_27 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_cat_27(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 2483712
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x2 = xindex
    x0 = xindex % 19404
    x1 = (xindex // 19404)
    tmp0 = tl.load(in_ptr0 + (x2), xmask).to(tl.float32)
    tl.store(out_ptr0 + (x0 + (77616*x1) + tl.zeros([XBLOCK], tl.int32)), tmp0, xmask)
def get_args():
    arg_0 = rand_strided((128, 396, 7, 7), (19404, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 396, 7, 7), (77616, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused_cat_27.run(*args, 2483712, grid=grid(2483712), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused_cat_27.benchmark_all_configs(*args, 2483712, grid=grid(2483712))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/hq/chqi5nwugaohgw3qon7t3fytiqnvkhkemtghder4rphksh2dt7uc.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.mul, aten.native_batch_norm_backward
# aten._native_batch_norm_legit_functional => convert_element_type_375
# aten.mul => mul_509
# aten.native_batch_norm_backward => convert_element_type_427, mul_510, mul_518, sub_74, sum_11, sum_12
#
# NOTE(review): machine-generated TorchInductor kernel; string is compiled
# verbatim, do not hand-edit.
# Per-channel reduction for batch-norm backward: for each of the 1584 channels
# it reduces over rnumel = 6272 = 128 batches * 49 spatial positions of
# (128, 1584, 7, 7) fp16 inputs.  Writes
#   out_ptr0[c] = sum(in_ptr0 * in_ptr1)                     (grad sum, sum_11)
#   out_ptr1[c] = sum(in_ptr0 * in_ptr1 * (in_ptr2 - in_ptr3[c]))   (sum_12)
#   out_ptr2[c] = out_ptr1[c] * in_ptr4[c]
# in_ptr3 is a per-channel (1, 1584, 1, 1) fp32 value subtracted from the
# activations -- presumably the saved batch mean (sub_74), and in_ptr4 the
# matching scale/invstd factor (mul_518); TODO confirm against the caller.
triton_red_fused__native_batch_norm_legit_functional_mul_native_batch_norm_backward_28 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@reduction(
    size_hints=[2048, 8192],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp32', 4: '*fp32', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: 'i32', 9: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_mul_native_batch_norm_backward_28(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, out_ptr0, out_ptr1, out_ptr2, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
    xnumel = 1584
    rnumel = 6272
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rbase = tl.arange(0, RBLOCK)[None, :]
    x0 = xindex
    _tmp4 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
    tmp7 = tl.load(in_ptr3 + (x0), xmask)
    _tmp10 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
    for roffset in range(0, rnumel, RBLOCK):
        rindex = roffset + rbase
        rmask = rindex < rnumel
        r1 = rindex % 49
        r2 = (rindex // 49)
        tmp0 = tl.load(in_ptr0 + (r1 + (49*x0) + (77616*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp1 = tl.load(in_ptr1 + (r1 + (49*x0) + (77616*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp5 = tl.load(in_ptr2 + (r1 + (49*x0) + (77616*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp2 = tmp0 * tmp1
        tmp3 = tmp2.to(tl.float32)
        _tmp4 = tl.where(rmask & xmask, _tmp4 + tmp3, _tmp4)
        tmp6 = tmp5.to(tl.float32)
        tmp8 = tmp6 - tmp7
        tmp9 = tmp3 * tmp8
        _tmp10 = tl.where(rmask & xmask, _tmp10 + tmp9, _tmp10)
    tmp4 = tl.sum(_tmp4, 1)[:, None]
    tl.store(out_ptr0 + x0, tmp4, xmask)
    tmp10 = tl.sum(_tmp10, 1)[:, None]
    tl.store(out_ptr1 + x0, tmp10, xmask)
    tmp11 = tl.load(in_ptr4 + (x0), xmask)
    tmp12 = tmp10 * tmp11
    tl.store(out_ptr2 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp12, xmask)
def get_args():
    arg_0 = rand_strided((128, 1584, 7, 7), (77616, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 1584, 7, 7), (77616, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    arg_2 = rand_strided((128, 1584, 7, 7), (77616, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    arg_3 = rand_strided((1, 1584, 1, 1), (1584, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    arg_4 = rand_strided((1584,), (1,), device='cuda:0', dtype=torch.float32)
    arg_5 = rand_strided((1584,), (1,), device='cuda:0', dtype=torch.float32)
    arg_6 = rand_strided((1584,), (1,), device='cuda:0', dtype=torch.float32)
    arg_7 = rand_strided((1584,), (1,), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_red_fused__native_batch_norm_legit_functional_mul_native_batch_norm_backward_28.run(*args, 1584, 6272, grid=grid(1584), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_red_fused__native_batch_norm_legit_functional_mul_native_batch_norm_backward_28.benchmark_all_configs(*args, 1584, 6272, grid=grid(1584))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/4i/c4iqo4zwyswlhwfnswymdycvvcfufifa735bkh3k5fjuqlgdvt3q.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.convolution_backward, aten.mul, aten.native_batch_norm_backward
# aten._native_batch_norm_legit_functional => convert_element_type_375
# aten.convolution_backward => convolution_backward_9
# aten.mul => mul_509
# aten.native_batch_norm_backward => convert_element_type_427, convert_element_type_429, mul_516, mul_517, sub_74, sub_76, sub_77
# NOTE(review): TorchInductor-generated unit. The triple-quoted string is Triton
# source compiled at runtime by async_compile.triton(); do not hand-edit its text.
# Pointwise kernel over 9934848 = 128*1584*7*7 elements (fp16 NCHW activations per
# get_args()); writes its result back in place through in_out_ptr0 (see
# mutated_arg_names). The standalone __main__ harness benchmarks the kernel.
triton_poi_fused__native_batch_norm_legit_functional_convolution_backward_mul_native_batch_norm_backward_29 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[16777216], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp32', 4: '*fp32', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8), equal_to_1=())]})
@triton.jit
def triton_poi_fused__native_batch_norm_legit_functional_convolution_backward_mul_native_batch_norm_backward_29(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, in_ptr6, xnumel, XBLOCK : tl.constexpr):
    xnumel = 9934848
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x3 = xindex
    x1 = (xindex // 49) % 1584
    tmp0 = tl.load(in_out_ptr0 + (x3), None).to(tl.float32)
    tmp1 = tl.load(in_ptr0 + (x3), None).to(tl.float32)
    tmp4 = tl.load(in_ptr1 + (x3), None).to(tl.float32)
    tmp6 = tl.load(in_ptr2 + (x1), None)
    tmp8 = tl.load(in_ptr3 + (x1), None)
    tmp11 = tl.load(in_ptr4 + (x1), None)
    tmp16 = tl.load(in_ptr5 + (x1), None)
    tmp19 = tl.load(in_ptr6 + (x1), None)
    tmp2 = tmp0 * tmp1
    tmp3 = tmp2.to(tl.float32)
    tmp5 = tmp4.to(tl.float32)
    tmp7 = tmp5 - tmp6
    tmp9 = 0.00015943877551020407
    tmp10 = tmp8 * tmp9
    tmp12 = tmp11 * tmp11
    tmp13 = tmp10 * tmp12
    tmp14 = tmp7 * tmp13
    tmp15 = tmp3 - tmp14
    tmp17 = tmp16 * tmp9
    tmp18 = tmp15 - tmp17
    tmp20 = tmp11 * tmp19
    tmp21 = tmp18 * tmp20
    tmp22 = tmp21.to(tl.float32)
    tl.store(in_out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp22, None)
def get_args():
    arg_0 = rand_strided((128, 1584, 7, 7), (77616, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 1584, 7, 7), (77616, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    arg_2 = rand_strided((128, 1584, 7, 7), (77616, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    arg_3 = rand_strided((1, 1584, 1, 1), (1584, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    arg_4 = rand_strided((1584,), (1,), device='cuda:0', dtype=torch.float32)
    arg_5 = rand_strided((1584,), (1,), device='cuda:0', dtype=torch.float32)
    arg_6 = rand_strided((1584,), (1,), device='cuda:0', dtype=torch.float32)
    arg_7 = rand_strided((1584,), (1,), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__native_batch_norm_legit_functional_convolution_backward_mul_native_batch_norm_backward_29.run(*args, 9934848, grid=grid(9934848), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__native_batch_norm_legit_functional_convolution_backward_mul_native_batch_norm_backward_29.benchmark_all_configs(*args, 9934848, grid=grid(9934848))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/np/cnpskkb6q6odcd2dqw7z7r4e7p4cgfh5g7s5kyt2zmqwzsyn7azj.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_430
# NOTE(review): TorchInductor-generated unit; the string is runtime-compiled
# Triton source. Simple dtype-cast copy: reads 418176 = 1584*264 fp16 values
# and stores them through a *fp32 output pointer (see the @pointwise signature
# and get_args() shapes).
triton_poi_fused__to_copy_30 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[524288], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_30(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 418176
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
    arg_0 = rand_strided((1584, 264, 1, 1), (264, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((1584, 264, 1, 1), (264, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_30.run(*args, 418176, grid=grid(418176), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_30.benchmark_all_configs(*args, 418176, grid=grid(418176))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/ut/cuta2ldfadubtnwekrao3mdwks6gtwerw2rxvha7uaqnloaeqvfv.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.add, aten.native_batch_norm_backward
# aten._native_batch_norm_legit_functional => convert_element_type_372
# aten.add => add_308
# aten.native_batch_norm_backward => convert_element_type_431, mul_519, mul_527, sub_78, sum_13, sum_14
# NOTE(review): TorchInductor-generated reduction unit (runtime-compiled Triton
# source). Reduces rnumel=6272 = 128*49 elements per channel (xnumel=264) in an
# RBLOCK-strided loop, accumulating two per-channel sums (_tmp4, _tmp10) and
# emitting three per-channel outputs (out_ptr0/1/2).
triton_red_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_31 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@reduction(
    size_hints=[512, 8192],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp32', 4: '*fp32', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: 'i32', 9: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 9), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_31(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, out_ptr0, out_ptr1, out_ptr2, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
    xnumel = 264
    rnumel = 6272
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rbase = tl.arange(0, RBLOCK)[None, :]
    x0 = xindex
    _tmp4 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
    tmp7 = tl.load(in_ptr3 + (x0), xmask)
    _tmp10 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
    for roffset in range(0, rnumel, RBLOCK):
        rindex = roffset + rbase
        rmask = rindex < rnumel
        r1 = rindex % 49
        r2 = (rindex // 49)
        tmp0 = tl.load(in_ptr0 + (r1 + (49*x0) + (12936*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp1 = tl.load(in_ptr1 + (r1 + (49*x0) + (12936*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp5 = tl.load(in_ptr2 + (r1 + (49*x0) + (12936*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp2 = tmp0 + tmp1
        tmp3 = tmp2.to(tl.float32)
        _tmp4 = tl.where(rmask & xmask, _tmp4 + tmp3, _tmp4)
        tmp6 = tmp5.to(tl.float32)
        tmp8 = tmp6 - tmp7
        tmp9 = tmp3 * tmp8
        _tmp10 = tl.where(rmask & xmask, _tmp10 + tmp9, _tmp10)
    tmp4 = tl.sum(_tmp4, 1)[:, None]
    tl.store(out_ptr0 + x0, tmp4, xmask)
    tmp10 = tl.sum(_tmp10, 1)[:, None]
    tl.store(out_ptr1 + x0, tmp10, xmask)
    tmp11 = tl.load(in_ptr4 + (x0), xmask)
    tmp12 = tmp10 * tmp11
    tl.store(out_ptr2 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp12, xmask)
def get_args():
    arg_0 = rand_strided((128, 264, 7, 7), (12936, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 264, 7, 7), (12936, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    arg_2 = rand_strided((128, 264, 7, 7), (12936, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    arg_3 = rand_strided((1, 264, 1, 1), (264, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    arg_4 = rand_strided((264,), (1,), device='cuda:0', dtype=torch.float32)
    arg_5 = rand_strided((264,), (1,), device='cuda:0', dtype=torch.float32)
    arg_6 = rand_strided((264,), (1,), device='cuda:0', dtype=torch.float32)
    arg_7 = rand_strided((264,), (1,), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_red_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_31.run(*args, 264, 6272, grid=grid(264), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_red_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_31.benchmark_all_configs(*args, 264, 6272, grid=grid(264))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/a3/ca33o5ukggucnj2hrtjksvzkzfzfe3v4qnwzz5ycytjwaauqcvop.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.add, aten.native_batch_norm_backward
# aten._native_batch_norm_legit_functional => convert_element_type_372
# aten.add => add_308
# aten.native_batch_norm_backward => convert_element_type_431, convert_element_type_433, mul_525, mul_526, sub_78, sub_80, sub_81
# NOTE(review): TorchInductor-generated pointwise unit (runtime-compiled Triton
# source) over 1655808 = 128*264*7*7 elements; per-channel scalars are indexed by
# x1 = (xindex // 49) % 264 and the fused result is cast back to fp16 before the
# store (out_ptr0 is *fp16 in the signature).
triton_poi_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_32 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[2097152], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp32', 4: '*fp32', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: '*fp16', 9: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9), equal_to_1=())]})
@triton.jit
def triton_poi_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_32(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, in_ptr6, in_ptr7, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 1655808
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x3 = xindex
    x1 = (xindex // 49) % 264
    tmp0 = tl.load(in_ptr0 + (x3), xmask).to(tl.float32)
    tmp1 = tl.load(in_ptr1 + (x3), xmask).to(tl.float32)
    tmp4 = tl.load(in_ptr2 + (x3), xmask).to(tl.float32)
    tmp6 = tl.load(in_ptr3 + (x1), xmask)
    tmp8 = tl.load(in_ptr4 + (x1), xmask)
    tmp11 = tl.load(in_ptr5 + (x1), xmask)
    tmp16 = tl.load(in_ptr6 + (x1), xmask)
    tmp19 = tl.load(in_ptr7 + (x1), xmask)
    tmp2 = tmp0 + tmp1
    tmp3 = tmp2.to(tl.float32)
    tmp5 = tmp4.to(tl.float32)
    tmp7 = tmp5 - tmp6
    tmp9 = 0.00015943877551020407
    tmp10 = tmp8 * tmp9
    tmp12 = tmp11 * tmp11
    tmp13 = tmp10 * tmp12
    tmp14 = tmp7 * tmp13
    tmp15 = tmp3 - tmp14
    tmp17 = tmp16 * tmp9
    tmp18 = tmp15 - tmp17
    tmp20 = tmp11 * tmp19
    tmp21 = tmp18 * tmp20
    tmp22 = tmp21.to(tl.float32)
    tl.store(out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp22, xmask)
def get_args():
    arg_0 = rand_strided((128, 264, 7, 7), (12936, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 264, 7, 7), (12936, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    arg_2 = rand_strided((128, 264, 7, 7), (12936, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    arg_3 = rand_strided((1, 264, 1, 1), (264, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    arg_4 = rand_strided((264,), (1,), device='cuda:0', dtype=torch.float32)
    arg_5 = rand_strided((264,), (1,), device='cuda:0', dtype=torch.float32)
    arg_6 = rand_strided((264,), (1,), device='cuda:0', dtype=torch.float32)
    arg_7 = rand_strided((264,), (1,), device='cuda:0', dtype=torch.float32)
    arg_8 = rand_strided((128, 264, 7, 7), (12936, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7, arg_8,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_32.run(*args, 1655808, grid=grid(1655808), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_32.benchmark_all_configs(*args, 1655808, grid=grid(1655808))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/hd/chdicakkg6zydfy4ll5rit2mnugrbccllk6kssfzzb7i55y4shxu.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.add, aten.native_batch_norm_backward
# aten._native_batch_norm_legit_functional => convert_element_type_349
# aten.add => add_308, add_313
# aten.native_batch_norm_backward => convert_element_type_454, mul_559, mul_567, sub_94, sum_22, sum_23
# NOTE(review): TorchInductor-generated reduction unit (runtime-compiled Triton
# source). Same shape as kernel _31 but sums THREE fp16 inputs (two adds,
# add_308 + add_313) before the per-channel reduction over rnumel=6272 = 128*49,
# writing three per-channel fp32 outputs.
triton_red_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_33 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@reduction(
    size_hints=[512, 8192],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp16', 4: '*fp32', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: '*fp32', 9: 'i32', 10: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8, 10), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_33(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, out_ptr0, out_ptr1, out_ptr2, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
    xnumel = 264
    rnumel = 6272
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rbase = tl.arange(0, RBLOCK)[None, :]
    x0 = xindex
    _tmp6 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
    tmp9 = tl.load(in_ptr4 + (x0), xmask)
    _tmp12 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
    for roffset in range(0, rnumel, RBLOCK):
        rindex = roffset + rbase
        rmask = rindex < rnumel
        r1 = rindex % 49
        r2 = (rindex // 49)
        tmp0 = tl.load(in_ptr0 + (r1 + (49*x0) + (12936*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp1 = tl.load(in_ptr1 + (r1 + (49*x0) + (12936*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp3 = tl.load(in_ptr2 + (r1 + (49*x0) + (12936*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp7 = tl.load(in_ptr3 + (r1 + (49*x0) + (12936*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp2 = tmp0 + tmp1
        tmp4 = tmp2 + tmp3
        tmp5 = tmp4.to(tl.float32)
        _tmp6 = tl.where(rmask & xmask, _tmp6 + tmp5, _tmp6)
        tmp8 = tmp7.to(tl.float32)
        tmp10 = tmp8 - tmp9
        tmp11 = tmp5 * tmp10
        _tmp12 = tl.where(rmask & xmask, _tmp12 + tmp11, _tmp12)
    tmp6 = tl.sum(_tmp6, 1)[:, None]
    tl.store(out_ptr0 + x0, tmp6, xmask)
    tmp12 = tl.sum(_tmp12, 1)[:, None]
    tl.store(out_ptr1 + x0, tmp12, xmask)
    tmp13 = tl.load(in_ptr5 + (x0), xmask)
    tmp14 = tmp12 * tmp13
    tl.store(out_ptr2 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp14, xmask)
def get_args():
    arg_0 = rand_strided((128, 264, 7, 7), (12936, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 264, 7, 7), (12936, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    arg_2 = rand_strided((128, 264, 7, 7), (12936, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    arg_3 = rand_strided((128, 264, 7, 7), (12936, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    arg_4 = rand_strided((1, 264, 1, 1), (264, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    arg_5 = rand_strided((264,), (1,), device='cuda:0', dtype=torch.float32)
    arg_6 = rand_strided((264,), (1,), device='cuda:0', dtype=torch.float32)
    arg_7 = rand_strided((264,), (1,), device='cuda:0', dtype=torch.float32)
    arg_8 = rand_strided((264,), (1,), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7, arg_8,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_red_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_33.run(*args, 264, 6272, grid=grid(264), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_red_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_33.benchmark_all_configs(*args, 264, 6272, grid=grid(264))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/bi/cbij3cjdqcet3nxl6uy6e6kdaxljedzlmbuwgif6yfmwghh7x42z.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.add, aten.native_batch_norm_backward
# aten._native_batch_norm_legit_functional => convert_element_type_349
# aten.add => add_308, add_313
# aten.native_batch_norm_backward => convert_element_type_454, mul_565, mul_566, sub_94, sub_96, sub_97
# NOTE(review): TorchInductor-generated pointwise unit (runtime-compiled Triton
# source). Elementwise companion of reduction kernel _33: sums three fp16
# gradient tensors and applies the per-channel batch-norm backward terms; unlike
# kernel _32 the result is stored as fp32 (out_ptr0 is *fp32, no final cast).
triton_poi_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_34 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[2097152], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp16', 4: '*fp32', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: '*fp32', 9: '*fp32', 10: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10), equal_to_1=())]})
@triton.jit
def triton_poi_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_34(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, in_ptr6, in_ptr7, in_ptr8, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 1655808
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x3 = xindex
    x1 = (xindex // 49) % 264
    tmp0 = tl.load(in_ptr0 + (x3), xmask).to(tl.float32)
    tmp1 = tl.load(in_ptr1 + (x3), xmask).to(tl.float32)
    tmp3 = tl.load(in_ptr2 + (x3), xmask).to(tl.float32)
    tmp6 = tl.load(in_ptr3 + (x3), xmask).to(tl.float32)
    tmp8 = tl.load(in_ptr4 + (x1), xmask)
    tmp10 = tl.load(in_ptr5 + (x1), xmask)
    tmp13 = tl.load(in_ptr6 + (x1), xmask)
    tmp18 = tl.load(in_ptr7 + (x1), xmask)
    tmp21 = tl.load(in_ptr8 + (x1), xmask)
    tmp2 = tmp0 + tmp1
    tmp4 = tmp2 + tmp3
    tmp5 = tmp4.to(tl.float32)
    tmp7 = tmp6.to(tl.float32)
    tmp9 = tmp7 - tmp8
    tmp11 = 0.00015943877551020407
    tmp12 = tmp10 * tmp11
    tmp14 = tmp13 * tmp13
    tmp15 = tmp12 * tmp14
    tmp16 = tmp9 * tmp15
    tmp17 = tmp5 - tmp16
    tmp19 = tmp18 * tmp11
    tmp20 = tmp17 - tmp19
    tmp22 = tmp13 * tmp21
    tmp23 = tmp20 * tmp22
    tl.store(out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp23, xmask)
def get_args():
    arg_0 = rand_strided((128, 264, 7, 7), (12936, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 264, 7, 7), (12936, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    arg_2 = rand_strided((128, 264, 7, 7), (12936, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    arg_3 = rand_strided((128, 264, 7, 7), (12936, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    arg_4 = rand_strided((1, 264, 1, 1), (264, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    arg_5 = rand_strided((264,), (1,), device='cuda:0', dtype=torch.float32)
    arg_6 = rand_strided((264,), (1,), device='cuda:0', dtype=torch.float32)
    arg_7 = rand_strided((264,), (1,), device='cuda:0', dtype=torch.float32)
    arg_8 = rand_strided((264,), (1,), device='cuda:0', dtype=torch.float32)
    arg_9 = rand_strided((128, 264, 7, 7), (12936, 49, 7, 1), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7, arg_8, arg_9,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_34.run(*args, 1655808, grid=grid(1655808), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_34.benchmark_all_configs(*args, 1655808, grid=grid(1655808))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/bp/cbpqapqwkj3u7lbjggnc6sqbtdedbchks5ecsczowcbbvpcuekir.py
# Original ATen: aten.convolution_backward
# aten.convolution_backward => convolution_backward_19
# NOTE(review): TorchInductor-generated unit (runtime-compiled Triton source).
# Slices the upper half of a (128, 264, 7, 7) fp32 gradient into a
# (128, 132, 7, 7) fp16 tensor: the constant offset 6468 = 132*49 skips the
# first 132 channels of each batch element (batch stride 12936 = 264*49).
triton_poi_fused_convolution_backward_35 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[1048576], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_convolution_backward_35(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 827904
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex % 6468
    x1 = (xindex // 6468)
    x2 = xindex
    tmp0 = tl.load(in_ptr0 + (6468 + x0 + (12936*x1)), xmask)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
    arg_0 = rand_strided((128, 264, 7, 7), (12936, 49, 7, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((128, 132, 7, 7), (6468, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused_convolution_backward_35.run(*args, 827904, grid=grid(827904), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused_convolution_backward_35.benchmark_all_configs(*args, 827904, grid=grid(827904))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/vz/cvzmxblv4iaxi55wdmzkqdehpwn2a2c5kowaaewxyzttkqwmv7jg.py
# Original ATen: aten.convolution_backward
# aten.convolution_backward => convolution_backward_20
# NOTE(review): TorchInductor-generated unit (runtime-compiled Triton source).
# Companion of kernel _35: same slice-and-cast but with no channel offset, i.e.
# it extracts the FIRST 132 channels of the (128, 264, 7, 7) fp32 gradient into
# a (128, 132, 7, 7) fp16 tensor.
triton_poi_fused_convolution_backward_36 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[1048576], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_convolution_backward_36(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 827904
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex % 6468
    x1 = (xindex // 6468)
    x2 = xindex
    tmp0 = tl.load(in_ptr0 + (x0 + (12936*x1)), xmask)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
    arg_0 = rand_strided((128, 264, 7, 7), (12936, 49, 7, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((128, 132, 7, 7), (6468, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused_convolution_backward_36.run(*args, 827904, grid=grid(827904), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused_convolution_backward_36.benchmark_all_configs(*args, 827904, grid=grid(827904))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/sb/csb7dsvsxbdzlofqw5ldfragxm7igefprtjmu6wizsp5shcyxiyp.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.add, aten.native_batch_norm_backward
# aten._native_batch_norm_legit_functional => convert_element_type_326
# aten.add => add_308, add_313, add_318
# aten.native_batch_norm_backward => convert_element_type_477, mul_599, mul_607, sub_110, sum_31, sum_32
# NOTE(review): batch-norm backward reduction. For each of the 264 channels (x0)
# it reduces over rnumel = 6272 = 128 batches * 49 spatial positions:
#   - grad = in_ptr0+in_ptr1+in_ptr2+in_ptr3 (four accumulated fp16 gradients)
#   - out_ptr0 = sum(grad)                       (grad_bias term, sum_31)
#   - out_ptr1 = sum(grad * (x - mean))          (sum_32; mean from in_ptr5)
#   - out_ptr2 = out_ptr1 * in_ptr6              (scaled by the saved invstd/weight factor)
triton_red_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_37 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@reduction(
    size_hints=[512, 8192],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp16', 4: '*fp16', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: '*fp32', 9: '*fp32', 10: 'i32', 11: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 11), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_37(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, in_ptr6, out_ptr0, out_ptr1, out_ptr2, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
    xnumel = 264
    rnumel = 6272
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rbase = tl.arange(0, RBLOCK)[None, :]
    x0 = xindex
    _tmp8 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
    tmp11 = tl.load(in_ptr5 + (x0), xmask)
    _tmp14 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
    for roffset in range(0, rnumel, RBLOCK):
        rindex = roffset + rbase
        rmask = rindex < rnumel
        r1 = rindex % 49
        r2 = (rindex // 49)
        tmp0 = tl.load(in_ptr0 + (r1 + (49*x0) + (12936*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp1 = tl.load(in_ptr1 + (r1 + (49*x0) + (12936*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp3 = tl.load(in_ptr2 + (r1 + (49*x0) + (12936*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp5 = tl.load(in_ptr3 + (r1 + (49*x0) + (12936*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp9 = tl.load(in_ptr4 + (r1 + (49*x0) + (12936*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp2 = tmp0 + tmp1
        tmp4 = tmp2 + tmp3
        tmp6 = tmp4 + tmp5
        tmp7 = tmp6.to(tl.float32)
        _tmp8 = tl.where(rmask & xmask, _tmp8 + tmp7, _tmp8)
        tmp10 = tmp9.to(tl.float32)
        tmp12 = tmp10 - tmp11
        tmp13 = tmp7 * tmp12
        _tmp14 = tl.where(rmask & xmask, _tmp14 + tmp13, _tmp14)
    tmp8 = tl.sum(_tmp8, 1)[:, None]
    tl.store(out_ptr0 + x0, tmp8, xmask)
    tmp14 = tl.sum(_tmp14, 1)[:, None]
    tl.store(out_ptr1 + x0, tmp14, xmask)
    tmp15 = tl.load(in_ptr6 + (x0), xmask)
    tmp16 = tmp14 * tmp15
    tl.store(out_ptr2 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp16, xmask)


def get_args():
    arg_0 = rand_strided((128, 264, 7, 7), (12936, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 264, 7, 7), (12936, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    arg_2 = rand_strided((128, 264, 7, 7), (12936, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    arg_3 = rand_strided((128, 264, 7, 7), (12936, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    arg_4 = rand_strided((128, 264, 7, 7), (12936, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    arg_5 = rand_strided((1, 264, 1, 1), (264, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    arg_6 = rand_strided((264,), (1,), device='cuda:0', dtype=torch.float32)
    arg_7 = rand_strided((264,), (1,), device='cuda:0', dtype=torch.float32)
    arg_8 = rand_strided((264,), (1,), device='cuda:0', dtype=torch.float32)
    arg_9 = rand_strided((264,), (1,), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7, arg_8, arg_9,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_red_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_37.run(*args, 264, 6272, grid=grid(264), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_red_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_37.benchmark_all_configs(*args, 264, 6272, grid=grid(264))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/gx/cgx3bvwbhn4322e66zkvf7xmr6ch3vg75ylfjn7yzts64xagqr2l.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.add, aten.convolution_backward, aten.native_batch_norm_backward
# aten._native_batch_norm_legit_functional => convert_element_type_326
# aten.add => add_308, add_313, add_318
# aten.convolution_backward => convolution_backward_28
# aten.native_batch_norm_backward => convert_element_type_477, convert_element_type_479, mul_605, mul_606, sub_110, sub_112, sub_113
# NOTE(review): pointwise half of batch-norm backward over (128, 264, 7, 7);
# xnumel = 1655808 = 128*264*49. Re-forms grad = in_ptr0..in_ptr3 summed, then
# applies the standard grad_input formula using the per-channel statistics
# (in_ptr5..in_ptr9, indexed by x1 = channel). The literal
# 0.00015943877551020407 is 1/6272 = 1/(128*49), i.e. 1/N for the mean terms.
# Output is cast back to fp16 for the following convolution_backward.
triton_poi_fused__native_batch_norm_legit_functional_add_convolution_backward_native_batch_norm_backward_38 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[2097152], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp16', 4: '*fp16', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: '*fp32', 9: '*fp32', 10: '*fp16', 11: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11), equal_to_1=())]})
@triton.jit
def triton_poi_fused__native_batch_norm_legit_functional_add_convolution_backward_native_batch_norm_backward_38(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, in_ptr6, in_ptr7, in_ptr8, in_ptr9, out_ptr1, xnumel, XBLOCK : tl.constexpr):
    xnumel = 1655808
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x3 = xindex
    x1 = (xindex // 49) % 264
    tmp0 = tl.load(in_ptr0 + (x3), xmask).to(tl.float32)
    tmp1 = tl.load(in_ptr1 + (x3), xmask).to(tl.float32)
    tmp3 = tl.load(in_ptr2 + (x3), xmask).to(tl.float32)
    tmp5 = tl.load(in_ptr3 + (x3), xmask).to(tl.float32)
    tmp8 = tl.load(in_ptr4 + (x3), xmask).to(tl.float32)
    tmp10 = tl.load(in_ptr5 + (x1), xmask)
    tmp12 = tl.load(in_ptr6 + (x1), xmask)
    tmp15 = tl.load(in_ptr7 + (x1), xmask)
    tmp20 = tl.load(in_ptr8 + (x1), xmask)
    tmp23 = tl.load(in_ptr9 + (x1), xmask)
    tmp2 = tmp0 + tmp1
    tmp4 = tmp2 + tmp3
    tmp6 = tmp4 + tmp5
    tmp7 = tmp6.to(tl.float32)
    tmp9 = tmp8.to(tl.float32)
    tmp11 = tmp9 - tmp10
    tmp13 = 0.00015943877551020407
    tmp14 = tmp12 * tmp13
    tmp16 = tmp15 * tmp15
    tmp17 = tmp14 * tmp16
    tmp18 = tmp11 * tmp17
    tmp19 = tmp7 - tmp18
    tmp21 = tmp20 * tmp13
    tmp22 = tmp19 - tmp21
    tmp24 = tmp15 * tmp23
    tmp25 = tmp22 * tmp24
    tmp26 = tmp25.to(tl.float32)
    tl.store(out_ptr1 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp26, xmask)


def get_args():
    arg_0 = rand_strided((128, 264, 7, 7), (12936, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 264, 7, 7), (12936, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    arg_2 = rand_strided((128, 264, 7, 7), (12936, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    arg_3 = rand_strided((128, 264, 7, 7), (12936, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    arg_4 = rand_strided((128, 264, 7, 7), (12936, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    arg_5 = rand_strided((1, 264, 1, 1), (264, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    arg_6 = rand_strided((264,), (1,), device='cuda:0', dtype=torch.float32)
    arg_7 = rand_strided((264,), (1,), device='cuda:0', dtype=torch.float32)
    arg_8 = rand_strided((264,), (1,), device='cuda:0', dtype=torch.float32)
    arg_9 = rand_strided((264,), (1,), device='cuda:0', dtype=torch.float32)
    arg_10 = rand_strided((128, 264, 7, 7), (12936, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7, arg_8, arg_9, arg_10,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__native_batch_norm_legit_functional_add_convolution_backward_native_batch_norm_backward_38.run(*args, 1655808, grid=grid(1655808), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__native_batch_norm_legit_functional_add_convolution_backward_native_batch_norm_backward_38.benchmark_all_configs(*args, 1655808, grid=grid(1655808))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/hv/chvxalrqvfx3gumbgl7gg6d4lhqqg37lbadisxr7scsc3tjuvlp6.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_480
# NOTE(review): contiguous elementwise fp16 -> fp32 cast of a (264, 960, 1, 1)
# weight tensor; xnumel = 253440 = 264 * 960.
triton_poi_fused__to_copy_39 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[262144], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_39(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 253440
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)


def get_args():
    arg_0 = rand_strided((264, 960, 1, 1), (960, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((264, 960, 1, 1), (960, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_39.run(*args, 253440, grid=grid(253440), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_39.benchmark_all_configs(*args, 253440, grid=grid(253440))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/4c/c4cxprfywuviqaunnoosyvctdmc2wg7xuk2qc4eawthvjfi2nh6c.py
# Original ATen: aten.mul, aten.sigmoid, aten.sigmoid_backward, aten.silu, aten.sum
# aten.mul => mul_608
# aten.sigmoid => sigmoid_51
# aten.sigmoid_backward => convert_element_type_481, convert_element_type_482, convert_element_type_483, mul_610, mul_611, sub_114
# aten.silu => convert_element_type_317, convert_element_type_318, mul_378, sigmoid_49
# aten.sum => sum_33
# NOTE(review): persistent reduction over the 7x7 window (rnumel = 49, RBLOCK = 64);
# one row per (batch, channel) pair, xnumel = 122880 = 128 * 960. Computes
# sum_r(grad * silu(x)) and multiplies by sigmoid'(gate) = s*(1-s) of the
# per-row gate value in in_ptr2 — presumably the squeeze-excite gate backward;
# result is written in place over in_out_ptr0 (mutated arg).
triton_per_fused_mul_sigmoid_sigmoid_backward_silu_sum_40 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import persistent_reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@persistent_reduction(
    size_hints=[131072, 64],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp16', 4: 'i32', 5: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4), equal_to_1=())]}
)
@triton.jit
def triton_per_fused_mul_sigmoid_sigmoid_backward_silu_sum_40(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, xnumel, rnumel, XBLOCK : tl.constexpr):
    xnumel = 122880
    rnumel = 49
    RBLOCK: tl.constexpr = 64
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rindex = tl.arange(0, RBLOCK)[None, :]
    rmask = rindex < rnumel
    r1 = rindex
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (r1 + (49*x0)), rmask, other=0).to(tl.float32)
    tmp1 = tl.load(in_ptr1 + (r1 + (49*x0)), rmask, other=0).to(tl.float32)
    tmp11 = tl.load(in_ptr2 + (x0), None).to(tl.float32)
    tmp2 = tmp1.to(tl.float32)
    tmp3 = tl.sigmoid(tmp2)
    tmp4 = tmp2 * tmp3
    tmp5 = tmp4.to(tl.float32)
    tmp6 = tmp0 * tmp5
    tmp8 = tl.where(rmask, tmp6, 0)
    tmp9 = tl.sum(tmp8, 1)[:, None]
    tmp10 = tmp9.to(tl.float32)
    tmp12 = tl.sigmoid(tmp11)
    tmp13 = tmp12.to(tl.float32)
    tmp14 = 1.0
    tmp15 = tmp14 - tmp13
    tmp16 = tmp13 * tmp15
    tmp17 = tmp10 * tmp16
    tmp18 = tmp17.to(tl.float32)
    tl.store(in_out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp18, None)


def get_args():
    arg_0 = rand_strided((128, 960, 1, 1), (960, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 960, 7, 7), (47040, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    arg_2 = rand_strided((128, 960, 7, 7), (47040, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    arg_3 = rand_strided((128, 960, 1, 1), (960, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1, arg_2, arg_3,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_per_fused_mul_sigmoid_sigmoid_backward_silu_sum_40.run(*args, 122880, 49, grid=grid(122880), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_per_fused_mul_sigmoid_sigmoid_backward_silu_sum_40.benchmark_all_configs(*args, 122880, 49, grid=grid(122880))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/c3/cc3ozjdbl56ribso2ymcx7ozxswqoswegte5cm6ndwts2vdg2nx7.py
# Original ATen: aten._to_copy, aten.convolution_backward
# aten._to_copy => convert_element_type_485
# aten.convolution_backward => sum_34
# NOTE(review): bias-gradient reduction — sums a (128, 960, 1, 1) fp16 tensor
# over the batch dimension (rnumel = 128) for each of the 960 channels and
# stores the fp32 result.
triton_per_fused__to_copy_convolution_backward_41 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import persistent_reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@persistent_reduction(
    size_hints=[1024, 128],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32', 3: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3), equal_to_1=())]}
)
@triton.jit
def triton_per_fused__to_copy_convolution_backward_41(in_ptr0, out_ptr1, xnumel, rnumel, XBLOCK : tl.constexpr):
    xnumel = 960
    rnumel = 128
    RBLOCK: tl.constexpr = 128
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rindex = tl.arange(0, RBLOCK)[None, :]
    rmask = rindex < rnumel
    r1 = rindex
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0 + (960*r1)), rmask & xmask, other=0).to(tl.float32)
    tmp2 = tl.where(rmask & xmask, tmp0, 0)
    tmp3 = tl.sum(tmp2, 1)[:, None]
    tmp4 = tmp3.to(tl.float32)
    tl.store(out_ptr1 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp4, xmask)


def get_args():
    arg_0 = rand_strided((128, 960, 1, 1), (960, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((960,), (1,), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_per_fused__to_copy_convolution_backward_41.run(*args, 960, 128, grid=grid(960), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_per_fused__to_copy_convolution_backward_41.benchmark_all_configs(*args, 960, 128, grid=grid(960))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/gr/cgrr3v6svqr3e2yhrucjdveyox77wsfeqkauijcny3rj2xckclad.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_484
# NOTE(review): contiguous elementwise fp16 -> fp32 cast of a (960, 80, 1, 1)
# weight tensor; xnumel = 76800 = 960 * 80.
triton_poi_fused__to_copy_42 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[131072], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_42(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 76800
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)


def get_args():
    arg_0 = rand_strided((960, 80, 1, 1), (80, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((960, 80, 1, 1), (80, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_42.run(*args, 76800, grid=grid(76800), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_42.benchmark_all_configs(*args, 76800, grid=grid(76800))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/ec/cecvpjtjrr7ig7pxbkmaghimjhrgacci4o2zw2nv422yxcdjktnf.py
# Original ATen: aten.add, aten.clone, aten.fill, aten.mul, aten.sigmoid, aten.sub
# aten.add => add_319
# aten.clone => clone_38
# aten.fill => full_like_9
# aten.mul => mul_612, mul_613, mul_614
# aten.sigmoid => sigmoid_73
# aten.sub => sub_115
# NOTE(review): in-place SiLU (x * sigmoid(x)) backward on a (128, 80, 1, 1)
# fp16 tensor (xnumel = 10240 = 128*80): computes
#   grad_in = grad * sigmoid(x) * (1 + x * (1 - sigmoid(x)))
# where grad lives in in_out_ptr0 and the saved activation x in in_ptr0.
# No mask on load/store: the full grid exactly covers xnumel.
triton_poi_fused_add_clone_fill_mul_sigmoid_sub_43 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[16384], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_add_clone_fill_mul_sigmoid_sub_43(in_out_ptr0, in_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 10240
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_out_ptr0 + (x0), None).to(tl.float32)
    tmp1 = tl.load(in_ptr0 + (x0), None).to(tl.float32)
    tmp2 = tl.sigmoid(tmp1)
    tmp3 = 1.0
    tmp4 = tmp3 - tmp2
    tmp5 = tmp1 * tmp4
    tmp6 = tmp5 + tmp3
    tmp7 = tmp2 * tmp6
    tmp8 = tmp0 * tmp7
    tl.store(in_out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp8, None)


def get_args():
    arg_0 = rand_strided((128, 80, 1, 1), (80, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 80, 1, 1), (80, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused_add_clone_fill_mul_sigmoid_sub_43.run(*args, 10240, grid=grid(10240), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused_add_clone_fill_mul_sigmoid_sub_43.benchmark_all_configs(*args, 10240, grid=grid(10240))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/tf/ctfzhkol6r3oykhxrse2zzxlztwo7vdgfbyprm7vgxoac6npm7jy.py
# Original ATen: aten._to_copy, aten.convolution_backward
# aten._to_copy => convert_element_type_487
# aten.convolution_backward => sum_35
# NOTE(review): bias-gradient reduction — same pattern as kernel 41 but for the
# 80-channel squeeze layer: sums a (128, 80, 1, 1) fp16 tensor over the batch
# dimension (rnumel = 128) and stores the fp32 per-channel result.
triton_per_fused__to_copy_convolution_backward_44 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import persistent_reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@persistent_reduction(
    size_hints=[128, 128],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32', 3: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3), equal_to_1=())]}
)
@triton.jit
def triton_per_fused__to_copy_convolution_backward_44(in_ptr0, out_ptr1, xnumel, rnumel, XBLOCK : tl.constexpr):
    xnumel = 80
    rnumel = 128
    RBLOCK: tl.constexpr = 128
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rindex = tl.arange(0, RBLOCK)[None, :]
    rmask = rindex < rnumel
    r1 = rindex
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0 + (80*r1)), rmask & xmask, other=0).to(tl.float32)
    tmp2 = tl.where(rmask & xmask, tmp0, 0)
    tmp3 = tl.sum(tmp2, 1)[:, None]
    tmp4 = tmp3.to(tl.float32)
    tl.store(out_ptr1 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp4, xmask)


def get_args():
    arg_0 = rand_strided((128, 80, 1, 1), (80, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((80,), (1,), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_per_fused__to_copy_convolution_backward_44.run(*args, 80, 128, grid=grid(80), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_per_fused__to_copy_convolution_backward_44.benchmark_all_configs(*args, 80, 128, grid=grid(80))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/pa/cpatc2nbbyd25gudjvjrjc54kh5oivkwx63acex4sswslv6ylzvi.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_486
# NOTE(review): contiguous elementwise fp16 -> fp32 cast of an (80, 960, 1, 1)
# weight tensor; xnumel = 76800 = 80 * 960 (same element count as kernel 42,
# transposed shape).
triton_poi_fused__to_copy_45 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[131072], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_45(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 76800
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)


def get_args():
    arg_0 = rand_strided((80, 960, 1, 1), (960, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((80, 960, 1, 1), (960, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_45.run(*args, 76800, grid=grid(76800), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_45.benchmark_all_configs(*args, 76800, grid=grid(76800))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/t2/ct2iy46nsnqymwwljjboc6oz6xdfqwrjdsjacocdhzt74zuufoj7.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.add, aten.clone, aten.div, aten.fill, aten.mul, aten.native_batch_norm_backward, aten.sigmoid, aten.sub
# aten._native_batch_norm_legit_functional => convert_element_type_315
# aten.add => add_320, add_321
# aten.clone => clone_37
# aten.div => div_4
# aten.fill => full_like_10
# aten.mul => mul_609, mul_615, mul_616, mul_617
# aten.native_batch_norm_backward => convert_element_type_488, mul_618, mul_626, sub_117, sum_36, sum_37
# aten.sigmoid => sigmoid_51, sigmoid_74
# aten.sub => sub_116
#
# NOTE(review): auto-generated TorchInductor kernel; the triple-quoted string
# below is the Triton source handed to async_compile.triton and must not be
# edited by hand.
# Per-channel reduction: x = 960 channels, r = 6272 (= 128 batch * 49 spatial).
# Recomputes the fused SiLU/SE backward gradient in the loop, then accumulates
# sum(grad) -> out_ptr0 and sum(grad * (x - mean)) -> out_ptr1 in fp32
# (inputs are fp16); finally out_ptr2 = out_ptr1 * in_ptr6 (per-channel scale).
triton_red_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_46 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@reduction(
    size_hints=[1024, 8192],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp16', 4: '*fp16', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: '*fp32', 9: '*fp32', 10: 'i32', 11: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_46(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, in_ptr6, out_ptr0, out_ptr1, out_ptr2, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
    xnumel = 960
    rnumel = 6272
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rbase = tl.arange(0, RBLOCK)[None, :]
    x0 = xindex
    _tmp17 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
    tmp20 = tl.load(in_ptr5 + (x0), xmask)
    _tmp23 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
    for roffset in range(0, rnumel, RBLOCK):
        rindex = roffset + rbase
        rmask = rindex < rnumel
        r1 = rindex % 49
        r2 = (rindex // 49)
        tmp0 = tl.load(in_ptr0 + (r1 + (49*x0) + (47040*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp1 = tl.load(in_ptr1 + (x0 + (960*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp4 = tl.load(in_ptr2 + (x0 + (960*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp8 = tl.load(in_ptr3 + (r1 + (49*x0) + (47040*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp18 = tl.load(in_ptr4 + (r1 + (49*x0) + (47040*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp2 = tl.sigmoid(tmp1)
        tmp3 = tmp0 * tmp2
        tmp5 = 49.0
        tmp6 = tmp4 / tmp5
        tmp7 = tmp3 + tmp6
        tmp9 = tl.sigmoid(tmp8)
        tmp10 = 1.0
        tmp11 = tmp10 - tmp9
        tmp12 = tmp8 * tmp11
        tmp13 = tmp12 + tmp10
        tmp14 = tmp9 * tmp13
        tmp15 = tmp7 * tmp14
        tmp16 = tmp15.to(tl.float32)
        _tmp17 = tl.where(rmask & xmask, _tmp17 + tmp16, _tmp17)
        tmp19 = tmp18.to(tl.float32)
        tmp21 = tmp19 - tmp20
        tmp22 = tmp16 * tmp21
        _tmp23 = tl.where(rmask & xmask, _tmp23 + tmp22, _tmp23)
    tmp17 = tl.sum(_tmp17, 1)[:, None]
    tl.store(out_ptr0 + x0, tmp17, xmask)
    tmp23 = tl.sum(_tmp23, 1)[:, None]
    tl.store(out_ptr1 + x0, tmp23, xmask)
    tmp24 = tl.load(in_ptr6 + (x0), xmask)
    tmp25 = tmp23 * tmp24
    tl.store(out_ptr2 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp25, xmask)


def get_args():
    arg_0 = rand_strided((128, 960, 7, 7), (47040, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 960, 1, 1), (960, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    arg_2 = rand_strided((128, 960, 1, 1), (960, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    arg_3 = rand_strided((128, 960, 7, 7), (47040, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    arg_4 = rand_strided((128, 960, 7, 7), (47040, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    arg_5 = rand_strided((1, 960, 1, 1), (960, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    arg_6 = rand_strided((960,), (1,), device='cuda:0', dtype=torch.float32)
    arg_7 = rand_strided((960,), (1,), device='cuda:0', dtype=torch.float32)
    arg_8 = rand_strided((960,), (1,), device='cuda:0', dtype=torch.float32)
    arg_9 = rand_strided((960,), (1,), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7, arg_8, arg_9,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_red_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_46.run(*args, 960, 6272, grid=grid(960), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_red_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_46.benchmark_all_configs(*args, 960, 6272, grid=grid(960))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/hn/chn3tshreitokdet2wgwgbsbuqqplci2k63sdatoy3lq6bjeylvs.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.add, aten.clone, aten.div, aten.fill, aten.mul, aten.native_batch_norm_backward, aten.sigmoid, aten.sub
# aten._native_batch_norm_legit_functional => convert_element_type_315
# aten.add => add_320, add_321
# aten.clone => clone_37
# aten.div => div_4
# aten.fill => full_like_10
# aten.mul => mul_609, mul_615, mul_616, mul_617
# aten.native_batch_norm_backward => convert_element_type_488, mul_624, sub_117, sub_119, sub_120
# aten.sigmoid => sigmoid_51, sigmoid_74
# aten.sub => sub_116
#
# NOTE(review): auto-generated TorchInductor kernel; do not hand-edit the
# Triton source string.
# Pointwise pass over 6021120 elements (= 128 * 960 * 7 * 7): recomputes the
# same fused SiLU/SE gradient as kernel _46 above, then applies the
# batch-norm-backward combine using the per-channel statistics from
# in_ptr5..in_ptr8 (0.00015943877551020407 == 1 / 6272) and writes the fp32
# input gradient to out_ptr0.
triton_poi_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_47 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[8388608], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp16', 4: '*fp16', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: '*fp32', 9: '*fp32', 10: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10), equal_to_1=())]})
@triton.jit
def triton_poi_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_47(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, in_ptr6, in_ptr7, in_ptr8, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 6021120
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x3 = xindex
    x4 = (xindex // 49)
    x1 = (xindex // 49) % 960
    tmp0 = tl.load(in_ptr0 + (x3), None).to(tl.float32)
    tmp1 = tl.load(in_ptr1 + (x4), None).to(tl.float32)
    tmp4 = tl.load(in_ptr2 + (x4), None).to(tl.float32)
    tmp8 = tl.load(in_ptr3 + (x3), None).to(tl.float32)
    tmp17 = tl.load(in_ptr4 + (x3), None).to(tl.float32)
    tmp19 = tl.load(in_ptr5 + (x1), None)
    tmp21 = tl.load(in_ptr6 + (x1), None)
    tmp24 = tl.load(in_ptr7 + (x1), None)
    tmp29 = tl.load(in_ptr8 + (x1), None)
    tmp2 = tl.sigmoid(tmp1)
    tmp3 = tmp0 * tmp2
    tmp5 = 49.0
    tmp6 = tmp4 / tmp5
    tmp7 = tmp3 + tmp6
    tmp9 = tl.sigmoid(tmp8)
    tmp10 = 1.0
    tmp11 = tmp10 - tmp9
    tmp12 = tmp8 * tmp11
    tmp13 = tmp12 + tmp10
    tmp14 = tmp9 * tmp13
    tmp15 = tmp7 * tmp14
    tmp16 = tmp15.to(tl.float32)
    tmp18 = tmp17.to(tl.float32)
    tmp20 = tmp18 - tmp19
    tmp22 = 0.00015943877551020407
    tmp23 = tmp21 * tmp22
    tmp25 = tmp24 * tmp24
    tmp26 = tmp23 * tmp25
    tmp27 = tmp20 * tmp26
    tmp28 = tmp16 - tmp27
    tmp30 = tmp29 * tmp22
    tmp31 = tmp28 - tmp30
    tl.store(out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp31, None)


def get_args():
    arg_0 = rand_strided((128, 960, 7, 7), (47040, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 960, 1, 1), (960, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    arg_2 = rand_strided((128, 960, 1, 1), (960, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    arg_3 = rand_strided((128, 960, 7, 7), (47040, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    arg_4 = rand_strided((128, 960, 7, 7), (47040, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    arg_5 = rand_strided((1, 960, 1, 1), (960, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    arg_6 = rand_strided((960,), (1,), device='cuda:0', dtype=torch.float32)
    arg_7 = rand_strided((960,), (1,), device='cuda:0', dtype=torch.float32)
    arg_8 = rand_strided((960,), (1,), device='cuda:0', dtype=torch.float32)
    arg_9 = rand_strided((128, 960, 7, 7), (47040, 49, 7, 1), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7, arg_8, arg_9,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_47.run(*args, 6021120, grid=grid(6021120), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_47.benchmark_all_configs(*args, 6021120, grid=grid(6021120))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/cm/ccm5w6opbpyzojqp4qntjyz3d5dzh7ybk3ty3hgcp6xmwewzvdux.py
# Original ATen: aten.convolution_backward
# aten.convolution_backward => convolution_backward_31
#
# NOTE(review): auto-generated TorchInductor kernel; do not hand-edit the
# Triton source string.
# Extracts channels 720..959 (offset 35280 = 720*49) of the fp32
# (128, 960, 7, 7) gradient, scales each channel by
# in_ptr1[720+c] * in_ptr2[720+c], casts to fp16, and writes a contiguous
# (128, 240, 7, 7) slice for the grouped convolution backward.
triton_poi_fused_convolution_backward_48 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[2097152], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp32', 2: '*fp32', 3: '*fp16', 4: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4), equal_to_1=())]})
@triton.jit
def triton_poi_fused_convolution_backward_48(in_ptr0, in_ptr1, in_ptr2, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 1505280
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x2 = (xindex // 11760)
    x3 = xindex % 11760
    x1 = (xindex // 49) % 240
    x4 = xindex
    tmp0 = tl.load(in_ptr0 + (35280 + x3 + (47040*x2)), None)
    tmp1 = tl.load(in_ptr1 + (720 + x1), None)
    tmp2 = tl.load(in_ptr2 + (720 + x1), None)
    tmp3 = tmp1 * tmp2
    tmp4 = tmp0 * tmp3
    tmp5 = tmp4.to(tl.float32)
    tl.store(out_ptr0 + (x4 + tl.zeros([XBLOCK], tl.int32)), tmp5, None)


def get_args():
    arg_0 = rand_strided((128, 960, 7, 7), (47040, 49, 7, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((960,), (1,), device='cuda:0', dtype=torch.float32)
    arg_2 = rand_strided((960,), (1,), device='cuda:0', dtype=torch.float32)
    arg_3 = rand_strided((128, 240, 7, 7), (11760, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1, arg_2, arg_3,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused_convolution_backward_48.run(*args, 1505280, grid=grid(1505280), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused_convolution_backward_48.benchmark_all_configs(*args, 1505280, grid=grid(1505280))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/bg/cbgklbbew275guj34lbxmgcsod4kcchr5225smzpdyhnn7zswlfw.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_491
#
# NOTE(review): auto-generated TorchInductor kernel; do not hand-edit the
# Triton source string.
# Elementwise fp16 -> fp32 upcast of a (240, 1, 9, 9) depthwise conv weight
# (19440 elements).
triton_poi_fused__to_copy_49 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[32768], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_49(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 19440
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)


def get_args():
    arg_0 = rand_strided((240, 1, 9, 9), (81, 81, 9, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((240, 1, 9, 9), (81, 81, 9, 1), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_49.run(*args, 19440, grid=grid(19440), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_49.benchmark_all_configs(*args, 19440, grid=grid(19440))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/aa/caatae3nbvudrfnsq366oeojppmx457zjpvs4yumsehtdmsg23py.py
# Original ATen: aten.convolution_backward
# aten.convolution_backward => convolution_backward_32
#
# NOTE(review): auto-generated TorchInductor kernel; do not hand-edit the
# Triton source string.
# Same pattern as kernel _48 but for channels 480..719 (offset 23520 = 480*49):
# slice, scale by in_ptr1[480+c] * in_ptr2[480+c], cast to fp16 into a
# contiguous (128, 240, 7, 7) output.
triton_poi_fused_convolution_backward_50 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[2097152], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp32', 2: '*fp32', 3: '*fp16', 4: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4), equal_to_1=())]})
@triton.jit
def triton_poi_fused_convolution_backward_50(in_ptr0, in_ptr1, in_ptr2, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 1505280
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x2 = (xindex // 11760)
    x3 = xindex % 11760
    x1 = (xindex // 49) % 240
    x4 = xindex
    tmp0 = tl.load(in_ptr0 + (23520 + x3 + (47040*x2)), None)
    tmp1 = tl.load(in_ptr1 + (480 + x1), None)
    tmp2 = tl.load(in_ptr2 + (480 + x1), None)
    tmp3 = tmp1 * tmp2
    tmp4 = tmp0 * tmp3
    tmp5 = tmp4.to(tl.float32)
    tl.store(out_ptr0 + (x4 + tl.zeros([XBLOCK], tl.int32)), tmp5, None)


def get_args():
    arg_0 = rand_strided((128, 960, 7, 7), (47040, 49, 7, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((960,), (1,), device='cuda:0', dtype=torch.float32)
    arg_2 = rand_strided((960,), (1,), device='cuda:0', dtype=torch.float32)
    arg_3 = rand_strided((128, 240, 7, 7), (11760, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1, arg_2, arg_3,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused_convolution_backward_50.run(*args, 1505280, grid=grid(1505280), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused_convolution_backward_50.benchmark_all_configs(*args, 1505280, grid=grid(1505280))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/iy/ciyzljcwrx3unj6zlla5nxhjqwjz6zppzmuz7wma6tcow2pjfjva.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_492
#
# NOTE(review): auto-generated TorchInductor kernel; do not hand-edit the
# Triton source string.
# Elementwise fp16 -> fp32 upcast of a (240, 1, 7, 7) depthwise conv weight
# (11760 elements).
triton_poi_fused__to_copy_51 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[16384], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_51(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 11760
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)


def get_args():
    arg_0 = rand_strided((240, 1, 7, 7), (49, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((240, 1, 7, 7), (49, 49, 7, 1), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_51.run(*args, 11760, grid=grid(11760), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_51.benchmark_all_configs(*args, 11760, grid=grid(11760))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/6i/c6izu5jaho2aqwpjlvtonq5ar6inxincvvpnnrzzj3npyab2r3ga.py
# Original ATen: aten.convolution_backward
# aten.convolution_backward => convolution_backward_33
#
# NOTE(review): auto-generated TorchInductor kernel; do not hand-edit the
# Triton source string.
# Same pattern as kernel _48 but for channels 240..479 (offset 11760 = 240*49):
# slice, scale by in_ptr1[240+c] * in_ptr2[240+c], cast to fp16 into a
# contiguous (128, 240, 7, 7) output.
triton_poi_fused_convolution_backward_52 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[2097152], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp32', 2: '*fp32', 3: '*fp16', 4: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4), equal_to_1=())]})
@triton.jit
def triton_poi_fused_convolution_backward_52(in_ptr0, in_ptr1, in_ptr2, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 1505280
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x2 = (xindex // 11760)
    x3 = xindex % 11760
    x1 = (xindex // 49) % 240
    x4 = xindex
    tmp0 = tl.load(in_ptr0 + (11760 + x3 + (47040*x2)), None)
    tmp1 = tl.load(in_ptr1 + (240 + x1), None)
    tmp2 = tl.load(in_ptr2 + (240 + x1), None)
    tmp3 = tmp1 * tmp2
    tmp4 = tmp0 * tmp3
    tmp5 = tmp4.to(tl.float32)
    tl.store(out_ptr0 + (x4 + tl.zeros([XBLOCK], tl.int32)), tmp5, None)


def get_args():
    arg_0 = rand_strided((128, 960, 7, 7), (47040, 49, 7, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((960,), (1,), device='cuda:0', dtype=torch.float32)
    arg_2 = rand_strided((960,), (1,), device='cuda:0', dtype=torch.float32)
    arg_3 = rand_strided((128, 240, 7, 7), (11760, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1, arg_2, arg_3,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused_convolution_backward_52.run(*args, 1505280, grid=grid(1505280), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused_convolution_backward_52.benchmark_all_configs(*args, 1505280, grid=grid(1505280))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/cq/ccq24kfa5wdpfsiaanwmmkcev4g2dsxwzynkl2fv553qth6ixrfz.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_493
#
# NOTE(review): auto-generated TorchInductor kernel; do not hand-edit the
# Triton source string.
# Elementwise fp16 -> fp32 upcast of a (240, 1, 5, 5) depthwise conv weight
# (6000 elements).
triton_poi_fused__to_copy_53 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[8192], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_53(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 6000
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)


def get_args():
    arg_0 = rand_strided((240, 1, 5, 5), (25, 25, 5, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((240, 1, 5, 5), (25, 25, 5, 1), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_53.run(*args, 6000, grid=grid(6000), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_53.benchmark_all_configs(*args, 6000, grid=grid(6000))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/iu/ciu2jjei5ptmwjfdzudopltuvgs5jt6yeve7jcdvodi4ozwv2j6u.py
# Original ATen: aten.convolution_backward
# aten.convolution_backward => convolution_backward_34
#
# NOTE(review): auto-generated TorchInductor kernel; do not hand-edit the
# Triton source string.
# Same pattern as kernel _48 but for channels 0..239 (zero channel offset):
# slice, scale by in_ptr1[c] * in_ptr2[c], cast to fp16 into a contiguous
# (128, 240, 7, 7) output.
triton_poi_fused_convolution_backward_54 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[2097152], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp32', 2: '*fp32', 3: '*fp16', 4: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4), equal_to_1=())]})
@triton.jit
def triton_poi_fused_convolution_backward_54(in_ptr0, in_ptr1, in_ptr2, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 1505280
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x2 = (xindex // 11760)
    x3 = xindex % 11760
    x1 = (xindex // 49) % 240
    x4 = xindex
    tmp0 = tl.load(in_ptr0 + (x3 + (47040*x2)), None)
    tmp1 = tl.load(in_ptr1 + (x1), None)
    tmp2 = tl.load(in_ptr2 + (x1), None)
    tmp3 = tmp1 * tmp2
    tmp4 = tmp0 * tmp3
    tmp5 = tmp4.to(tl.float32)
    tl.store(out_ptr0 + (x4 + tl.zeros([XBLOCK], tl.int32)), tmp5, None)


def get_args():
    arg_0 = rand_strided((128, 960, 7, 7), (47040, 49, 7, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((960,), (1,), device='cuda:0', dtype=torch.float32)
    arg_2 = rand_strided((960,), (1,), device='cuda:0', dtype=torch.float32)
    arg_3 = rand_strided((128, 240, 7, 7), (11760, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1, arg_2, arg_3,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused_convolution_backward_54.run(*args, 1505280, grid=grid(1505280), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused_convolution_backward_54.benchmark_all_configs(*args, 1505280, grid=grid(1505280))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/vw/cvw5bsibqyft5fe6qyposdjgeixpccxa5ewo6nsyrtdsomfikzwt.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_494
#
# NOTE(review): auto-generated TorchInductor kernel; do not hand-edit the
# Triton source string.
# Elementwise fp16 -> fp32 upcast of a (240, 1, 3, 3) depthwise conv weight
# (2160 elements).
triton_poi_fused__to_copy_55 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[4096], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_55(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 2160
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)


def get_args():
    arg_0 = rand_strided((240, 1, 3, 3), (9, 9, 3, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((240, 1, 3, 3), (9, 9, 3, 1), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_55.run(*args, 2160, grid=grid(2160), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_55.benchmark_all_configs(*args, 2160, grid=grid(2160))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
| # kernel path: /tmp/torchinductor_shunting/zd/czdohgrrggyfbylj276pqwjy7zlv5b2cu55bzfn6nk26nmdmxmon.py | |
| # Original ATen: aten.cat | |
| # aten.cat => cat_47 | |
| # Copy kernel for one input slice of aten.cat: reads a contiguous | |
| # (128, 240, 14, 14) fp16 tensor (47040 elements per batch) and stores it | |
| # into a (128, 960, 14, 14) destination with per-batch stride 188160, i.e. | |
| # one channel group of the concatenated output. Loads/stores pass | |
| # mask=None, so the launch presumably covers xnumel=6021120 exactly. | |
| triton_poi_fused_cat_56 = async_compile.triton(''' | |
| import triton | |
| import triton.language as tl | |
| from torch._inductor.ir import ReductionHint | |
| from torch._inductor.ir import TileHint | |
| from torch._inductor.triton_heuristics import pointwise | |
| from torch._inductor.utils import instance_descriptor | |
| from torch._dynamo.testing import rand_strided | |
| from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
| import torch | |
| from torch._inductor.triton_heuristics import grid | |
| @pointwise(size_hints=[8388608], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]}) | |
| @triton.jit | |
| def triton_poi_fused_cat_56(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr): | |
| xnumel = 6021120 | |
| xoffset = tl.program_id(0) * XBLOCK | |
| xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
| xmask = xindex < xnumel | |
| x2 = xindex | |
| x0 = xindex % 47040 | |
| x1 = (xindex // 47040) | |
| tmp0 = tl.load(in_ptr0 + (x2), None).to(tl.float32) | |
| tl.store(out_ptr0 + (x0 + (188160*x1) + tl.zeros([XBLOCK], tl.int32)), tmp0, None) | |
| def get_args(): | |
| arg_0 = rand_strided((128, 240, 14, 14), (47040, 196, 14, 1), device='cuda:0', dtype=torch.float16) | |
| arg_1 = rand_strided((128, 240, 14, 14), (188160, 196, 14, 1), device='cuda:0', dtype=torch.float16) | |
| return arg_0, arg_1, | |
| def call(args): | |
| with torch.cuda._DeviceGuard(0): | |
| torch.cuda.set_device(0) | |
| stream0 = get_cuda_stream(0) | |
| triton_poi_fused_cat_56.run(*args, 6021120, grid=grid(6021120), stream=stream0) | |
| def benchmark_all_configs(args): | |
| with torch.cuda._DeviceGuard(0): | |
| torch.cuda.set_device(0) | |
| return triton_poi_fused_cat_56.benchmark_all_configs(*args, 6021120, grid=grid(6021120)) | |
| if __name__ == '__main__': | |
| from torch._inductor.utils import get_num_bytes | |
| from triton.testing import do_bench | |
| args = get_args() | |
| ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
| num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9 | |
| gb_per_s = num_gb / (ms / 1e3) | |
| print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
| ''') | |
| # kernel path: /tmp/torchinductor_shunting/6g/c6guclv6h7xk7kazbyjy6st57pkopg5lzsmit37ifpr7d46xtfho.py | |
| # Original ATen: aten._native_batch_norm_legit_functional, aten.mul, aten.native_batch_norm_backward | |
| # aten._native_batch_norm_legit_functional => convert_element_type_307 | |
| # aten.mul => mul_629 | |
| # aten.native_batch_norm_backward => convert_element_type_495, mul_630, mul_638, sub_122, sum_38, sum_39 | |
| # Per-channel reduction stage of batch-norm backward over a | |
| # (128, 960, 14, 14) fp16 tensor: xnumel=960 channels, | |
| # rnumel=25088 = 128 batches * 196 pixels. Accumulates | |
| # sum(in0 * in1) into out_ptr0 and sum(in0 * in1 * (x - mean)) into | |
| # out_ptr1 (x from in_ptr2, mean from in_ptr3), then writes the latter | |
| # scaled by the per-channel factor in_ptr4 (presumably rsqrt(var + eps) | |
| # -- confirm against the wrapper code) into out_ptr2. | |
| triton_red_fused__native_batch_norm_legit_functional_mul_native_batch_norm_backward_57 = async_compile.triton(''' | |
| import triton | |
| import triton.language as tl | |
| from torch._inductor.ir import ReductionHint | |
| from torch._inductor.ir import TileHint | |
| from torch._inductor.triton_heuristics import reduction | |
| from torch._inductor.utils import instance_descriptor | |
| from torch._dynamo.testing import rand_strided | |
| from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
| import torch | |
| from torch._inductor.triton_heuristics import grid | |
| @reduction( | |
| size_hints=[1024, 32768], | |
| reduction_hint=ReductionHint.INNER, | |
| filename=__file__, | |
| meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp32', 4: '*fp32', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: 'i32', 9: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9), equal_to_1=())]} | |
| ) | |
| @triton.jit | |
| def triton_red_fused__native_batch_norm_legit_functional_mul_native_batch_norm_backward_57(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, out_ptr0, out_ptr1, out_ptr2, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr): | |
| xnumel = 960 | |
| rnumel = 25088 | |
| xoffset = tl.program_id(0) * XBLOCK | |
| xindex = xoffset + tl.arange(0, XBLOCK)[:, None] | |
| xmask = xindex < xnumel | |
| rbase = tl.arange(0, RBLOCK)[None, :] | |
| x0 = xindex | |
| _tmp4 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0 | |
| tmp7 = tl.load(in_ptr3 + (x0), xmask) | |
| _tmp10 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0 | |
| for roffset in range(0, rnumel, RBLOCK): | |
| rindex = roffset + rbase | |
| rmask = rindex < rnumel | |
| r1 = rindex % 196 | |
| r2 = (rindex // 196) | |
| tmp0 = tl.load(in_ptr0 + (r1 + (196*x0) + (188160*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32) | |
| tmp1 = tl.load(in_ptr1 + (r1 + (196*x0) + (188160*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32) | |
| tmp5 = tl.load(in_ptr2 + (r1 + (196*x0) + (188160*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32) | |
| tmp2 = tmp0 * tmp1 | |
| tmp3 = tmp2.to(tl.float32) | |
| _tmp4 = tl.where(rmask & xmask, _tmp4 + tmp3, _tmp4) | |
| tmp6 = tmp5.to(tl.float32) | |
| tmp8 = tmp6 - tmp7 | |
| tmp9 = tmp3 * tmp8 | |
| _tmp10 = tl.where(rmask & xmask, _tmp10 + tmp9, _tmp10) | |
| tmp4 = tl.sum(_tmp4, 1)[:, None] | |
| tl.store(out_ptr0 + x0, tmp4, xmask) | |
| tmp10 = tl.sum(_tmp10, 1)[:, None] | |
| tl.store(out_ptr1 + x0, tmp10, xmask) | |
| tmp11 = tl.load(in_ptr4 + (x0), xmask) | |
| tmp12 = tmp10 * tmp11 | |
| tl.store(out_ptr2 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp12, xmask) | |
| def get_args(): | |
| arg_0 = rand_strided((128, 960, 14, 14), (188160, 196, 14, 1), device='cuda:0', dtype=torch.float16) | |
| arg_1 = rand_strided((128, 960, 14, 14), (188160, 196, 14, 1), device='cuda:0', dtype=torch.float16) | |
| arg_2 = rand_strided((128, 960, 14, 14), (188160, 196, 14, 1), device='cuda:0', dtype=torch.float16) | |
| arg_3 = rand_strided((1, 960, 1, 1), (960, 1, 1, 1), device='cuda:0', dtype=torch.float32) | |
| arg_4 = rand_strided((960,), (1,), device='cuda:0', dtype=torch.float32) | |
| arg_5 = rand_strided((960,), (1,), device='cuda:0', dtype=torch.float32) | |
| arg_6 = rand_strided((960,), (1,), device='cuda:0', dtype=torch.float32) | |
| arg_7 = rand_strided((960,), (1,), device='cuda:0', dtype=torch.float32) | |
| return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7, | |
| def call(args): | |
| with torch.cuda._DeviceGuard(0): | |
| torch.cuda.set_device(0) | |
| stream0 = get_cuda_stream(0) | |
| triton_red_fused__native_batch_norm_legit_functional_mul_native_batch_norm_backward_57.run(*args, 960, 25088, grid=grid(960), stream=stream0) | |
| def benchmark_all_configs(args): | |
| with torch.cuda._DeviceGuard(0): | |
| torch.cuda.set_device(0) | |
| return triton_red_fused__native_batch_norm_legit_functional_mul_native_batch_norm_backward_57.benchmark_all_configs(*args, 960, 25088, grid=grid(960)) | |
| if __name__ == '__main__': | |
| from torch._inductor.utils import get_num_bytes | |
| from triton.testing import do_bench | |
| args = get_args() | |
| ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
| num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9 | |
| gb_per_s = num_gb / (ms / 1e3) | |
| print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
| ''') | |
| # kernel path: /tmp/torchinductor_shunting/kq/ckqzmpbl3qlmroyipqv5z46lwldyab7axrjybp23plftx2joawdv.py | |
| # Original ATen: aten._native_batch_norm_legit_functional, aten.convolution_backward, aten.mul, aten.native_batch_norm_backward | |
| # aten._native_batch_norm_legit_functional => convert_element_type_307 | |
| # aten.convolution_backward => convolution_backward_35 | |
| # aten.mul => mul_629 | |
| # aten.native_batch_norm_backward => convert_element_type_495, convert_element_type_497, mul_636, mul_637, sub_122, sub_124, sub_125 | |
| # In-place pointwise stage of batch-norm backward (fused with a mul): | |
| # rewrites in_out_ptr0 with grad_input. With g = in_out * in0 and | |
| # x = in1, it computes | |
| #   (g - (x - mean) * (sum_gx/N) * r^2 - sum_g/N) * (r * w) | |
| # where r is presumably rsqrt(var + eps) (in_ptr4) and w the affine | |
| # weight (in_ptr6) -- confirm against the wrapper. The literal | |
| # 3.985969387755102e-05 is 1/25088 = 1/(128*196), the per-channel | |
| # reduction count. Mask is None on every access, so xnumel=24084480 is | |
| # presumably an exact multiple of the launch block size. | |
| triton_poi_fused__native_batch_norm_legit_functional_convolution_backward_mul_native_batch_norm_backward_58 = async_compile.triton(''' | |
| import triton | |
| import triton.language as tl | |
| from torch._inductor.ir import ReductionHint | |
| from torch._inductor.ir import TileHint | |
| from torch._inductor.triton_heuristics import pointwise | |
| from torch._inductor.utils import instance_descriptor | |
| from torch._dynamo.testing import rand_strided | |
| from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
| import torch | |
| from torch._inductor.triton_heuristics import grid | |
| @pointwise(size_hints=[33554432], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp32', 4: '*fp32', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8), equal_to_1=())]}) | |
| @triton.jit | |
| def triton_poi_fused__native_batch_norm_legit_functional_convolution_backward_mul_native_batch_norm_backward_58(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, in_ptr6, xnumel, XBLOCK : tl.constexpr): | |
| xnumel = 24084480 | |
| xoffset = tl.program_id(0) * XBLOCK | |
| xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
| xmask = xindex < xnumel | |
| x3 = xindex | |
| x1 = (xindex // 196) % 960 | |
| tmp0 = tl.load(in_out_ptr0 + (x3), None).to(tl.float32) | |
| tmp1 = tl.load(in_ptr0 + (x3), None).to(tl.float32) | |
| tmp4 = tl.load(in_ptr1 + (x3), None).to(tl.float32) | |
| tmp6 = tl.load(in_ptr2 + (x1), None) | |
| tmp8 = tl.load(in_ptr3 + (x1), None) | |
| tmp11 = tl.load(in_ptr4 + (x1), None) | |
| tmp16 = tl.load(in_ptr5 + (x1), None) | |
| tmp19 = tl.load(in_ptr6 + (x1), None) | |
| tmp2 = tmp0 * tmp1 | |
| tmp3 = tmp2.to(tl.float32) | |
| tmp5 = tmp4.to(tl.float32) | |
| tmp7 = tmp5 - tmp6 | |
| tmp9 = 3.985969387755102e-05 | |
| tmp10 = tmp8 * tmp9 | |
| tmp12 = tmp11 * tmp11 | |
| tmp13 = tmp10 * tmp12 | |
| tmp14 = tmp7 * tmp13 | |
| tmp15 = tmp3 - tmp14 | |
| tmp17 = tmp16 * tmp9 | |
| tmp18 = tmp15 - tmp17 | |
| tmp20 = tmp11 * tmp19 | |
| tmp21 = tmp18 * tmp20 | |
| tmp22 = tmp21.to(tl.float32) | |
| tl.store(in_out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp22, None) | |
| def get_args(): | |
| arg_0 = rand_strided((128, 960, 14, 14), (188160, 196, 14, 1), device='cuda:0', dtype=torch.float16) | |
| arg_1 = rand_strided((128, 960, 14, 14), (188160, 196, 14, 1), device='cuda:0', dtype=torch.float16) | |
| arg_2 = rand_strided((128, 960, 14, 14), (188160, 196, 14, 1), device='cuda:0', dtype=torch.float16) | |
| arg_3 = rand_strided((1, 960, 1, 1), (960, 1, 1, 1), device='cuda:0', dtype=torch.float32) | |
| arg_4 = rand_strided((960,), (1,), device='cuda:0', dtype=torch.float32) | |
| arg_5 = rand_strided((960,), (1,), device='cuda:0', dtype=torch.float32) | |
| arg_6 = rand_strided((960,), (1,), device='cuda:0', dtype=torch.float32) | |
| arg_7 = rand_strided((960,), (1,), device='cuda:0', dtype=torch.float32) | |
| return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7, | |
| def call(args): | |
| with torch.cuda._DeviceGuard(0): | |
| torch.cuda.set_device(0) | |
| stream0 = get_cuda_stream(0) | |
| triton_poi_fused__native_batch_norm_legit_functional_convolution_backward_mul_native_batch_norm_backward_58.run(*args, 24084480, grid=grid(24084480), stream=stream0) | |
| def benchmark_all_configs(args): | |
| with torch.cuda._DeviceGuard(0): | |
| torch.cuda.set_device(0) | |
| return triton_poi_fused__native_batch_norm_legit_functional_convolution_backward_mul_native_batch_norm_backward_58.benchmark_all_configs(*args, 24084480, grid=grid(24084480)) | |
| if __name__ == '__main__': | |
| from torch._inductor.utils import get_num_bytes | |
| from triton.testing import do_bench | |
| args = get_args() | |
| ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
| num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9 | |
| gb_per_s = num_gb / (ms / 1e3) | |
| print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
| ''') | |
| # kernel path: /tmp/torchinductor_shunting/v5/cv5qx4oyij3hkwbqtmi4skgsidu4qp4eyvxkday5hyy3p7hf74af.py | |
| # Original ATen: aten._to_copy | |
| # aten._to_copy => convert_element_type_498 | |
| # Pointwise fp16 -> fp32 upcast of 153600 elements; get_args builds | |
| # (960, 160, 1, 1) tensors, presumably a 1x1 conv weight gradient. | |
| # xmask is computed but loads/stores pass mask=None, so 153600 is | |
| # assumed to be covered exactly by the launch configuration. | |
| triton_poi_fused__to_copy_59 = async_compile.triton(''' | |
| import triton | |
| import triton.language as tl | |
| from torch._inductor.ir import ReductionHint | |
| from torch._inductor.ir import TileHint | |
| from torch._inductor.triton_heuristics import pointwise | |
| from torch._inductor.utils import instance_descriptor | |
| from torch._dynamo.testing import rand_strided | |
| from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
| import torch | |
| from torch._inductor.triton_heuristics import grid | |
| @pointwise(size_hints=[262144], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]}) | |
| @triton.jit | |
| def triton_poi_fused__to_copy_59(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr): | |
| xnumel = 153600 | |
| xoffset = tl.program_id(0) * XBLOCK | |
| xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
| xmask = xindex < xnumel | |
| x0 = xindex | |
| tmp0 = tl.load(in_ptr0 + (x0), None).to(tl.float32) | |
| tmp1 = tmp0.to(tl.float32) | |
| tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, None) | |
| def get_args(): | |
| arg_0 = rand_strided((960, 160, 1, 1), (160, 1, 1, 1), device='cuda:0', dtype=torch.float16) | |
| arg_1 = rand_strided((960, 160, 1, 1), (160, 1, 1, 1), device='cuda:0', dtype=torch.float32) | |
| return arg_0, arg_1, | |
| def call(args): | |
| with torch.cuda._DeviceGuard(0): | |
| torch.cuda.set_device(0) | |
| stream0 = get_cuda_stream(0) | |
| triton_poi_fused__to_copy_59.run(*args, 153600, grid=grid(153600), stream=stream0) | |
| def benchmark_all_configs(args): | |
| with torch.cuda._DeviceGuard(0): | |
| torch.cuda.set_device(0) | |
| return triton_poi_fused__to_copy_59.benchmark_all_configs(*args, 153600, grid=grid(153600)) | |
| if __name__ == '__main__': | |
| from torch._inductor.utils import get_num_bytes | |
| from triton.testing import do_bench | |
| args = get_args() | |
| ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
| num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9 | |
| gb_per_s = num_gb / (ms / 1e3) | |
| print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
| ''') | |
| # kernel path: /tmp/torchinductor_shunting/vo/cvowglnuesyuof2jgfmeotsjew6qfwzd2dbspy34jb56dgnewazc.py | |
| # Original ATen: aten._native_batch_norm_legit_functional, aten.native_batch_norm_backward | |
| # aten._native_batch_norm_legit_functional => convert_element_type_304 | |
| # aten.native_batch_norm_backward => convert_element_type_499, mul_639, sub_126, sum_40, sum_41 | |
| # First (split) stage of a two-pass batch-norm backward reduction over a | |
| # (128, 160, 14, 14) fp16 tensor: xnumel=640 = 160 channels * 4 batch | |
| # splits, rnumel=6272 = 32 batches * 196 pixels per split (batch stride | |
| # 1003520 = 32 * 31360). Emits (160, 4) partial sums of grad and of | |
| # grad * (x - mean) for the follow-up per-channel combine kernels. | |
| triton_red_fused__native_batch_norm_legit_functional_native_batch_norm_backward_60 = async_compile.triton(''' | |
| import triton | |
| import triton.language as tl | |
| from torch._inductor.ir import ReductionHint | |
| from torch._inductor.ir import TileHint | |
| from torch._inductor.triton_heuristics import reduction | |
| from torch._inductor.utils import instance_descriptor | |
| from torch._dynamo.testing import rand_strided | |
| from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
| import torch | |
| from torch._inductor.triton_heuristics import grid | |
| @reduction( | |
| size_hints=[1024, 8192], | |
| reduction_hint=ReductionHint.INNER, | |
| filename=__file__, | |
| meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: 'i32', 6: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6), equal_to_1=())]} | |
| ) | |
| @triton.jit | |
| def triton_red_fused__native_batch_norm_legit_functional_native_batch_norm_backward_60(in_ptr0, in_ptr1, in_ptr2, out_ptr0, out_ptr1, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr): | |
| xnumel = 640 | |
| rnumel = 6272 | |
| xoffset = tl.program_id(0) * XBLOCK | |
| xindex = xoffset + tl.arange(0, XBLOCK)[:, None] | |
| xmask = xindex < xnumel | |
| rbase = tl.arange(0, RBLOCK)[None, :] | |
| x0 = xindex % 160 | |
| x1 = (xindex // 160) | |
| _tmp2 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0 | |
| x3 = xindex | |
| tmp5 = tl.load(in_ptr2 + (x0), xmask) | |
| _tmp8 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0 | |
| for roffset in range(0, rnumel, RBLOCK): | |
| rindex = roffset + rbase | |
| rmask = rindex < rnumel | |
| r2 = rindex | |
| tmp0 = tl.load(in_ptr0 + ((196*x0) + (31360*(r2 // 196)) + (1003520*x1) + (r2 % 196)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32) | |
| tmp3 = tl.load(in_ptr1 + ((196*x0) + (31360*(r2 // 196)) + (1003520*x1) + (r2 % 196)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32) | |
| tmp1 = tmp0.to(tl.float32) | |
| _tmp2 = tl.where(rmask & xmask, _tmp2 + tmp1, _tmp2) | |
| tmp4 = tmp3.to(tl.float32) | |
| tmp6 = tmp4 - tmp5 | |
| tmp7 = tmp1 * tmp6 | |
| _tmp8 = tl.where(rmask & xmask, _tmp8 + tmp7, _tmp8) | |
| tmp2 = tl.sum(_tmp2, 1)[:, None] | |
| tl.store(out_ptr0 + x3, tmp2, xmask) | |
| tmp8 = tl.sum(_tmp8, 1)[:, None] | |
| tl.store(out_ptr1 + x3, tmp8, xmask) | |
| def get_args(): | |
| arg_0 = rand_strided((128, 160, 14, 14), (31360, 196, 14, 1), device='cuda:0', dtype=torch.float16) | |
| arg_1 = rand_strided((128, 160, 14, 14), (31360, 196, 14, 1), device='cuda:0', dtype=torch.float16) | |
| arg_2 = rand_strided((1, 160, 1, 1), (160, 1, 1, 1), device='cuda:0', dtype=torch.float32) | |
| arg_3 = rand_strided((160, 4), (1, 160), device='cuda:0', dtype=torch.float32) | |
| arg_4 = rand_strided((160, 4), (1, 160), device='cuda:0', dtype=torch.float32) | |
| return arg_0, arg_1, arg_2, arg_3, arg_4, | |
| def call(args): | |
| with torch.cuda._DeviceGuard(0): | |
| torch.cuda.set_device(0) | |
| stream0 = get_cuda_stream(0) | |
| triton_red_fused__native_batch_norm_legit_functional_native_batch_norm_backward_60.run(*args, 640, 6272, grid=grid(640), stream=stream0) | |
| def benchmark_all_configs(args): | |
| with torch.cuda._DeviceGuard(0): | |
| torch.cuda.set_device(0) | |
| return triton_red_fused__native_batch_norm_legit_functional_native_batch_norm_backward_60.benchmark_all_configs(*args, 640, 6272, grid=grid(640)) | |
| if __name__ == '__main__': | |
| from torch._inductor.utils import get_num_bytes | |
| from triton.testing import do_bench | |
| args = get_args() | |
| ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
| num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9 | |
| gb_per_s = num_gb / (ms / 1e3) | |
| print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
| ''') | |
| # kernel path: /tmp/torchinductor_shunting/v7/cv7kgtxh5cmxf5wfqyzhdvjjtikzk2pxpebuocdoaffoo64kkmaj.py | |
| # Original ATen: aten.native_batch_norm_backward | |
| # aten.native_batch_norm_backward => convert_element_type_499, sum_40 | |
| # Second stage of the split reduction: persistent kernel that folds the | |
| # (160, 4) partial sums (stride (1, 160)) into a final (160,) per-channel | |
| # sum -- sum_40 of the batch-norm backward. | |
| triton_per_fused_native_batch_norm_backward_61 = async_compile.triton(''' | |
| import triton | |
| import triton.language as tl | |
| from torch._inductor.ir import ReductionHint | |
| from torch._inductor.ir import TileHint | |
| from torch._inductor.triton_heuristics import persistent_reduction | |
| from torch._inductor.utils import instance_descriptor | |
| from torch._dynamo.testing import rand_strided | |
| from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
| import torch | |
| from torch._inductor.triton_heuristics import grid | |
| @persistent_reduction( | |
| size_hints=[256, 4], | |
| reduction_hint=ReductionHint.INNER, | |
| filename=__file__, | |
| meta={'signature': {0: '*fp32', 1: '*fp32', 2: 'i32', 3: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]} | |
| ) | |
| @triton.jit | |
| def triton_per_fused_native_batch_norm_backward_61(in_ptr0, out_ptr0, xnumel, rnumel, XBLOCK : tl.constexpr): | |
| xnumel = 160 | |
| rnumel = 4 | |
| RBLOCK: tl.constexpr = 4 | |
| xoffset = tl.program_id(0) * XBLOCK | |
| xindex = xoffset + tl.arange(0, XBLOCK)[:, None] | |
| xmask = xindex < xnumel | |
| rindex = tl.arange(0, RBLOCK)[None, :] | |
| rmask = rindex < rnumel | |
| r1 = rindex | |
| x0 = xindex | |
| tmp0 = tl.load(in_ptr0 + (x0 + (160*r1)), rmask & xmask, other=0) | |
| tmp2 = tl.where(rmask & xmask, tmp0, 0) | |
| tmp3 = tl.sum(tmp2, 1)[:, None] | |
| tl.store(out_ptr0 + x0, tmp3, xmask) | |
| def get_args(): | |
| arg_0 = rand_strided((160, 4), (1, 160), device='cuda:0', dtype=torch.float32) | |
| arg_1 = rand_strided((160,), (1,), device='cuda:0', dtype=torch.float32) | |
| return arg_0, arg_1, | |
| def call(args): | |
| with torch.cuda._DeviceGuard(0): | |
| torch.cuda.set_device(0) | |
| stream0 = get_cuda_stream(0) | |
| triton_per_fused_native_batch_norm_backward_61.run(*args, 160, 4, grid=grid(160), stream=stream0) | |
| def benchmark_all_configs(args): | |
| with torch.cuda._DeviceGuard(0): | |
| torch.cuda.set_device(0) | |
| return triton_per_fused_native_batch_norm_backward_61.benchmark_all_configs(*args, 160, 4, grid=grid(160)) | |
| if __name__ == '__main__': | |
| from torch._inductor.utils import get_num_bytes | |
| from triton.testing import do_bench | |
| args = get_args() | |
| ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
| num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9 | |
| gb_per_s = num_gb / (ms / 1e3) | |
| print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
| ''') | |
| # kernel path: /tmp/torchinductor_shunting/ie/cie5izdtlxlumenajauzzkl6hk7z6q5jgsvyedwv7n3ohhf6bq2s.py | |
| # Original ATen: aten._native_batch_norm_legit_functional, aten.native_batch_norm_backward | |
| # aten._native_batch_norm_legit_functional => convert_element_type_304 | |
| # aten.native_batch_norm_backward => convert_element_type_499, mul_639, mul_647, sub_126, sum_41 | |
| # Second stage for sum_41: folds the (160, 4) partials of | |
| # grad * (x - mean) into a (160,) sum (out_ptr0) and also writes that | |
| # sum scaled by the per-channel factor in_ptr1 (presumably | |
| # rsqrt(var + eps) -- confirm against the wrapper) to out_ptr1. | |
| triton_per_fused__native_batch_norm_legit_functional_native_batch_norm_backward_62 = async_compile.triton(''' | |
| import triton | |
| import triton.language as tl | |
| from torch._inductor.ir import ReductionHint | |
| from torch._inductor.ir import TileHint | |
| from torch._inductor.triton_heuristics import persistent_reduction | |
| from torch._inductor.utils import instance_descriptor | |
| from torch._dynamo.testing import rand_strided | |
| from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
| import torch | |
| from torch._inductor.triton_heuristics import grid | |
| @persistent_reduction( | |
| size_hints=[256, 4], | |
| reduction_hint=ReductionHint.INNER, | |
| filename=__file__, | |
| meta={'signature': {0: '*fp32', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: 'i32', 5: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4), equal_to_1=())]} | |
| ) | |
| @triton.jit | |
| def triton_per_fused__native_batch_norm_legit_functional_native_batch_norm_backward_62(in_ptr0, in_ptr1, out_ptr0, out_ptr1, xnumel, rnumel, XBLOCK : tl.constexpr): | |
| xnumel = 160 | |
| rnumel = 4 | |
| RBLOCK: tl.constexpr = 4 | |
| xoffset = tl.program_id(0) * XBLOCK | |
| xindex = xoffset + tl.arange(0, XBLOCK)[:, None] | |
| xmask = xindex < xnumel | |
| rindex = tl.arange(0, RBLOCK)[None, :] | |
| rmask = rindex < rnumel | |
| r1 = rindex | |
| x0 = xindex | |
| tmp0 = tl.load(in_ptr0 + (x0 + (160*r1)), rmask & xmask, other=0) | |
| tmp4 = tl.load(in_ptr1 + (x0), xmask) | |
| tmp2 = tl.where(rmask & xmask, tmp0, 0) | |
| tmp3 = tl.sum(tmp2, 1)[:, None] | |
| tmp5 = tmp3 * tmp4 | |
| tl.store(out_ptr1 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp5, xmask) | |
| tl.store(out_ptr0 + x0, tmp3, xmask) | |
| def get_args(): | |
| arg_0 = rand_strided((160, 4), (1, 160), device='cuda:0', dtype=torch.float32) | |
| arg_1 = rand_strided((160,), (1,), device='cuda:0', dtype=torch.float32) | |
| arg_2 = rand_strided((160,), (1,), device='cuda:0', dtype=torch.float32) | |
| arg_3 = rand_strided((160,), (1,), device='cuda:0', dtype=torch.float32) | |
| return arg_0, arg_1, arg_2, arg_3, | |
| def call(args): | |
| with torch.cuda._DeviceGuard(0): | |
| torch.cuda.set_device(0) | |
| stream0 = get_cuda_stream(0) | |
| triton_per_fused__native_batch_norm_legit_functional_native_batch_norm_backward_62.run(*args, 160, 4, grid=grid(160), stream=stream0) | |
| def benchmark_all_configs(args): | |
| with torch.cuda._DeviceGuard(0): | |
| torch.cuda.set_device(0) | |
| return triton_per_fused__native_batch_norm_legit_functional_native_batch_norm_backward_62.benchmark_all_configs(*args, 160, 4, grid=grid(160)) | |
| if __name__ == '__main__': | |
| from torch._inductor.utils import get_num_bytes | |
| from triton.testing import do_bench | |
| args = get_args() | |
| ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
| num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9 | |
| gb_per_s = num_gb / (ms / 1e3) | |
| print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
| ''') | |
| # kernel path: /tmp/torchinductor_shunting/lu/clucluq3crlm7gshoftvwdbylf2hruep5zy2kdudpczi5ai6bwrm.py | |
| # Original ATen: aten._native_batch_norm_legit_functional, aten.native_batch_norm_backward | |
| # aten._native_batch_norm_legit_functional => convert_element_type_304 | |
| # aten.native_batch_norm_backward => convert_element_type_499, convert_element_type_501, mul_645, mul_646, sub_126, sub_128, sub_129 | |
| # Pointwise grad_input stage of batch-norm backward for the | |
| # (128, 160, 14, 14) activation, writing a fresh fp16 output (out_ptr0) | |
| # rather than updating in place. Standard formula: | |
| #   (g - (x - mean) * (sum_gx/N) * r^2 - sum_g/N) * (r * w), | |
| # with r presumably rsqrt(var + eps) (in_ptr4) and w the affine weight | |
| # (in_ptr6) -- confirm against the wrapper. The literal | |
| # 3.985969387755102e-05 is 1/25088 = 1/(128*196). | |
| triton_poi_fused__native_batch_norm_legit_functional_native_batch_norm_backward_63 = async_compile.triton(''' | |
| import triton | |
| import triton.language as tl | |
| from torch._inductor.ir import ReductionHint | |
| from torch._inductor.ir import TileHint | |
| from torch._inductor.triton_heuristics import pointwise | |
| from torch._inductor.utils import instance_descriptor | |
| from torch._dynamo.testing import rand_strided | |
| from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
| import torch | |
| from torch._inductor.triton_heuristics import grid | |
| @pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp32', 6: '*fp32', 7: '*fp16', 8: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8), equal_to_1=())]}) | |
| @triton.jit | |
| def triton_poi_fused__native_batch_norm_legit_functional_native_batch_norm_backward_63(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, in_ptr6, out_ptr0, xnumel, XBLOCK : tl.constexpr): | |
| xnumel = 4014080 | |
| xoffset = tl.program_id(0) * XBLOCK | |
| xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
| xmask = xindex < xnumel | |
| x3 = xindex | |
| x1 = (xindex // 196) % 160 | |
| tmp0 = tl.load(in_ptr0 + (x3), None).to(tl.float32) | |
| tmp2 = tl.load(in_ptr1 + (x3), None).to(tl.float32) | |
| tmp4 = tl.load(in_ptr2 + (x1), None) | |
| tmp6 = tl.load(in_ptr3 + (x1), None) | |
| tmp9 = tl.load(in_ptr4 + (x1), None) | |
| tmp14 = tl.load(in_ptr5 + (x1), None) | |
| tmp17 = tl.load(in_ptr6 + (x1), None) | |
| tmp1 = tmp0.to(tl.float32) | |
| tmp3 = tmp2.to(tl.float32) | |
| tmp5 = tmp3 - tmp4 | |
| tmp7 = 3.985969387755102e-05 | |
| tmp8 = tmp6 * tmp7 | |
| tmp10 = tmp9 * tmp9 | |
| tmp11 = tmp8 * tmp10 | |
| tmp12 = tmp5 * tmp11 | |
| tmp13 = tmp1 - tmp12 | |
| tmp15 = tmp14 * tmp7 | |
| tmp16 = tmp13 - tmp15 | |
| tmp18 = tmp9 * tmp17 | |
| tmp19 = tmp16 * tmp18 | |
| tmp20 = tmp19.to(tl.float32) | |
| tl.store(out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp20, None) | |
| def get_args(): | |
| arg_0 = rand_strided((128, 160, 14, 14), (31360, 196, 14, 1), device='cuda:0', dtype=torch.float16) | |
| arg_1 = rand_strided((128, 160, 14, 14), (31360, 196, 14, 1), device='cuda:0', dtype=torch.float16) | |
| arg_2 = rand_strided((1, 160, 1, 1), (160, 1, 1, 1), device='cuda:0', dtype=torch.float32) | |
| arg_3 = rand_strided((160,), (1,), device='cuda:0', dtype=torch.float32) | |
| arg_4 = rand_strided((160,), (1,), device='cuda:0', dtype=torch.float32) | |
| arg_5 = rand_strided((160,), (1,), device='cuda:0', dtype=torch.float32) | |
| arg_6 = rand_strided((160,), (1,), device='cuda:0', dtype=torch.float32) | |
| arg_7 = rand_strided((128, 160, 14, 14), (31360, 196, 14, 1), device='cuda:0', dtype=torch.float16) | |
| return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7, | |
| def call(args): | |
| with torch.cuda._DeviceGuard(0): | |
| torch.cuda.set_device(0) | |
| stream0 = get_cuda_stream(0) | |
| triton_poi_fused__native_batch_norm_legit_functional_native_batch_norm_backward_63.run(*args, 4014080, grid=grid(4014080), stream=stream0) | |
| def benchmark_all_configs(args): | |
| with torch.cuda._DeviceGuard(0): | |
| torch.cuda.set_device(0) | |
| return triton_poi_fused__native_batch_norm_legit_functional_native_batch_norm_backward_63.benchmark_all_configs(*args, 4014080, grid=grid(4014080)) | |
| if __name__ == '__main__': | |
| from torch._inductor.utils import get_num_bytes | |
| from triton.testing import do_bench | |
| args = get_args() | |
| ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
| num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9 | |
| gb_per_s = num_gb / (ms / 1e3) | |
| print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
| ''') | |
| # kernel path: /tmp/torchinductor_shunting/u6/cu6t7jgofejrpr6jiujpu5wwc5mlffc5k4hwyiqtb7tiyjxa6ec4.py | |
| # Original ATen: aten._to_copy | |
| # aten._to_copy => convert_element_type_502 | |
| # Pointwise fp16 -> fp32 upcast of 19200 elements; get_args builds | |
| # (80, 240, 1, 1) tensors, presumably a 1x1 conv weight gradient. | |
| triton_poi_fused__to_copy_64 = async_compile.triton(''' | |
| import triton | |
| import triton.language as tl | |
| from torch._inductor.ir import ReductionHint | |
| from torch._inductor.ir import TileHint | |
| from torch._inductor.triton_heuristics import pointwise | |
| from torch._inductor.utils import instance_descriptor | |
| from torch._dynamo.testing import rand_strided | |
| from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
| import torch | |
| from torch._inductor.triton_heuristics import grid | |
| @pointwise(size_hints=[32768], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]}) | |
| @triton.jit | |
| def triton_poi_fused__to_copy_64(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr): | |
| xnumel = 19200 | |
| xoffset = tl.program_id(0) * XBLOCK | |
| xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
| xmask = xindex < xnumel | |
| x0 = xindex | |
| tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32) | |
| tmp1 = tmp0.to(tl.float32) | |
| tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask) | |
| def get_args(): | |
| arg_0 = rand_strided((80, 240, 1, 1), (240, 1, 1, 1), device='cuda:0', dtype=torch.float16) | |
| arg_1 = rand_strided((80, 240, 1, 1), (240, 1, 1, 1), device='cuda:0', dtype=torch.float32) | |
| return arg_0, arg_1, | |
| def call(args): | |
| with torch.cuda._DeviceGuard(0): | |
| torch.cuda.set_device(0) | |
| stream0 = get_cuda_stream(0) | |
| triton_poi_fused__to_copy_64.run(*args, 19200, grid=grid(19200), stream=stream0) | |
| def benchmark_all_configs(args): | |
| with torch.cuda._DeviceGuard(0): | |
| torch.cuda.set_device(0) | |
| return triton_poi_fused__to_copy_64.benchmark_all_configs(*args, 19200, grid=grid(19200)) | |
| if __name__ == '__main__': | |
| from torch._inductor.utils import get_num_bytes | |
| from triton.testing import do_bench | |
| args = get_args() | |
| ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
| num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9 | |
| gb_per_s = num_gb / (ms / 1e3) | |
| print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
| ''') | |
| # kernel path: /tmp/torchinductor_shunting/rj/crjqpfvxgdlaoka6zisl4dxwjeaxw453cmtsf23px34uebd6u4oo.py | |
| # Original ATen: aten.cat | |
| # aten.cat => cat_48 | |
| # NOTE(review): auto-generated Inductor kernel; string body left byte-identical. | |
| # Strided fp16 copy realizing one input of a channel concatenation: reads a | |
| # contiguous (128, 240, 14, 14) tensor (per-sample stride 47040) and writes it | |
| # into a destination whose per-sample stride is 94080, i.e. the first 240 | |
| # channels of a 480-channel output — presumably the other cat branch fills the | |
| # remaining channels in a sibling kernel (TODO confirm). | |
| triton_poi_fused_cat_65 = async_compile.triton(''' | |
| import triton | |
| import triton.language as tl | |
| from torch._inductor.ir import ReductionHint | |
| from torch._inductor.ir import TileHint | |
| from torch._inductor.triton_heuristics import pointwise | |
| from torch._inductor.utils import instance_descriptor | |
| from torch._dynamo.testing import rand_strided | |
| from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
| import torch | |
| from torch._inductor.triton_heuristics import grid | |
| @pointwise(size_hints=[8388608], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]}) | |
| @triton.jit | |
| def triton_poi_fused_cat_65(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr): | |
| xnumel = 6021120 | |
| xoffset = tl.program_id(0) * XBLOCK | |
| xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
| xmask = xindex < xnumel | |
| x2 = xindex | |
| x0 = xindex % 47040 | |
| x1 = (xindex // 47040) | |
| tmp0 = tl.load(in_ptr0 + (x2), None).to(tl.float32) | |
| tl.store(out_ptr0 + (x0 + (94080*x1) + tl.zeros([XBLOCK], tl.int32)), tmp0, None) | |
| def get_args(): | |
| arg_0 = rand_strided((128, 240, 14, 14), (47040, 196, 14, 1), device='cuda:0', dtype=torch.float16) | |
| arg_1 = rand_strided((128, 240, 14, 14), (94080, 196, 14, 1), device='cuda:0', dtype=torch.float16) | |
| return arg_0, arg_1, | |
| def call(args): | |
| with torch.cuda._DeviceGuard(0): | |
| torch.cuda.set_device(0) | |
| stream0 = get_cuda_stream(0) | |
| triton_poi_fused_cat_65.run(*args, 6021120, grid=grid(6021120), stream=stream0) | |
| def benchmark_all_configs(args): | |
| with torch.cuda._DeviceGuard(0): | |
| torch.cuda.set_device(0) | |
| return triton_poi_fused_cat_65.benchmark_all_configs(*args, 6021120, grid=grid(6021120)) | |
| if __name__ == '__main__': | |
| from torch._inductor.utils import get_num_bytes | |
| from triton.testing import do_bench | |
| args = get_args() | |
| ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
| num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9 | |
| gb_per_s = num_gb / (ms / 1e3) | |
| print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
| ''') | |
| # kernel path: /tmp/torchinductor_shunting/rc/crc555gyfmwt6m3oebfawzf77swacndeysz6litdvgmtz7aw3cnb.py | |
| # Original ATen: aten.mul, aten.sigmoid, aten.sigmoid_backward, aten.silu, aten.sum | |
| # aten.mul => mul_648 | |
| # aten.sigmoid => sigmoid_47 | |
| # aten.sigmoid_backward => convert_element_type_504, convert_element_type_505, convert_element_type_506, mul_650, mul_651, sub_130 | |
| # aten.silu => convert_element_type_294, convert_element_type_295, mul_353, sigmoid_45 | |
| # aten.sum => sum_42 | |
| # NOTE(review): auto-generated Inductor kernel; string body left byte-identical. | |
| # Persistent reduction over rnumel=196 (the 14x14 spatial positions) for each of | |
| # xnumel=61440 (= 128 batch * 480 channel) rows: accumulates | |
| #   sum_r(grad_out * silu(x)) and multiplies by sigmoid(g)*(1-sigmoid(g)), | |
| # i.e. the backward of a squeeze-excite-style sigmoid gate (TODO confirm). | |
| # in_out_ptr0 is mutated in place (see mutated_arg_names), which is why the | |
| # __main__ harness passes num_in_out_args=1 to get_num_bytes. | |
| triton_per_fused_mul_sigmoid_sigmoid_backward_silu_sum_66 = async_compile.triton(''' | |
| import triton | |
| import triton.language as tl | |
| from torch._inductor.ir import ReductionHint | |
| from torch._inductor.ir import TileHint | |
| from torch._inductor.triton_heuristics import persistent_reduction | |
| from torch._inductor.utils import instance_descriptor | |
| from torch._dynamo.testing import rand_strided | |
| from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
| import torch | |
| from torch._inductor.triton_heuristics import grid | |
| @persistent_reduction( | |
| size_hints=[65536, 256], | |
| reduction_hint=ReductionHint.INNER, | |
| filename=__file__, | |
| meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp16', 4: 'i32', 5: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4), equal_to_1=())]} | |
| ) | |
| @triton.jit | |
| def triton_per_fused_mul_sigmoid_sigmoid_backward_silu_sum_66(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, xnumel, rnumel, XBLOCK : tl.constexpr): | |
| xnumel = 61440 | |
| rnumel = 196 | |
| RBLOCK: tl.constexpr = 256 | |
| xoffset = tl.program_id(0) * XBLOCK | |
| xindex = xoffset + tl.arange(0, XBLOCK)[:, None] | |
| xmask = xindex < xnumel | |
| rindex = tl.arange(0, RBLOCK)[None, :] | |
| rmask = rindex < rnumel | |
| r1 = rindex | |
| x0 = xindex | |
| tmp0 = tl.load(in_ptr0 + (r1 + (196*x0)), rmask, other=0).to(tl.float32) | |
| tmp1 = tl.load(in_ptr1 + (r1 + (196*x0)), rmask, other=0).to(tl.float32) | |
| tmp11 = tl.load(in_ptr2 + (x0), None).to(tl.float32) | |
| tmp2 = tmp1.to(tl.float32) | |
| tmp3 = tl.sigmoid(tmp2) | |
| tmp4 = tmp2 * tmp3 | |
| tmp5 = tmp4.to(tl.float32) | |
| tmp6 = tmp0 * tmp5 | |
| tmp8 = tl.where(rmask, tmp6, 0) | |
| tmp9 = tl.sum(tmp8, 1)[:, None] | |
| tmp10 = tmp9.to(tl.float32) | |
| tmp12 = tl.sigmoid(tmp11) | |
| tmp13 = tmp12.to(tl.float32) | |
| tmp14 = 1.0 | |
| tmp15 = tmp14 - tmp13 | |
| tmp16 = tmp13 * tmp15 | |
| tmp17 = tmp10 * tmp16 | |
| tmp18 = tmp17.to(tl.float32) | |
| tl.store(in_out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp18, None) | |
| def get_args(): | |
| arg_0 = rand_strided((128, 480, 1, 1), (480, 1, 1, 1), device='cuda:0', dtype=torch.float16) | |
| arg_1 = rand_strided((128, 480, 14, 14), (94080, 196, 14, 1), device='cuda:0', dtype=torch.float16) | |
| arg_2 = rand_strided((128, 480, 14, 14), (94080, 196, 14, 1), device='cuda:0', dtype=torch.float16) | |
| arg_3 = rand_strided((128, 480, 1, 1), (480, 1, 1, 1), device='cuda:0', dtype=torch.float16) | |
| return arg_0, arg_1, arg_2, arg_3, | |
| def call(args): | |
| with torch.cuda._DeviceGuard(0): | |
| torch.cuda.set_device(0) | |
| stream0 = get_cuda_stream(0) | |
| triton_per_fused_mul_sigmoid_sigmoid_backward_silu_sum_66.run(*args, 61440, 196, grid=grid(61440), stream=stream0) | |
| def benchmark_all_configs(args): | |
| with torch.cuda._DeviceGuard(0): | |
| torch.cuda.set_device(0) | |
| return triton_per_fused_mul_sigmoid_sigmoid_backward_silu_sum_66.benchmark_all_configs(*args, 61440, 196, grid=grid(61440)) | |
| if __name__ == '__main__': | |
| from torch._inductor.utils import get_num_bytes | |
| from triton.testing import do_bench | |
| args = get_args() | |
| ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
| num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9 | |
| gb_per_s = num_gb / (ms / 1e3) | |
| print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
| ''') | |
| # kernel path: /tmp/torchinductor_shunting/n4/cn4fg2jpjbb2zgpwsyrzkkvrz4ql4zzxfplbwfmfeem7estzylry.py | |
| # Original ATen: aten._to_copy, aten.convolution_backward | |
| # aten._to_copy => convert_element_type_508 | |
| # aten.convolution_backward => sum_43 | |
| # NOTE(review): auto-generated Inductor kernel; string body left byte-identical. | |
| # Persistent reduction over the batch dimension (rnumel=128) of a | |
| # (128, 480, 1, 1) fp16 gradient, producing a fp32 (480,) per-channel sum — | |
| # the bias gradient term of a convolution backward (per the fused ATen ops). | |
| triton_per_fused__to_copy_convolution_backward_67 = async_compile.triton(''' | |
| import triton | |
| import triton.language as tl | |
| from torch._inductor.ir import ReductionHint | |
| from torch._inductor.ir import TileHint | |
| from torch._inductor.triton_heuristics import persistent_reduction | |
| from torch._inductor.utils import instance_descriptor | |
| from torch._dynamo.testing import rand_strided | |
| from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
| import torch | |
| from torch._inductor.triton_heuristics import grid | |
| @persistent_reduction( | |
| size_hints=[512, 128], | |
| reduction_hint=ReductionHint.INNER, | |
| filename=__file__, | |
| meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32', 3: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3), equal_to_1=())]} | |
| ) | |
| @triton.jit | |
| def triton_per_fused__to_copy_convolution_backward_67(in_ptr0, out_ptr1, xnumel, rnumel, XBLOCK : tl.constexpr): | |
| xnumel = 480 | |
| rnumel = 128 | |
| RBLOCK: tl.constexpr = 128 | |
| xoffset = tl.program_id(0) * XBLOCK | |
| xindex = xoffset + tl.arange(0, XBLOCK)[:, None] | |
| xmask = xindex < xnumel | |
| rindex = tl.arange(0, RBLOCK)[None, :] | |
| rmask = rindex < rnumel | |
| r1 = rindex | |
| x0 = xindex | |
| tmp0 = tl.load(in_ptr0 + (x0 + (480*r1)), rmask & xmask, other=0).to(tl.float32) | |
| tmp2 = tl.where(rmask & xmask, tmp0, 0) | |
| tmp3 = tl.sum(tmp2, 1)[:, None] | |
| tmp4 = tmp3.to(tl.float32) | |
| tl.store(out_ptr1 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp4, xmask) | |
| def get_args(): | |
| arg_0 = rand_strided((128, 480, 1, 1), (480, 1, 1, 1), device='cuda:0', dtype=torch.float16) | |
| arg_1 = rand_strided((480,), (1,), device='cuda:0', dtype=torch.float32) | |
| return arg_0, arg_1, | |
| def call(args): | |
| with torch.cuda._DeviceGuard(0): | |
| torch.cuda.set_device(0) | |
| stream0 = get_cuda_stream(0) | |
| triton_per_fused__to_copy_convolution_backward_67.run(*args, 480, 128, grid=grid(480), stream=stream0) | |
| def benchmark_all_configs(args): | |
| with torch.cuda._DeviceGuard(0): | |
| torch.cuda.set_device(0) | |
| return triton_per_fused__to_copy_convolution_backward_67.benchmark_all_configs(*args, 480, 128, grid=grid(480)) | |
| if __name__ == '__main__': | |
| from torch._inductor.utils import get_num_bytes | |
| from triton.testing import do_bench | |
| args = get_args() | |
| ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
| num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9 | |
| gb_per_s = num_gb / (ms / 1e3) | |
| print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
| ''') | |
| # kernel path: /tmp/torchinductor_shunting/gx/cgx4he25cg6tmu62uuz3hbvmsgstxlvguwwpdavmmk5v7cb53khs.py | |
| # Original ATen: aten._to_copy | |
| # aten._to_copy => convert_element_type_507 | |
| # NOTE(review): auto-generated Inductor kernel; string body left byte-identical. | |
| # Pointwise fp16 -> fp32 cast of 38400 contiguous elements; per get_args() the | |
| # buffers are shaped (480, 80, 1, 1) — presumably a 1x1 conv weight upcast | |
| # (TODO confirm against the caller). | |
| triton_poi_fused__to_copy_68 = async_compile.triton(''' | |
| import triton | |
| import triton.language as tl | |
| from torch._inductor.ir import ReductionHint | |
| from torch._inductor.ir import TileHint | |
| from torch._inductor.triton_heuristics import pointwise | |
| from torch._inductor.utils import instance_descriptor | |
| from torch._dynamo.testing import rand_strided | |
| from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
| import torch | |
| from torch._inductor.triton_heuristics import grid | |
| @pointwise(size_hints=[65536], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]}) | |
| @triton.jit | |
| def triton_poi_fused__to_copy_68(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr): | |
| xnumel = 38400 | |
| xoffset = tl.program_id(0) * XBLOCK | |
| xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
| xmask = xindex < xnumel | |
| x0 = xindex | |
| tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32) | |
| tmp1 = tmp0.to(tl.float32) | |
| tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask) | |
| def get_args(): | |
| arg_0 = rand_strided((480, 80, 1, 1), (80, 1, 1, 1), device='cuda:0', dtype=torch.float16) | |
| arg_1 = rand_strided((480, 80, 1, 1), (80, 1, 1, 1), device='cuda:0', dtype=torch.float32) | |
| return arg_0, arg_1, | |
| def call(args): | |
| with torch.cuda._DeviceGuard(0): | |
| torch.cuda.set_device(0) | |
| stream0 = get_cuda_stream(0) | |
| triton_poi_fused__to_copy_68.run(*args, 38400, grid=grid(38400), stream=stream0) | |
| def benchmark_all_configs(args): | |
| with torch.cuda._DeviceGuard(0): | |
| torch.cuda.set_device(0) | |
| return triton_poi_fused__to_copy_68.benchmark_all_configs(*args, 38400, grid=grid(38400)) | |
| if __name__ == '__main__': | |
| from torch._inductor.utils import get_num_bytes | |
| from triton.testing import do_bench | |
| args = get_args() | |
| ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
| num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9 | |
| gb_per_s = num_gb / (ms / 1e3) | |
| print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
| ''') | |
| # kernel path: /tmp/torchinductor_shunting/57/c57nn4lxfd7xfnwbsod5xu57xcqkrswf7vdbyz76bditl462irc4.py | |
| # Original ATen: aten._to_copy | |
| # aten._to_copy => convert_element_type_509 | |
| # NOTE(review): auto-generated Inductor kernel; string body left byte-identical. | |
| # Same fp16 -> fp32 cast pattern as triton_poi_fused__to_copy_68 (38400 | |
| # elements), here for a (80, 480, 1, 1)-shaped buffer — i.e. the transposed | |
| # channel layout of the previous weight (TODO confirm). | |
| triton_poi_fused__to_copy_69 = async_compile.triton(''' | |
| import triton | |
| import triton.language as tl | |
| from torch._inductor.ir import ReductionHint | |
| from torch._inductor.ir import TileHint | |
| from torch._inductor.triton_heuristics import pointwise | |
| from torch._inductor.utils import instance_descriptor | |
| from torch._dynamo.testing import rand_strided | |
| from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
| import torch | |
| from torch._inductor.triton_heuristics import grid | |
| @pointwise(size_hints=[65536], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]}) | |
| @triton.jit | |
| def triton_poi_fused__to_copy_69(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr): | |
| xnumel = 38400 | |
| xoffset = tl.program_id(0) * XBLOCK | |
| xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
| xmask = xindex < xnumel | |
| x0 = xindex | |
| tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32) | |
| tmp1 = tmp0.to(tl.float32) | |
| tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask) | |
| def get_args(): | |
| arg_0 = rand_strided((80, 480, 1, 1), (480, 1, 1, 1), device='cuda:0', dtype=torch.float16) | |
| arg_1 = rand_strided((80, 480, 1, 1), (480, 1, 1, 1), device='cuda:0', dtype=torch.float32) | |
| return arg_0, arg_1, | |
| def call(args): | |
| with torch.cuda._DeviceGuard(0): | |
| torch.cuda.set_device(0) | |
| stream0 = get_cuda_stream(0) | |
| triton_poi_fused__to_copy_69.run(*args, 38400, grid=grid(38400), stream=stream0) | |
| def benchmark_all_configs(args): | |
| with torch.cuda._DeviceGuard(0): | |
| torch.cuda.set_device(0) | |
| return triton_poi_fused__to_copy_69.benchmark_all_configs(*args, 38400, grid=grid(38400)) | |
| if __name__ == '__main__': | |
| from torch._inductor.utils import get_num_bytes | |
| from triton.testing import do_bench | |
| args = get_args() | |
| ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
| num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9 | |
| gb_per_s = num_gb / (ms / 1e3) | |
| print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
| ''') | |
| # kernel path: /tmp/torchinductor_shunting/5q/c5qqp6x25kopkdsx6gj3r2vuos77mpwiiyja2ob3sfsgwvxevysq.py | |
| # Original ATen: aten._native_batch_norm_legit_functional, aten.add, aten.clone, aten.div, aten.fill, aten.mul, aten.native_batch_norm_backward, aten.sigmoid, aten.sub | |
| # aten._native_batch_norm_legit_functional => convert_element_type_292 | |
| # aten.add => add_324, add_325 | |
| # aten.clone => clone_34 | |
| # aten.div => div_5 | |
| # aten.fill => full_like_13 | |
| # aten.mul => mul_649, mul_655, mul_656, mul_657 | |
| # aten.native_batch_norm_backward => convert_element_type_511, mul_658, mul_666, sub_133, sum_45, sum_46 | |
| # aten.sigmoid => sigmoid_47, sigmoid_77 | |
| # aten.sub => sub_132 | |
| # NOTE(review): auto-generated Inductor kernel; string body left byte-identical. | |
| # Looped reduction: for each of 480 channels, reduces over rnumel=25088 | |
| # (= 128 batch * 196 spatial) elements. Inside the loop it first reconstructs | |
| # the upstream gradient (grad_out * sigmoid(gate) + pooled_grad / 196, then a | |
| # swish/SiLU backward factor sig(x)*(1 + x*(1-sig(x)))), and accumulates both | |
| # sum(dy) (-> out_ptr0) and sum(dy * (x - mean)) (-> out_ptr1); out_ptr2 gets | |
| # out_ptr1 scaled by in_ptr6 — the batch-norm-backward channel statistics. | |
| # The reduction epilogue (tmp17/tmp23 stores) is emitted unindented relative to | |
| # the for-loop, i.e. it runs once after the loop completes. | |
| triton_red_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_70 = async_compile.triton(''' | |
| import triton | |
| import triton.language as tl | |
| from torch._inductor.ir import ReductionHint | |
| from torch._inductor.ir import TileHint | |
| from torch._inductor.triton_heuristics import reduction | |
| from torch._inductor.utils import instance_descriptor | |
| from torch._dynamo.testing import rand_strided | |
| from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
| import torch | |
| from torch._inductor.triton_heuristics import grid | |
| @reduction( | |
| size_hints=[512, 32768], | |
| reduction_hint=ReductionHint.INNER, | |
| filename=__file__, | |
| meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp16', 4: '*fp16', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: '*fp32', 9: '*fp32', 10: 'i32', 11: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11), equal_to_1=())]} | |
| ) | |
| @triton.jit | |
| def triton_red_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_70(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, in_ptr6, out_ptr0, out_ptr1, out_ptr2, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr): | |
| xnumel = 480 | |
| rnumel = 25088 | |
| xoffset = tl.program_id(0) * XBLOCK | |
| xindex = xoffset + tl.arange(0, XBLOCK)[:, None] | |
| xmask = xindex < xnumel | |
| rbase = tl.arange(0, RBLOCK)[None, :] | |
| x0 = xindex | |
| _tmp17 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0 | |
| tmp20 = tl.load(in_ptr5 + (x0), xmask) | |
| _tmp23 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0 | |
| for roffset in range(0, rnumel, RBLOCK): | |
| rindex = roffset + rbase | |
| rmask = rindex < rnumel | |
| r1 = rindex % 196 | |
| r2 = (rindex // 196) | |
| tmp0 = tl.load(in_ptr0 + (r1 + (196*x0) + (94080*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32) | |
| tmp1 = tl.load(in_ptr1 + (x0 + (480*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32) | |
| tmp4 = tl.load(in_ptr2 + (x0 + (480*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32) | |
| tmp8 = tl.load(in_ptr3 + (r1 + (196*x0) + (94080*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32) | |
| tmp18 = tl.load(in_ptr4 + (r1 + (196*x0) + (94080*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32) | |
| tmp2 = tl.sigmoid(tmp1) | |
| tmp3 = tmp0 * tmp2 | |
| tmp5 = 196.0 | |
| tmp6 = tmp4 / tmp5 | |
| tmp7 = tmp3 + tmp6 | |
| tmp9 = tl.sigmoid(tmp8) | |
| tmp10 = 1.0 | |
| tmp11 = tmp10 - tmp9 | |
| tmp12 = tmp8 * tmp11 | |
| tmp13 = tmp12 + tmp10 | |
| tmp14 = tmp9 * tmp13 | |
| tmp15 = tmp7 * tmp14 | |
| tmp16 = tmp15.to(tl.float32) | |
| _tmp17 = tl.where(rmask & xmask, _tmp17 + tmp16, _tmp17) | |
| tmp19 = tmp18.to(tl.float32) | |
| tmp21 = tmp19 - tmp20 | |
| tmp22 = tmp16 * tmp21 | |
| _tmp23 = tl.where(rmask & xmask, _tmp23 + tmp22, _tmp23) | |
| tmp17 = tl.sum(_tmp17, 1)[:, None] | |
| tl.store(out_ptr0 + x0, tmp17, xmask) | |
| tmp23 = tl.sum(_tmp23, 1)[:, None] | |
| tl.store(out_ptr1 + x0, tmp23, xmask) | |
| tmp24 = tl.load(in_ptr6 + (x0), xmask) | |
| tmp25 = tmp23 * tmp24 | |
| tl.store(out_ptr2 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp25, xmask) | |
| def get_args(): | |
| arg_0 = rand_strided((128, 480, 14, 14), (94080, 196, 14, 1), device='cuda:0', dtype=torch.float16) | |
| arg_1 = rand_strided((128, 480, 1, 1), (480, 1, 1, 1), device='cuda:0', dtype=torch.float16) | |
| arg_2 = rand_strided((128, 480, 1, 1), (480, 1, 1, 1), device='cuda:0', dtype=torch.float16) | |
| arg_3 = rand_strided((128, 480, 14, 14), (94080, 196, 14, 1), device='cuda:0', dtype=torch.float16) | |
| arg_4 = rand_strided((128, 480, 14, 14), (94080, 196, 14, 1), device='cuda:0', dtype=torch.float16) | |
| arg_5 = rand_strided((1, 480, 1, 1), (480, 1, 1, 1), device='cuda:0', dtype=torch.float32) | |
| arg_6 = rand_strided((480,), (1,), device='cuda:0', dtype=torch.float32) | |
| arg_7 = rand_strided((480,), (1,), device='cuda:0', dtype=torch.float32) | |
| arg_8 = rand_strided((480,), (1,), device='cuda:0', dtype=torch.float32) | |
| arg_9 = rand_strided((480,), (1,), device='cuda:0', dtype=torch.float32) | |
| return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7, arg_8, arg_9, | |
| def call(args): | |
| with torch.cuda._DeviceGuard(0): | |
| torch.cuda.set_device(0) | |
| stream0 = get_cuda_stream(0) | |
| triton_red_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_70.run(*args, 480, 25088, grid=grid(480), stream=stream0) | |
| def benchmark_all_configs(args): | |
| with torch.cuda._DeviceGuard(0): | |
| torch.cuda.set_device(0) | |
| return triton_red_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_70.benchmark_all_configs(*args, 480, 25088, grid=grid(480)) | |
| if __name__ == '__main__': | |
| from torch._inductor.utils import get_num_bytes | |
| from triton.testing import do_bench | |
| args = get_args() | |
| ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
| num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9 | |
| gb_per_s = num_gb / (ms / 1e3) | |
| print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
| ''') | |
| # kernel path: /tmp/torchinductor_shunting/qc/cqcavzge4fgj7thkzvjlxnonwb5xlzaboopggaaauvq4wfdtklsv.py | |
| # Original ATen: aten._native_batch_norm_legit_functional, aten.add, aten.clone, aten.div, aten.fill, aten.mul, aten.native_batch_norm_backward, aten.sigmoid, aten.sub | |
| # aten._native_batch_norm_legit_functional => convert_element_type_292 | |
| # aten.add => add_324, add_325 | |
| # aten.clone => clone_34 | |
| # aten.div => div_5 | |
| # aten.fill => full_like_13 | |
| # aten.mul => mul_649, mul_655, mul_656, mul_657 | |
| # aten.native_batch_norm_backward => convert_element_type_511, mul_664, sub_133, sub_135, sub_136 | |
| # aten.sigmoid => sigmoid_47, sigmoid_77 | |
| # aten.sub => sub_132 | |
| # NOTE(review): auto-generated Inductor kernel; string body left byte-identical. | |
| # Pointwise companion of kernel _70: recomputes the same reconstructed gradient | |
| # per element (SE-gate mul + pooled-grad/196 + SiLU backward factor), then | |
| # applies the batch-norm-backward input-gradient formula using the channel sums | |
| # produced earlier.  The constant 3.985969387755102e-05 is 1/25088, the | |
| # reciprocal of the per-channel reduction count (128 * 196). | |
| triton_poi_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_71 = async_compile.triton(''' | |
| import triton | |
| import triton.language as tl | |
| from torch._inductor.ir import ReductionHint | |
| from torch._inductor.ir import TileHint | |
| from torch._inductor.triton_heuristics import pointwise | |
| from torch._inductor.utils import instance_descriptor | |
| from torch._dynamo.testing import rand_strided | |
| from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
| import torch | |
| from torch._inductor.triton_heuristics import grid | |
| @pointwise(size_hints=[16777216], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp16', 4: '*fp16', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: '*fp32', 9: '*fp32', 10: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10), equal_to_1=())]}) | |
| @triton.jit | |
| def triton_poi_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_71(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, in_ptr6, in_ptr7, in_ptr8, out_ptr0, xnumel, XBLOCK : tl.constexpr): | |
| xnumel = 12042240 | |
| xoffset = tl.program_id(0) * XBLOCK | |
| xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
| xmask = xindex < xnumel | |
| x3 = xindex | |
| x4 = (xindex // 196) | |
| x1 = (xindex // 196) % 480 | |
| tmp0 = tl.load(in_ptr0 + (x3), None).to(tl.float32) | |
| tmp1 = tl.load(in_ptr1 + (x4), None).to(tl.float32) | |
| tmp4 = tl.load(in_ptr2 + (x4), None).to(tl.float32) | |
| tmp8 = tl.load(in_ptr3 + (x3), None).to(tl.float32) | |
| tmp17 = tl.load(in_ptr4 + (x3), None).to(tl.float32) | |
| tmp19 = tl.load(in_ptr5 + (x1), None) | |
| tmp21 = tl.load(in_ptr6 + (x1), None) | |
| tmp24 = tl.load(in_ptr7 + (x1), None) | |
| tmp29 = tl.load(in_ptr8 + (x1), None) | |
| tmp2 = tl.sigmoid(tmp1) | |
| tmp3 = tmp0 * tmp2 | |
| tmp5 = 196.0 | |
| tmp6 = tmp4 / tmp5 | |
| tmp7 = tmp3 + tmp6 | |
| tmp9 = tl.sigmoid(tmp8) | |
| tmp10 = 1.0 | |
| tmp11 = tmp10 - tmp9 | |
| tmp12 = tmp8 * tmp11 | |
| tmp13 = tmp12 + tmp10 | |
| tmp14 = tmp9 * tmp13 | |
| tmp15 = tmp7 * tmp14 | |
| tmp16 = tmp15.to(tl.float32) | |
| tmp18 = tmp17.to(tl.float32) | |
| tmp20 = tmp18 - tmp19 | |
| tmp22 = 3.985969387755102e-05 | |
| tmp23 = tmp21 * tmp22 | |
| tmp25 = tmp24 * tmp24 | |
| tmp26 = tmp23 * tmp25 | |
| tmp27 = tmp20 * tmp26 | |
| tmp28 = tmp16 - tmp27 | |
| tmp30 = tmp29 * tmp22 | |
| tmp31 = tmp28 - tmp30 | |
| tl.store(out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp31, None) | |
| def get_args(): | |
| arg_0 = rand_strided((128, 480, 14, 14), (94080, 196, 14, 1), device='cuda:0', dtype=torch.float16) | |
| arg_1 = rand_strided((128, 480, 1, 1), (480, 1, 1, 1), device='cuda:0', dtype=torch.float16) | |
| arg_2 = rand_strided((128, 480, 1, 1), (480, 1, 1, 1), device='cuda:0', dtype=torch.float16) | |
| arg_3 = rand_strided((128, 480, 14, 14), (94080, 196, 14, 1), device='cuda:0', dtype=torch.float16) | |
| arg_4 = rand_strided((128, 480, 14, 14), (94080, 196, 14, 1), device='cuda:0', dtype=torch.float16) | |
| arg_5 = rand_strided((1, 480, 1, 1), (480, 1, 1, 1), device='cuda:0', dtype=torch.float32) | |
| arg_6 = rand_strided((480,), (1,), device='cuda:0', dtype=torch.float32) | |
| arg_7 = rand_strided((480,), (1,), device='cuda:0', dtype=torch.float32) | |
| arg_8 = rand_strided((480,), (1,), device='cuda:0', dtype=torch.float32) | |
| arg_9 = rand_strided((128, 480, 14, 14), (94080, 196, 14, 1), device='cuda:0', dtype=torch.float32) | |
| return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7, arg_8, arg_9, | |
| def call(args): | |
| with torch.cuda._DeviceGuard(0): | |
| torch.cuda.set_device(0) | |
| stream0 = get_cuda_stream(0) | |
| triton_poi_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_71.run(*args, 12042240, grid=grid(12042240), stream=stream0) | |
| def benchmark_all_configs(args): | |
| with torch.cuda._DeviceGuard(0): | |
| torch.cuda.set_device(0) | |
| return triton_poi_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_71.benchmark_all_configs(*args, 12042240, grid=grid(12042240)) | |
| if __name__ == '__main__': | |
| from torch._inductor.utils import get_num_bytes | |
| from triton.testing import do_bench | |
| args = get_args() | |
| ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
| num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9 | |
| gb_per_s = num_gb / (ms / 1e3) | |
| print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
| ''') | |
| # kernel path: /tmp/torchinductor_shunting/ya/cya3odioikqnsayl73izdekuxqhs5rqfkr2pft6i3rty3tyluanc.py | |
| # Original ATen: aten.convolution_backward | |
| # aten.convolution_backward => convolution_backward_40 | |
| # NOTE(review): auto-generated Inductor kernel; string body left byte-identical. | |
| # Extracts channels 360:480 (offsets 70560 = 360*196 and 360 in the loads) from | |
| # a (128, 480, 14, 14) fp32 gradient, scales each element by a per-channel | |
| # product in_ptr1[c] * in_ptr2[c], and downcasts to fp16 into a contiguous | |
| # (128, 120, 14, 14) buffer — the input-gradient slice for one branch of a | |
| # grouped/split convolution backward (TODO confirm split layout with caller). | |
| triton_poi_fused_convolution_backward_72 = async_compile.triton(''' | |
| import triton | |
| import triton.language as tl | |
| from torch._inductor.ir import ReductionHint | |
| from torch._inductor.ir import TileHint | |
| from torch._inductor.triton_heuristics import pointwise | |
| from torch._inductor.utils import instance_descriptor | |
| from torch._dynamo.testing import rand_strided | |
| from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
| import torch | |
| from torch._inductor.triton_heuristics import grid | |
| @pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp32', 2: '*fp32', 3: '*fp16', 4: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4), equal_to_1=())]}) | |
| @triton.jit | |
| def triton_poi_fused_convolution_backward_72(in_ptr0, in_ptr1, in_ptr2, out_ptr0, xnumel, XBLOCK : tl.constexpr): | |
| xnumel = 3010560 | |
| xoffset = tl.program_id(0) * XBLOCK | |
| xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
| xmask = xindex < xnumel | |
| x2 = (xindex // 23520) | |
| x3 = xindex % 23520 | |
| x1 = (xindex // 196) % 120 | |
| x4 = xindex | |
| tmp0 = tl.load(in_ptr0 + (70560 + x3 + (94080*x2)), None) | |
| tmp1 = tl.load(in_ptr1 + (360 + x1), None) | |
| tmp2 = tl.load(in_ptr2 + (360 + x1), None) | |
| tmp3 = tmp1 * tmp2 | |
| tmp4 = tmp0 * tmp3 | |
| tmp5 = tmp4.to(tl.float32) | |
| tl.store(out_ptr0 + (x4 + tl.zeros([XBLOCK], tl.int32)), tmp5, None) | |
| def get_args(): | |
| arg_0 = rand_strided((128, 480, 14, 14), (94080, 196, 14, 1), device='cuda:0', dtype=torch.float32) | |
| arg_1 = rand_strided((480,), (1,), device='cuda:0', dtype=torch.float32) | |
| arg_2 = rand_strided((480,), (1,), device='cuda:0', dtype=torch.float32) | |
| arg_3 = rand_strided((128, 120, 14, 14), (23520, 196, 14, 1), device='cuda:0', dtype=torch.float16) | |
| return arg_0, arg_1, arg_2, arg_3, | |
| def call(args): | |
| with torch.cuda._DeviceGuard(0): | |
| torch.cuda.set_device(0) | |
| stream0 = get_cuda_stream(0) | |
| triton_poi_fused_convolution_backward_72.run(*args, 3010560, grid=grid(3010560), stream=stream0) | |
| def benchmark_all_configs(args): | |
| with torch.cuda._DeviceGuard(0): | |
| torch.cuda.set_device(0) | |
| return triton_poi_fused_convolution_backward_72.benchmark_all_configs(*args, 3010560, grid=grid(3010560)) | |
| if __name__ == '__main__': | |
| from torch._inductor.utils import get_num_bytes | |
| from triton.testing import do_bench | |
| args = get_args() | |
| ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
| num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9 | |
| gb_per_s = num_gb / (ms / 1e3) | |
| print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
| ''') | |
| # kernel path: /tmp/torchinductor_shunting/fe/cfetbcwbedjsjshgd4bjuqpljqnapb5dcgp46ajfwwvj2dk7ne2i.py | |
| # Original ATen: aten._to_copy | |
| # aten._to_copy => convert_element_type_514 | |
| # NOTE(review): auto-generated Inductor kernel; string body left byte-identical. | |
| # Pointwise fp16 -> fp32 cast of 9720 elements shaped (120, 1, 9, 9) — | |
| # presumably a 9x9 depthwise conv weight (groups along dim 0) being upcast; | |
| # TODO confirm against the consuming convolution_backward call. | |
| triton_poi_fused__to_copy_73 = async_compile.triton(''' | |
| import triton | |
| import triton.language as tl | |
| from torch._inductor.ir import ReductionHint | |
| from torch._inductor.ir import TileHint | |
| from torch._inductor.triton_heuristics import pointwise | |
| from torch._inductor.utils import instance_descriptor | |
| from torch._dynamo.testing import rand_strided | |
| from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
| import torch | |
| from torch._inductor.triton_heuristics import grid | |
| @pointwise(size_hints=[16384], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1), equal_to_1=())]}) | |
| @triton.jit | |
| def triton_poi_fused__to_copy_73(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr): | |
| xnumel = 9720 | |
| xoffset = tl.program_id(0) * XBLOCK | |
| xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
| xmask = xindex < xnumel | |
| x0 = xindex | |
| tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32) | |
| tmp1 = tmp0.to(tl.float32) | |
| tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask) | |
| def get_args(): | |
| arg_0 = rand_strided((120, 1, 9, 9), (81, 81, 9, 1), device='cuda:0', dtype=torch.float16) | |
| arg_1 = rand_strided((120, 1, 9, 9), (81, 81, 9, 1), device='cuda:0', dtype=torch.float32) | |
| return arg_0, arg_1, | |
| def call(args): | |
| with torch.cuda._DeviceGuard(0): | |
| torch.cuda.set_device(0) | |
| stream0 = get_cuda_stream(0) | |
| triton_poi_fused__to_copy_73.run(*args, 9720, grid=grid(9720), stream=stream0) | |
| def benchmark_all_configs(args): | |
| with torch.cuda._DeviceGuard(0): | |
| torch.cuda.set_device(0) | |
| return triton_poi_fused__to_copy_73.benchmark_all_configs(*args, 9720, grid=grid(9720)) | |
| if __name__ == '__main__': | |
| from torch._inductor.utils import get_num_bytes | |
| from triton.testing import do_bench | |
| args = get_args() | |
| ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
| num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9 | |
| gb_per_s = num_gb / (ms / 1e3) | |
| print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
| ''') | |
# kernel path: /tmp/torchinductor_shunting/iq/ciqofepsagke43sfmdrbzwfpkfw4zvr3els5r3zwk7gigvnpcypw.py
# Original ATen: aten.convolution_backward
# aten.convolution_backward => convolution_backward_41
#
# Machine-generated Triton kernel: slices channels [240, 360) out of a
# (128, 480, 14, 14) fp32 gradient (element offset 47040 == 240*196, batch
# stride 94080), multiplies each element by the per-channel product
# in_ptr1[240+c] * in_ptr2[240+c], and stores the result as fp16 into a
# contiguous (128, 120, 14, 14) buffer. Do not hand-edit the string.
triton_poi_fused_convolution_backward_74 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp32', 2: '*fp32', 3: '*fp16', 4: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4), equal_to_1=())]})
@triton.jit
def triton_poi_fused_convolution_backward_74(in_ptr0, in_ptr1, in_ptr2, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 3010560
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x2 = (xindex // 23520)
    x3 = xindex % 23520
    x1 = (xindex // 196) % 120
    x4 = xindex
    tmp0 = tl.load(in_ptr0 + (47040 + x3 + (94080*x2)), None)
    tmp1 = tl.load(in_ptr1 + (240 + x1), None)
    tmp2 = tl.load(in_ptr2 + (240 + x1), None)
    tmp3 = tmp1 * tmp2
    tmp4 = tmp0 * tmp3
    tmp5 = tmp4.to(tl.float32)
    tl.store(out_ptr0 + (x4 + tl.zeros([XBLOCK], tl.int32)), tmp5, None)
def get_args():
    arg_0 = rand_strided((128, 480, 14, 14), (94080, 196, 14, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((480,), (1,), device='cuda:0', dtype=torch.float32)
    arg_2 = rand_strided((480,), (1,), device='cuda:0', dtype=torch.float32)
    arg_3 = rand_strided((128, 120, 14, 14), (23520, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1, arg_2, arg_3,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused_convolution_backward_74.run(*args, 3010560, grid=grid(3010560), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused_convolution_backward_74.benchmark_all_configs(*args, 3010560, grid=grid(3010560))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/dy/cdybcagplfulpsthbc6wmswmxelkrixxqqrrt72daichdjv2sefi.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_515
#
# Machine-generated Triton kernel: elementwise fp16 -> fp32 dtype conversion of
# a (120, 1, 7, 7) tensor (5880 elements). Do not hand-edit the string.
triton_poi_fused__to_copy_75 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[8192], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_75(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 5880
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
    arg_0 = rand_strided((120, 1, 7, 7), (49, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((120, 1, 7, 7), (49, 49, 7, 1), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_75.run(*args, 5880, grid=grid(5880), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_75.benchmark_all_configs(*args, 5880, grid=grid(5880))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/uy/cuyidmj6qefewok45jcr4rfmsmcj6hczovevzzaxuc52uf6hh3nc.py
# Original ATen: aten.convolution_backward
# aten.convolution_backward => convolution_backward_42
#
# Machine-generated Triton kernel: slices channels [120, 240) out of a
# (128, 480, 14, 14) fp32 gradient (element offset 23520 == 120*196, batch
# stride 94080), multiplies by the per-channel product in_ptr1[120+c] *
# in_ptr2[120+c], and stores as fp16 into a contiguous (128, 120, 14, 14)
# buffer. Do not hand-edit the string.
triton_poi_fused_convolution_backward_76 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp32', 2: '*fp32', 3: '*fp16', 4: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4), equal_to_1=())]})
@triton.jit
def triton_poi_fused_convolution_backward_76(in_ptr0, in_ptr1, in_ptr2, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 3010560
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x2 = (xindex // 23520)
    x3 = xindex % 23520
    x1 = (xindex // 196) % 120
    x4 = xindex
    tmp0 = tl.load(in_ptr0 + (23520 + x3 + (94080*x2)), None)
    tmp1 = tl.load(in_ptr1 + (120 + x1), None)
    tmp2 = tl.load(in_ptr2 + (120 + x1), None)
    tmp3 = tmp1 * tmp2
    tmp4 = tmp0 * tmp3
    tmp5 = tmp4.to(tl.float32)
    tl.store(out_ptr0 + (x4 + tl.zeros([XBLOCK], tl.int32)), tmp5, None)
def get_args():
    arg_0 = rand_strided((128, 480, 14, 14), (94080, 196, 14, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((480,), (1,), device='cuda:0', dtype=torch.float32)
    arg_2 = rand_strided((480,), (1,), device='cuda:0', dtype=torch.float32)
    arg_3 = rand_strided((128, 120, 14, 14), (23520, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1, arg_2, arg_3,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused_convolution_backward_76.run(*args, 3010560, grid=grid(3010560), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused_convolution_backward_76.benchmark_all_configs(*args, 3010560, grid=grid(3010560))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/ux/cuxkdni67km5gfbdihbo5i44iivskoy6hdt7gubbh5qgb5mkwsob.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_516
#
# Machine-generated Triton kernel: elementwise fp16 -> fp32 dtype conversion of
# a (120, 1, 5, 5) tensor (3000 elements). Do not hand-edit the string.
triton_poi_fused__to_copy_77 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4096], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_77(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 3000
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
    arg_0 = rand_strided((120, 1, 5, 5), (25, 25, 5, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((120, 1, 5, 5), (25, 25, 5, 1), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_77.run(*args, 3000, grid=grid(3000), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_77.benchmark_all_configs(*args, 3000, grid=grid(3000))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/4r/c4rld6qtfki5aedwz7g3eqwmnt7w4mmzwymiotpz4adjvg43hqwk.py
# Original ATen: aten.convolution_backward
# aten.convolution_backward => convolution_backward_43
#
# Machine-generated Triton kernel: slices channels [0, 120) out of a
# (128, 480, 14, 14) fp32 gradient (batch stride 94080), multiplies by the
# per-channel product in_ptr1[c] * in_ptr2[c], and stores as fp16 into a
# contiguous (128, 120, 14, 14) buffer. Do not hand-edit the string.
triton_poi_fused_convolution_backward_78 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp32', 2: '*fp32', 3: '*fp16', 4: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4), equal_to_1=())]})
@triton.jit
def triton_poi_fused_convolution_backward_78(in_ptr0, in_ptr1, in_ptr2, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 3010560
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x2 = (xindex // 23520)
    x3 = xindex % 23520
    x1 = (xindex // 196) % 120
    x4 = xindex
    tmp0 = tl.load(in_ptr0 + (x3 + (94080*x2)), None)
    tmp1 = tl.load(in_ptr1 + (x1), None)
    tmp2 = tl.load(in_ptr2 + (x1), None)
    tmp3 = tmp1 * tmp2
    tmp4 = tmp0 * tmp3
    tmp5 = tmp4.to(tl.float32)
    tl.store(out_ptr0 + (x4 + tl.zeros([XBLOCK], tl.int32)), tmp5, None)
def get_args():
    arg_0 = rand_strided((128, 480, 14, 14), (94080, 196, 14, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((480,), (1,), device='cuda:0', dtype=torch.float32)
    arg_2 = rand_strided((480,), (1,), device='cuda:0', dtype=torch.float32)
    arg_3 = rand_strided((128, 120, 14, 14), (23520, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1, arg_2, arg_3,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused_convolution_backward_78.run(*args, 3010560, grid=grid(3010560), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused_convolution_backward_78.benchmark_all_configs(*args, 3010560, grid=grid(3010560))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/ri/crixsdzqwpepdejipqi3ixt4vlyrmrskq5hgkzn7jumpuurao5h5.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_517
#
# Machine-generated Triton kernel: elementwise fp16 -> fp32 dtype conversion of
# a (120, 1, 3, 3) tensor (1080 elements). Do not hand-edit the string.
triton_poi_fused__to_copy_79 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[2048], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_79(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 1080
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
    arg_0 = rand_strided((120, 1, 3, 3), (9, 9, 3, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((120, 1, 3, 3), (9, 9, 3, 1), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_79.run(*args, 1080, grid=grid(1080), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_79.benchmark_all_configs(*args, 1080, grid=grid(1080))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/ty/ctycfmevxfpn542kh5agvjqilg3o5r77udr2h3wyqu23mqhlqdxf.py
# Original ATen: aten.cat
# aten.cat => cat_49
#
# Machine-generated Triton kernel implementing one branch of a channel concat:
# copies a contiguous (128, 120, 14, 14) fp16 tensor into the first
# 23520-element (120-channel) slot of each batch of an output whose batch
# stride is 94080 (i.e. a 480-channel destination). Do not hand-edit the string.
triton_poi_fused_cat_80 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_cat_80(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 3010560
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x2 = xindex
    x0 = xindex % 23520
    x1 = (xindex // 23520)
    tmp0 = tl.load(in_ptr0 + (x2), None).to(tl.float32)
    tl.store(out_ptr0 + (x0 + (94080*x1) + tl.zeros([XBLOCK], tl.int32)), tmp0, None)
def get_args():
    arg_0 = rand_strided((128, 120, 14, 14), (23520, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 120, 14, 14), (94080, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused_cat_80.run(*args, 3010560, grid=grid(3010560), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused_cat_80.benchmark_all_configs(*args, 3010560, grid=grid(3010560))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/7i/c7iiiqxahzgxonlzcskqwzfumjrx6cupqeinuz5pkzk66wqnnnyc.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.mul, aten.native_batch_norm_backward
# aten._native_batch_norm_legit_functional => convert_element_type_284
# aten.mul => mul_669
# aten.native_batch_norm_backward => convert_element_type_518, mul_670, mul_678, sub_138, sum_47, sum_48
#
# Machine-generated Triton reduction kernel for batch-norm backward: for each of
# 480 channels it reduces over 25088 elements (batch 128 x 14 x 14 spatial),
# computing out_ptr0 = sum(in0 * in1), out_ptr1 = sum((in0 * in1) * (in2 - mean))
# where mean comes from in_ptr3, and out_ptr2 = out_ptr1 * in_ptr4.
# Do not hand-edit the string.
triton_red_fused__native_batch_norm_legit_functional_mul_native_batch_norm_backward_81 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@reduction(
    size_hints=[512, 32768],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp32', 4: '*fp32', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: 'i32', 9: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_mul_native_batch_norm_backward_81(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, out_ptr0, out_ptr1, out_ptr2, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
    xnumel = 480
    rnumel = 25088
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rbase = tl.arange(0, RBLOCK)[None, :]
    x0 = xindex
    _tmp4 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
    tmp7 = tl.load(in_ptr3 + (x0), xmask)
    _tmp10 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
    for roffset in range(0, rnumel, RBLOCK):
        rindex = roffset + rbase
        rmask = rindex < rnumel
        r1 = rindex % 196
        r2 = (rindex // 196)
        tmp0 = tl.load(in_ptr0 + (r1 + (196*x0) + (94080*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp1 = tl.load(in_ptr1 + (r1 + (196*x0) + (94080*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp5 = tl.load(in_ptr2 + (r1 + (196*x0) + (94080*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp2 = tmp0 * tmp1
        tmp3 = tmp2.to(tl.float32)
        _tmp4 = tl.where(rmask & xmask, _tmp4 + tmp3, _tmp4)
        tmp6 = tmp5.to(tl.float32)
        tmp8 = tmp6 - tmp7
        tmp9 = tmp3 * tmp8
        _tmp10 = tl.where(rmask & xmask, _tmp10 + tmp9, _tmp10)
    tmp4 = tl.sum(_tmp4, 1)[:, None]
    tl.store(out_ptr0 + x0, tmp4, xmask)
    tmp10 = tl.sum(_tmp10, 1)[:, None]
    tl.store(out_ptr1 + x0, tmp10, xmask)
    tmp11 = tl.load(in_ptr4 + (x0), xmask)
    tmp12 = tmp10 * tmp11
    tl.store(out_ptr2 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp12, xmask)
def get_args():
    arg_0 = rand_strided((128, 480, 14, 14), (94080, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 480, 14, 14), (94080, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_2 = rand_strided((128, 480, 14, 14), (94080, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_3 = rand_strided((1, 480, 1, 1), (480, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    arg_4 = rand_strided((480,), (1,), device='cuda:0', dtype=torch.float32)
    arg_5 = rand_strided((480,), (1,), device='cuda:0', dtype=torch.float32)
    arg_6 = rand_strided((480,), (1,), device='cuda:0', dtype=torch.float32)
    arg_7 = rand_strided((480,), (1,), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_red_fused__native_batch_norm_legit_functional_mul_native_batch_norm_backward_81.run(*args, 480, 25088, grid=grid(480), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_red_fused__native_batch_norm_legit_functional_mul_native_batch_norm_backward_81.benchmark_all_configs(*args, 480, 25088, grid=grid(480))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/v4/cv4k7goz3fg6grlbcjwhdwg6ck3vmlxtj3l76l5phqjwc5cao5b3.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.mul, aten.native_batch_norm_backward
# aten._native_batch_norm_legit_functional => convert_element_type_284
# aten.mul => mul_669
# aten.native_batch_norm_backward => convert_element_type_518, convert_element_type_520, mul_676, mul_677, sub_138, sub_140, sub_141
#
# Machine-generated Triton pointwise kernel: the elementwise stage of batch-norm
# backward over a (128, 480, 14, 14) fp16 tensor, updated in place
# (in_out_ptr0 is listed in mutated_arg_names). The hard-coded constant
# 3.985969387755102e-05 equals 1/25088, i.e. 1/(128*14*14), the reduction size
# used by the matching per-channel reduction kernel. Do not hand-edit the string.
triton_poi_fused__native_batch_norm_legit_functional_mul_native_batch_norm_backward_82 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[16777216], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp32', 4: '*fp32', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8), equal_to_1=())]})
@triton.jit
def triton_poi_fused__native_batch_norm_legit_functional_mul_native_batch_norm_backward_82(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, in_ptr6, xnumel, XBLOCK : tl.constexpr):
    xnumel = 12042240
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x3 = xindex
    x1 = (xindex // 196) % 480
    tmp0 = tl.load(in_out_ptr0 + (x3), None).to(tl.float32)
    tmp1 = tl.load(in_ptr0 + (x3), None).to(tl.float32)
    tmp4 = tl.load(in_ptr1 + (x3), None).to(tl.float32)
    tmp6 = tl.load(in_ptr2 + (x1), None)
    tmp8 = tl.load(in_ptr3 + (x1), None)
    tmp11 = tl.load(in_ptr4 + (x1), None)
    tmp16 = tl.load(in_ptr5 + (x1), None)
    tmp19 = tl.load(in_ptr6 + (x1), None)
    tmp2 = tmp0 * tmp1
    tmp3 = tmp2.to(tl.float32)
    tmp5 = tmp4.to(tl.float32)
    tmp7 = tmp5 - tmp6
    tmp9 = 3.985969387755102e-05
    tmp10 = tmp8 * tmp9
    tmp12 = tmp11 * tmp11
    tmp13 = tmp10 * tmp12
    tmp14 = tmp7 * tmp13
    tmp15 = tmp3 - tmp14
    tmp17 = tmp16 * tmp9
    tmp18 = tmp15 - tmp17
    tmp20 = tmp11 * tmp19
    tmp21 = tmp18 * tmp20
    tmp22 = tmp21.to(tl.float32)
    tl.store(in_out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp22, None)
def get_args():
    arg_0 = rand_strided((128, 480, 14, 14), (94080, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 480, 14, 14), (94080, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_2 = rand_strided((128, 480, 14, 14), (94080, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_3 = rand_strided((1, 480, 1, 1), (480, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    arg_4 = rand_strided((480,), (1,), device='cuda:0', dtype=torch.float32)
    arg_5 = rand_strided((480,), (1,), device='cuda:0', dtype=torch.float32)
    arg_6 = rand_strided((480,), (1,), device='cuda:0', dtype=torch.float32)
    arg_7 = rand_strided((480,), (1,), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__native_batch_norm_legit_functional_mul_native_batch_norm_backward_82.run(*args, 12042240, grid=grid(12042240), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__native_batch_norm_legit_functional_mul_native_batch_norm_backward_82.benchmark_all_configs(*args, 12042240, grid=grid(12042240))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/om/comunyk6xj4yo26dm6rnf6zkadn3bzlwf2dwxosduxrcmjkwwa6u.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_521
#
# Machine-generated Triton kernel: elementwise fp16 -> fp32 dtype conversion of
# a (240, 80, 1, 1) tensor (19200 elements). Do not hand-edit the string.
triton_poi_fused__to_copy_83 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[32768], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_83(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 19200
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
    arg_0 = rand_strided((240, 80, 1, 1), (80, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((240, 80, 1, 1), (80, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_83.run(*args, 19200, grid=grid(19200), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_83.benchmark_all_configs(*args, 19200, grid=grid(19200))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/2y/c2yommjpqgw2nsvcapep2mob6welg6j7akfq7uc4xaaiuzwdalt7.py
# Original ATen: aten.cat
# aten.cat => cat_50
# NOTE(review): Inductor-generated Triton kernel, compiled lazily via AsyncCompile.
# Copies a contiguous (128, 80, 14, 14) fp16 tensor into an 80-channel slice of a
# 160-channel concat buffer: the output's batch stride is 31360 = 2 * 15680, so each
# batch's 15680 elements land in one half of the concatenated channel dimension.
# Loads/stores pass mask None instead of xmask — presumably because xnumel (2007040)
# is known to tile evenly for the candidate XBLOCKs; verify before changing size_hints.
# The embedded string is the exact source handed to the Triton compiler (and is
# presumably hashed for the compile cache) — treat it as data, not cosmetic text.
triton_poi_fused_cat_84 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[2097152], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_cat_84(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 2007040
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x2 = xindex
    x0 = xindex % 15680
    x1 = (xindex // 15680)
    tmp0 = tl.load(in_ptr0 + (x2), None).to(tl.float32)
    tl.store(out_ptr0 + (x0 + (31360*x1) + tl.zeros([XBLOCK], tl.int32)), tmp0, None)
def get_args():
    arg_0 = rand_strided((128, 80, 14, 14), (15680, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 80, 14, 14), (31360, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused_cat_84.run(*args, 2007040, grid=grid(2007040), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused_cat_84.benchmark_all_configs(*args, 2007040, grid=grid(2007040))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/zy/czyhduwjwdj7bjl64zkqqaz3rymv5xt54uovrwuvmbs3ylr4n3xb.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.add, aten.native_batch_norm_backward
# aten._native_batch_norm_legit_functional => convert_element_type_280
# aten.add => add_327
# aten.native_batch_norm_backward => convert_element_type_523, mul_679, sub_142, sum_49, sum_50
# NOTE(review): Split reduction for batch-norm backward.  xnumel = 640 =
# 160 channels x 4 batch slices (each slice covers 32 of 128 samples, since the
# slice stride 1003520 = 31360 * 32); rnumel = 6272 = 32 samples * 196 spatial
# positions.  With dy = in_ptr0 + in_ptr1 (two fp16 grads, upcast to fp32):
# out_ptr0[c, s] = sum(dy) and out_ptr1[c, s] = sum(dy * (x - mean[c])), where
# x comes from in_ptr2 and mean from in_ptr3.  The (160, 4) partials are
# presumably collapsed across the 4 slices by a follow-up kernel — not visible here.
triton_red_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_85 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@reduction(
    size_hints=[1024, 8192],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp32', 4: '*fp32', 5: '*fp32', 6: 'i32', 7: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_85(in_ptr0, in_ptr1, in_ptr2, in_ptr3, out_ptr0, out_ptr1, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
    xnumel = 640
    rnumel = 6272
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rbase = tl.arange(0, RBLOCK)[None, :]
    x0 = xindex % 160
    x1 = (xindex // 160)
    _tmp4 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
    x3 = xindex
    tmp7 = tl.load(in_ptr3 + (x0), xmask)
    _tmp10 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
    for roffset in range(0, rnumel, RBLOCK):
        rindex = roffset + rbase
        rmask = rindex < rnumel
        r2 = rindex
        tmp0 = tl.load(in_ptr0 + ((196*x0) + (31360*(r2 // 196)) + (1003520*x1) + (r2 % 196)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp1 = tl.load(in_ptr1 + ((196*x0) + (31360*(r2 // 196)) + (1003520*x1) + (r2 % 196)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp5 = tl.load(in_ptr2 + ((196*x0) + (31360*(r2 // 196)) + (1003520*x1) + (r2 % 196)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp2 = tmp0 + tmp1
        tmp3 = tmp2.to(tl.float32)
        _tmp4 = tl.where(rmask & xmask, _tmp4 + tmp3, _tmp4)
        tmp6 = tmp5.to(tl.float32)
        tmp8 = tmp6 - tmp7
        tmp9 = tmp3 * tmp8
        _tmp10 = tl.where(rmask & xmask, _tmp10 + tmp9, _tmp10)
    tmp4 = tl.sum(_tmp4, 1)[:, None]
    tl.store(out_ptr0 + x3, tmp4, xmask)
    tmp10 = tl.sum(_tmp10, 1)[:, None]
    tl.store(out_ptr1 + x3, tmp10, xmask)
def get_args():
    arg_0 = rand_strided((128, 160, 14, 14), (31360, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 160, 14, 14), (31360, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_2 = rand_strided((128, 160, 14, 14), (31360, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_3 = rand_strided((1, 160, 1, 1), (160, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    arg_4 = rand_strided((160, 4), (1, 160), device='cuda:0', dtype=torch.float32)
    arg_5 = rand_strided((160, 4), (1, 160), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_red_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_85.run(*args, 640, 6272, grid=grid(640), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_red_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_85.benchmark_all_configs(*args, 640, 6272, grid=grid(640))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/5n/c5ngqp5cp6ieu5sc7n3ylgpltjkzcel7tc4dwx3ctbhj3ayamet2.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.add, aten.native_batch_norm_backward
# aten._native_batch_norm_legit_functional => convert_element_type_280
# aten.add => add_327
# aten.native_batch_norm_backward => convert_element_type_523, convert_element_type_525, mul_685, mul_686, sub_142, sub_144, sub_145
# NOTE(review): Elementwise grad-input step of batch-norm backward over
# 4014080 = 128 * 160 * 196 elements, with dy = in_ptr0 + in_ptr1 (two summed
# fp16 grads).  Computes dx = (dy - (x - mean) * (sum_dy_xmu/N) * invstd^2
# - sum_dy/N) * invstd * weight; the constant 3.985969387755102e-05 equals
# 1/25088 = 1/(128 * 196), the per-channel reduction count.  Result is cast
# back to fp16 (out_ptr0 is *fp16).  in_ptr4..7 are the per-channel reduced
# stats — assumed to be (sum_dy_xmu, invstd, sum_dy, weight) in some order
# matching the formula above; confirm against the producing kernels.
triton_poi_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_86 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp32', 4: '*fp32', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: '*fp16', 9: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9), equal_to_1=())]})
@triton.jit
def triton_poi_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_86(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, in_ptr6, in_ptr7, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 4014080
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x3 = xindex
    x1 = (xindex // 196) % 160
    tmp0 = tl.load(in_ptr0 + (x3), None).to(tl.float32)
    tmp1 = tl.load(in_ptr1 + (x3), None).to(tl.float32)
    tmp4 = tl.load(in_ptr2 + (x3), None).to(tl.float32)
    tmp6 = tl.load(in_ptr3 + (x1), None)
    tmp8 = tl.load(in_ptr4 + (x1), None)
    tmp11 = tl.load(in_ptr5 + (x1), None)
    tmp16 = tl.load(in_ptr6 + (x1), None)
    tmp19 = tl.load(in_ptr7 + (x1), None)
    tmp2 = tmp0 + tmp1
    tmp3 = tmp2.to(tl.float32)
    tmp5 = tmp4.to(tl.float32)
    tmp7 = tmp5 - tmp6
    tmp9 = 3.985969387755102e-05
    tmp10 = tmp8 * tmp9
    tmp12 = tmp11 * tmp11
    tmp13 = tmp10 * tmp12
    tmp14 = tmp7 * tmp13
    tmp15 = tmp3 - tmp14
    tmp17 = tmp16 * tmp9
    tmp18 = tmp15 - tmp17
    tmp20 = tmp11 * tmp19
    tmp21 = tmp18 * tmp20
    tmp22 = tmp21.to(tl.float32)
    tl.store(out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp22, None)
def get_args():
    arg_0 = rand_strided((128, 160, 14, 14), (31360, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 160, 14, 14), (31360, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_2 = rand_strided((128, 160, 14, 14), (31360, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_3 = rand_strided((1, 160, 1, 1), (160, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    arg_4 = rand_strided((160,), (1,), device='cuda:0', dtype=torch.float32)
    arg_5 = rand_strided((160,), (1,), device='cuda:0', dtype=torch.float32)
    arg_6 = rand_strided((160,), (1,), device='cuda:0', dtype=torch.float32)
    arg_7 = rand_strided((160,), (1,), device='cuda:0', dtype=torch.float32)
    arg_8 = rand_strided((128, 160, 14, 14), (31360, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7, arg_8,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_86.run(*args, 4014080, grid=grid(4014080), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_86.benchmark_all_configs(*args, 4014080, grid=grid(4014080))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/mi/cmi2g34rcnn4o7hkiq27yemv23mvhmf7ilu2wl6n5igdib4lbn3v.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.add, aten.native_batch_norm_backward
# aten._native_batch_norm_legit_functional => convert_element_type_256
# aten.add => add_327, add_332
# aten.native_batch_norm_backward => convert_element_type_547, mul_719, sub_158, sum_58, sum_59
# NOTE(review): Same split-reduction shape as kernel _85 (640 = 160 channels x 4
# batch slices, rnumel = 6272 = 32 * 196), but the upstream gradient is the sum
# of THREE fp16 tensors (dy = in_ptr0 + in_ptr1 + in_ptr2 — residual-add chain
# add_327 + add_332).  Outputs: out_ptr0 = sum(dy), out_ptr1 =
# sum(dy * (x - mean)) per (channel, slice), with x from in_ptr3 and the
# per-channel mean from in_ptr4.
triton_red_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_87 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@reduction(
    size_hints=[1024, 8192],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp16', 4: '*fp32', 5: '*fp32', 6: '*fp32', 7: 'i32', 8: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_87(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, out_ptr0, out_ptr1, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
    xnumel = 640
    rnumel = 6272
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rbase = tl.arange(0, RBLOCK)[None, :]
    x0 = xindex % 160
    x1 = (xindex // 160)
    _tmp6 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
    x3 = xindex
    tmp9 = tl.load(in_ptr4 + (x0), xmask)
    _tmp12 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
    for roffset in range(0, rnumel, RBLOCK):
        rindex = roffset + rbase
        rmask = rindex < rnumel
        r2 = rindex
        tmp0 = tl.load(in_ptr0 + ((196*x0) + (31360*(r2 // 196)) + (1003520*x1) + (r2 % 196)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp1 = tl.load(in_ptr1 + ((196*x0) + (31360*(r2 // 196)) + (1003520*x1) + (r2 % 196)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp3 = tl.load(in_ptr2 + ((196*x0) + (31360*(r2 // 196)) + (1003520*x1) + (r2 % 196)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp7 = tl.load(in_ptr3 + ((196*x0) + (31360*(r2 // 196)) + (1003520*x1) + (r2 % 196)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp2 = tmp0 + tmp1
        tmp4 = tmp2 + tmp3
        tmp5 = tmp4.to(tl.float32)
        _tmp6 = tl.where(rmask & xmask, _tmp6 + tmp5, _tmp6)
        tmp8 = tmp7.to(tl.float32)
        tmp10 = tmp8 - tmp9
        tmp11 = tmp5 * tmp10
        _tmp12 = tl.where(rmask & xmask, _tmp12 + tmp11, _tmp12)
    tmp6 = tl.sum(_tmp6, 1)[:, None]
    tl.store(out_ptr0 + x3, tmp6, xmask)
    tmp12 = tl.sum(_tmp12, 1)[:, None]
    tl.store(out_ptr1 + x3, tmp12, xmask)
def get_args():
    arg_0 = rand_strided((128, 160, 14, 14), (31360, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 160, 14, 14), (31360, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_2 = rand_strided((128, 160, 14, 14), (31360, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_3 = rand_strided((128, 160, 14, 14), (31360, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_4 = rand_strided((1, 160, 1, 1), (160, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    arg_5 = rand_strided((160, 4), (1, 160), device='cuda:0', dtype=torch.float32)
    arg_6 = rand_strided((160, 4), (1, 160), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_red_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_87.run(*args, 640, 6272, grid=grid(640), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_red_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_87.benchmark_all_configs(*args, 640, 6272, grid=grid(640))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/5m/c5msj6p2vdag4fs3dze5gd2kastkwiprxfqobb4ipbu3zwk6y4tb.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.add, aten.native_batch_norm_backward
# aten._native_batch_norm_legit_functional => convert_element_type_256
# aten.add => add_327, add_332
# aten.native_batch_norm_backward => convert_element_type_547, mul_725, mul_726, sub_158, sub_160, sub_161
# NOTE(review): Elementwise grad-input of batch-norm backward — same formula as
# kernel _86 but with dy = in_ptr0 + in_ptr1 + in_ptr2 (three summed fp16 grads)
# and the result stored in fp32 (no final .to(tl.float32->fp16) cast; out_ptr0
# and arg_9 are fp32).  3.985969387755102e-05 == 1/(128 * 196), the per-channel
# element count of the reduction.
triton_poi_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_88 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp16', 4: '*fp32', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: '*fp32', 9: '*fp32', 10: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10), equal_to_1=())]})
@triton.jit
def triton_poi_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_88(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, in_ptr6, in_ptr7, in_ptr8, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 4014080
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x3 = xindex
    x1 = (xindex // 196) % 160
    tmp0 = tl.load(in_ptr0 + (x3), None).to(tl.float32)
    tmp1 = tl.load(in_ptr1 + (x3), None).to(tl.float32)
    tmp3 = tl.load(in_ptr2 + (x3), None).to(tl.float32)
    tmp6 = tl.load(in_ptr3 + (x3), None).to(tl.float32)
    tmp8 = tl.load(in_ptr4 + (x1), None)
    tmp10 = tl.load(in_ptr5 + (x1), None)
    tmp13 = tl.load(in_ptr6 + (x1), None)
    tmp18 = tl.load(in_ptr7 + (x1), None)
    tmp21 = tl.load(in_ptr8 + (x1), None)
    tmp2 = tmp0 + tmp1
    tmp4 = tmp2 + tmp3
    tmp5 = tmp4.to(tl.float32)
    tmp7 = tmp6.to(tl.float32)
    tmp9 = tmp7 - tmp8
    tmp11 = 3.985969387755102e-05
    tmp12 = tmp10 * tmp11
    tmp14 = tmp13 * tmp13
    tmp15 = tmp12 * tmp14
    tmp16 = tmp9 * tmp15
    tmp17 = tmp5 - tmp16
    tmp19 = tmp18 * tmp11
    tmp20 = tmp17 - tmp19
    tmp22 = tmp13 * tmp21
    tmp23 = tmp20 * tmp22
    tl.store(out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp23, None)
def get_args():
    arg_0 = rand_strided((128, 160, 14, 14), (31360, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 160, 14, 14), (31360, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_2 = rand_strided((128, 160, 14, 14), (31360, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_3 = rand_strided((128, 160, 14, 14), (31360, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_4 = rand_strided((1, 160, 1, 1), (160, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    arg_5 = rand_strided((160,), (1,), device='cuda:0', dtype=torch.float32)
    arg_6 = rand_strided((160,), (1,), device='cuda:0', dtype=torch.float32)
    arg_7 = rand_strided((160,), (1,), device='cuda:0', dtype=torch.float32)
    arg_8 = rand_strided((160,), (1,), device='cuda:0', dtype=torch.float32)
    arg_9 = rand_strided((128, 160, 14, 14), (31360, 196, 14, 1), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7, arg_8, arg_9,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_88.run(*args, 4014080, grid=grid(4014080), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_88.benchmark_all_configs(*args, 4014080, grid=grid(4014080))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/33/c33mk7jbf3hbussw7bzc7k6jlfrdr6q6k2jcx5wwhkb43usq3f3k.py
# Original ATen: aten.convolution_backward
# aten.convolution_backward => convolution_backward_56
# NOTE(review): Extracts the SECOND 80-channel half (base offset 15680) of a
# (128, 160, 14, 14) fp32 gradient buffer and downcasts to fp16 — the split of
# a concatenated gradient back to one conv branch's input grad.  Companion
# kernel _90 below extracts the first half (offset 0); keep the two offsets in
# sync with the cat layout written by kernel _84.
triton_poi_fused_convolution_backward_89 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[2097152], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_convolution_backward_89(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 2007040
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex % 15680
    x1 = (xindex // 15680)
    x2 = xindex
    tmp0 = tl.load(in_ptr0 + (15680 + x0 + (31360*x1)), None)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp1, None)
def get_args():
    arg_0 = rand_strided((128, 160, 14, 14), (31360, 196, 14, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((128, 80, 14, 14), (15680, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused_convolution_backward_89.run(*args, 2007040, grid=grid(2007040), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused_convolution_backward_89.benchmark_all_configs(*args, 2007040, grid=grid(2007040))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/qs/cqsgmcz5zjvb6ko7kxhkl5io7i4gmvcpy5zxwjwu7qjx5oya6hd3.py
# Original ATen: aten.convolution_backward
# aten.convolution_backward => convolution_backward_57
# NOTE(review): Extracts the FIRST 80-channel half (base offset 0) of a
# (128, 160, 14, 14) fp32 gradient buffer and downcasts to fp16 — the other
# half of the cat-gradient split performed by kernel _89 above (which reads
# from offset 15680).
triton_poi_fused_convolution_backward_90 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[2097152], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_convolution_backward_90(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 2007040
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex % 15680
    x1 = (xindex // 15680)
    x2 = xindex
    tmp0 = tl.load(in_ptr0 + (x0 + (31360*x1)), None)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp1, None)
def get_args():
    arg_0 = rand_strided((128, 160, 14, 14), (31360, 196, 14, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((128, 80, 14, 14), (15680, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused_convolution_backward_90.run(*args, 2007040, grid=grid(2007040), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused_convolution_backward_90.benchmark_all_configs(*args, 2007040, grid=grid(2007040))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/is/cis35c7i52ytm3zlhduqhkylrq33chwilt3nh2nr2nfagpbn72d4.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.add, aten.native_batch_norm_backward
# aten._native_batch_norm_legit_functional => convert_element_type_232
# aten.add => add_327, add_332, add_337
# aten.native_batch_norm_backward => convert_element_type_571, mul_759, sub_174, sum_67, sum_68
# NOTE(review): Same split-reduction pattern as kernels _85/_87 (640 = 160
# channels x 4 batch slices; rnumel = 6272 = 32 * 196), but the upstream
# gradient is the sum of FOUR fp16 tensors (dy = in_ptr0 + in_ptr1 + in_ptr2 +
# in_ptr3 — residual chain add_327/add_332/add_337).  out_ptr0 = sum(dy),
# out_ptr1 = sum(dy * (x - mean)) per (channel, slice), with x from in_ptr4
# and the per-channel mean from in_ptr5.
triton_red_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_91 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@reduction(
    size_hints=[1024, 8192],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp16', 4: '*fp16', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: 'i32', 9: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_91(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, out_ptr0, out_ptr1, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
    xnumel = 640
    rnumel = 6272
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rbase = tl.arange(0, RBLOCK)[None, :]
    x0 = xindex % 160
    x1 = (xindex // 160)
    _tmp8 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
    x3 = xindex
    tmp11 = tl.load(in_ptr5 + (x0), xmask)
    _tmp14 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
    for roffset in range(0, rnumel, RBLOCK):
        rindex = roffset + rbase
        rmask = rindex < rnumel
        r2 = rindex
        tmp0 = tl.load(in_ptr0 + ((196*x0) + (31360*(r2 // 196)) + (1003520*x1) + (r2 % 196)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp1 = tl.load(in_ptr1 + ((196*x0) + (31360*(r2 // 196)) + (1003520*x1) + (r2 % 196)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp3 = tl.load(in_ptr2 + ((196*x0) + (31360*(r2 // 196)) + (1003520*x1) + (r2 % 196)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp5 = tl.load(in_ptr3 + ((196*x0) + (31360*(r2 // 196)) + (1003520*x1) + (r2 % 196)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp9 = tl.load(in_ptr4 + ((196*x0) + (31360*(r2 // 196)) + (1003520*x1) + (r2 % 196)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp2 = tmp0 + tmp1
        tmp4 = tmp2 + tmp3
        tmp6 = tmp4 + tmp5
        tmp7 = tmp6.to(tl.float32)
        _tmp8 = tl.where(rmask & xmask, _tmp8 + tmp7, _tmp8)
        tmp10 = tmp9.to(tl.float32)
        tmp12 = tmp10 - tmp11
        tmp13 = tmp7 * tmp12
        _tmp14 = tl.where(rmask & xmask, _tmp14 + tmp13, _tmp14)
    tmp8 = tl.sum(_tmp8, 1)[:, None]
    tl.store(out_ptr0 + x3, tmp8, xmask)
    tmp14 = tl.sum(_tmp14, 1)[:, None]
    tl.store(out_ptr1 + x3, tmp14, xmask)
def get_args():
    arg_0 = rand_strided((128, 160, 14, 14), (31360, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 160, 14, 14), (31360, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_2 = rand_strided((128, 160, 14, 14), (31360, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_3 = rand_strided((128, 160, 14, 14), (31360, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_4 = rand_strided((128, 160, 14, 14), (31360, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_5 = rand_strided((1, 160, 1, 1), (160, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    arg_6 = rand_strided((160, 4), (1, 160), device='cuda:0', dtype=torch.float32)
    arg_7 = rand_strided((160, 4), (1, 160), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_red_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_91.run(*args, 640, 6272, grid=grid(640), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_red_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_91.benchmark_all_configs(*args, 640, 6272, grid=grid(640))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/lj/cljkpj67jhpiomhay7l2fuqfmjz2fox7ndvjbixox242vrhfypjf.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.add, aten.convolution_backward, aten.native_batch_norm_backward
# aten._native_batch_norm_legit_functional => convert_element_type_232
# aten.add => add_327, add_332, add_337
# aten.convolution_backward => convolution_backward_66
# aten.native_batch_norm_backward => convert_element_type_571, convert_element_type_573, mul_765, mul_766, sub_174, sub_176, sub_177
# NOTE(review): TorchInductor-generated kernel; the triple-quoted source below is
# compiled separately by Triton at runtime, so its contents must stay byte-identical.
# Pointwise pass over 4,014,080 elements (128x160x14x14 NCHW fp16): accumulates four
# incoming gradient tensors (in_ptr0..in_ptr3), then applies the batch-norm backward
# transform per channel (x1 indexes the 160 channels) using the saved activation
# (in_ptr4) and fp32 per-channel statistics (in_ptr5..in_ptr9). The constant
# 3.985969387755102e-05 is 1/25088 = 1/(128*14*14), the per-channel reduction size.
# The fp16 result (out_ptr1) feeds convolution_backward. The string also embeds a
# standalone benchmark harness (get_args / call / benchmark_all_configs / __main__).
triton_poi_fused__native_batch_norm_legit_functional_add_convolution_backward_native_batch_norm_backward_92 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp16', 4: '*fp16', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: '*fp32', 9: '*fp32', 10: '*fp16', 11: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11), equal_to_1=())]})
@triton.jit
def triton_poi_fused__native_batch_norm_legit_functional_add_convolution_backward_native_batch_norm_backward_92(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, in_ptr6, in_ptr7, in_ptr8, in_ptr9, out_ptr1, xnumel, XBLOCK : tl.constexpr):
    xnumel = 4014080
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x3 = xindex
    x1 = (xindex // 196) % 160
    tmp0 = tl.load(in_ptr0 + (x3), None).to(tl.float32)
    tmp1 = tl.load(in_ptr1 + (x3), None).to(tl.float32)
    tmp3 = tl.load(in_ptr2 + (x3), None).to(tl.float32)
    tmp5 = tl.load(in_ptr3 + (x3), None).to(tl.float32)
    tmp8 = tl.load(in_ptr4 + (x3), None).to(tl.float32)
    tmp10 = tl.load(in_ptr5 + (x1), None)
    tmp12 = tl.load(in_ptr6 + (x1), None)
    tmp15 = tl.load(in_ptr7 + (x1), None)
    tmp20 = tl.load(in_ptr8 + (x1), None)
    tmp23 = tl.load(in_ptr9 + (x1), None)
    tmp2 = tmp0 + tmp1
    tmp4 = tmp2 + tmp3
    tmp6 = tmp4 + tmp5
    tmp7 = tmp6.to(tl.float32)
    tmp9 = tmp8.to(tl.float32)
    tmp11 = tmp9 - tmp10
    tmp13 = 3.985969387755102e-05
    tmp14 = tmp12 * tmp13
    tmp16 = tmp15 * tmp15
    tmp17 = tmp14 * tmp16
    tmp18 = tmp11 * tmp17
    tmp19 = tmp7 - tmp18
    tmp21 = tmp20 * tmp13
    tmp22 = tmp19 - tmp21
    tmp24 = tmp15 * tmp23
    tmp25 = tmp22 * tmp24
    tmp26 = tmp25.to(tl.float32)
    tl.store(out_ptr1 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp26, None)
def get_args():
    arg_0 = rand_strided((128, 160, 14, 14), (31360, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 160, 14, 14), (31360, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_2 = rand_strided((128, 160, 14, 14), (31360, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_3 = rand_strided((128, 160, 14, 14), (31360, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_4 = rand_strided((128, 160, 14, 14), (31360, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_5 = rand_strided((1, 160, 1, 1), (160, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    arg_6 = rand_strided((160,), (1,), device='cuda:0', dtype=torch.float32)
    arg_7 = rand_strided((160,), (1,), device='cuda:0', dtype=torch.float32)
    arg_8 = rand_strided((160,), (1,), device='cuda:0', dtype=torch.float32)
    arg_9 = rand_strided((160,), (1,), device='cuda:0', dtype=torch.float32)
    arg_10 = rand_strided((128, 160, 14, 14), (31360, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7, arg_8, arg_9, arg_10,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__native_batch_norm_legit_functional_add_convolution_backward_native_batch_norm_backward_92.run(*args, 4014080, grid=grid(4014080), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__native_batch_norm_legit_functional_add_convolution_backward_native_batch_norm_backward_92.benchmark_all_configs(*args, 4014080, grid=grid(4014080))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/af/cafjjp3a3kwtimi2i4kv3zpwxjdfuxdqkq7szn2vypz2ywexppw7.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_574
# NOTE(review): TorchInductor-generated kernel; the triple-quoted string below is
# compiled by Triton and must remain byte-identical.
# Simple dtype cast: copies the 99,840-element (160, 624, 1, 1) conv weight from
# fp16 to fp32 (presumably for a backward computation in higher precision —
# confirm against the caller). Includes the standard self-benchmark harness.
triton_poi_fused__to_copy_93 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[131072], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_93(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 99840
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
    arg_0 = rand_strided((160, 624, 1, 1), (624, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((160, 624, 1, 1), (624, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_93.run(*args, 99840, grid=grid(99840), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_93.benchmark_all_configs(*args, 99840, grid=grid(99840))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/ic/cicgaxe2oqufzttvfjscmbopri2uxa3mejzwrav4jbahpmbb4rby.py
# Original ATen: aten.mul, aten.sigmoid, aten.sigmoid_backward, aten.silu, aten.sum
# aten.mul => mul_768
# aten.sigmoid => sigmoid_35
# aten.sigmoid_backward => convert_element_type_575, convert_element_type_576, convert_element_type_577, mul_770, mul_771, sub_178
# aten.silu => convert_element_type_223, convert_element_type_224, mul_278, sigmoid_33
# aten.sum => sum_69
# NOTE(review): TorchInductor-generated kernel; the triple-quoted string below is
# compiled by Triton and must remain byte-identical.
# Persistent reduction over 79,872 rows (128 batch x 624 channels), 196 spatial
# positions each: recomputes SiLU (x * sigmoid(x)) of the saved activation
# (in_ptr1), multiplies by the incoming gradient (in_ptr0), sums over the spatial
# axis, then applies sigmoid-backward s*(1-s) of the per-(batch, channel) gate
# value in_ptr2. Result is written fp16 in-place into in_out_ptr0 (note
# mutated_arg_names). Includes the standard self-benchmark harness.
triton_per_fused_mul_sigmoid_sigmoid_backward_silu_sum_94 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import persistent_reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@persistent_reduction(
    size_hints=[131072, 256],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp16', 4: 'i32', 5: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4), equal_to_1=())]}
)
@triton.jit
def triton_per_fused_mul_sigmoid_sigmoid_backward_silu_sum_94(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, xnumel, rnumel, XBLOCK : tl.constexpr):
    xnumel = 79872
    rnumel = 196
    RBLOCK: tl.constexpr = 256
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rindex = tl.arange(0, RBLOCK)[None, :]
    rmask = rindex < rnumel
    r1 = rindex
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (r1 + (196*x0)), rmask, other=0).to(tl.float32)
    tmp1 = tl.load(in_ptr1 + (r1 + (196*x0)), rmask, other=0).to(tl.float32)
    tmp11 = tl.load(in_ptr2 + (x0), None).to(tl.float32)
    tmp2 = tmp1.to(tl.float32)
    tmp3 = tl.sigmoid(tmp2)
    tmp4 = tmp2 * tmp3
    tmp5 = tmp4.to(tl.float32)
    tmp6 = tmp0 * tmp5
    tmp8 = tl.where(rmask, tmp6, 0)
    tmp9 = tl.sum(tmp8, 1)[:, None]
    tmp10 = tmp9.to(tl.float32)
    tmp12 = tl.sigmoid(tmp11)
    tmp13 = tmp12.to(tl.float32)
    tmp14 = 1.0
    tmp15 = tmp14 - tmp13
    tmp16 = tmp13 * tmp15
    tmp17 = tmp10 * tmp16
    tmp18 = tmp17.to(tl.float32)
    tl.store(in_out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp18, None)
def get_args():
    arg_0 = rand_strided((128, 624, 1, 1), (624, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_2 = rand_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_3 = rand_strided((128, 624, 1, 1), (624, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1, arg_2, arg_3,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_per_fused_mul_sigmoid_sigmoid_backward_silu_sum_94.run(*args, 79872, 196, grid=grid(79872), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_per_fused_mul_sigmoid_sigmoid_backward_silu_sum_94.benchmark_all_configs(*args, 79872, 196, grid=grid(79872))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/sw/cswrgzqc55nc76ttfypz7ruvqwdkjksusjytknavxbcop6g2otp2.py
# Original ATen: aten._to_copy, aten.convolution_backward
# aten._to_copy => convert_element_type_579
# aten.convolution_backward => sum_70
# NOTE(review): TorchInductor-generated kernel; the triple-quoted string below is
# compiled by Triton and must remain byte-identical.
# Bias-gradient reduction: sums a (128, 624) fp16 tensor over the batch axis
# (r1 strides by 624, so each of the 624 output channels sums its 128 batch
# entries) and writes the fp32 (624,) result. Includes the standard
# self-benchmark harness.
triton_per_fused__to_copy_convolution_backward_95 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import persistent_reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@persistent_reduction(
    size_hints=[1024, 128],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32', 3: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3), equal_to_1=())]}
)
@triton.jit
def triton_per_fused__to_copy_convolution_backward_95(in_ptr0, out_ptr1, xnumel, rnumel, XBLOCK : tl.constexpr):
    xnumel = 624
    rnumel = 128
    RBLOCK: tl.constexpr = 128
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rindex = tl.arange(0, RBLOCK)[None, :]
    rmask = rindex < rnumel
    r1 = rindex
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0 + (624*r1)), rmask & xmask, other=0).to(tl.float32)
    tmp2 = tl.where(rmask & xmask, tmp0, 0)
    tmp3 = tl.sum(tmp2, 1)[:, None]
    tmp4 = tmp3.to(tl.float32)
    tl.store(out_ptr1 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp4, xmask)
def get_args():
    arg_0 = rand_strided((128, 624, 1, 1), (624, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((624,), (1,), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_per_fused__to_copy_convolution_backward_95.run(*args, 624, 128, grid=grid(624), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_per_fused__to_copy_convolution_backward_95.benchmark_all_configs(*args, 624, 128, grid=grid(624))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/rk/crkgwzo3s6rzbbnalsvhwyanngv4n2zgzifhbhpcnc5ysqffl6ac.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_578
# NOTE(review): TorchInductor-generated kernel; the triple-quoted string below is
# compiled by Triton and must remain byte-identical.
# Simple dtype cast: copies the 32,448-element (624, 52, 1, 1) conv weight from
# fp16 to fp32. Includes the standard self-benchmark harness.
triton_poi_fused__to_copy_96 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[32768], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_96(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 32448
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
    arg_0 = rand_strided((624, 52, 1, 1), (52, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((624, 52, 1, 1), (52, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_96.run(*args, 32448, grid=grid(32448), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_96.benchmark_all_configs(*args, 32448, grid=grid(32448))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/wf/cwfpzmbbmgs237i4rn23twa6vqls763iyjrp76njdwwsj3et2uaf.py
# Original ATen: aten.add, aten.clone, aten.fill, aten.mul, aten.sigmoid, aten.sub
# aten.add => add_338
# aten.clone => clone_26
# aten.fill => full_like_21
# aten.mul => mul_772, mul_773, mul_774
# aten.sigmoid => sigmoid_85
# aten.sub => sub_179
# NOTE(review): TorchInductor-generated kernel; the triple-quoted string below is
# compiled by Triton and must remain byte-identical.
# In-place SiLU (swish) backward over 6,656 elements (128 x 52): computes
# grad *= sigmoid(x) * (1 + x * (1 - sigmoid(x))), i.e. the derivative of
# x*sigmoid(x) w.r.t. the saved pre-activation x (in_ptr0), scaling the gradient
# already stored in in_out_ptr0. Includes the standard self-benchmark harness.
triton_poi_fused_add_clone_fill_mul_sigmoid_sub_97 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[8192], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_add_clone_fill_mul_sigmoid_sub_97(in_out_ptr0, in_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 6656
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_out_ptr0 + (x0), xmask).to(tl.float32)
    tmp1 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
    tmp2 = tl.sigmoid(tmp1)
    tmp3 = 1.0
    tmp4 = tmp3 - tmp2
    tmp5 = tmp1 * tmp4
    tmp6 = tmp5 + tmp3
    tmp7 = tmp2 * tmp6
    tmp8 = tmp0 * tmp7
    tl.store(in_out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp8, xmask)
def get_args():
    arg_0 = rand_strided((128, 52, 1, 1), (52, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 52, 1, 1), (52, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused_add_clone_fill_mul_sigmoid_sub_97.run(*args, 6656, grid=grid(6656), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused_add_clone_fill_mul_sigmoid_sub_97.benchmark_all_configs(*args, 6656, grid=grid(6656))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/xt/cxt55nhjkmnbxqvfatybarolu6uxihwoo3cct34gda356pn7scly.py
# Original ATen: aten._to_copy, aten.convolution_backward
# aten._to_copy => convert_element_type_581
# aten.convolution_backward => sum_71
# NOTE(review): TorchInductor-generated kernel; the triple-quoted string below is
# compiled by Triton and must remain byte-identical.
# Bias-gradient reduction: sums a (128, 52) fp16 tensor over the batch axis
# (r1 strides by 52, one column of 128 entries per output channel) and writes
# the fp32 (52,) result. Same shape of harness as the sibling kernel _95.
triton_per_fused__to_copy_convolution_backward_98 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import persistent_reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@persistent_reduction(
    size_hints=[64, 128],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32', 3: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 3), equal_to_1=())]}
)
@triton.jit
def triton_per_fused__to_copy_convolution_backward_98(in_ptr0, out_ptr1, xnumel, rnumel, XBLOCK : tl.constexpr):
    xnumel = 52
    rnumel = 128
    RBLOCK: tl.constexpr = 128
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rindex = tl.arange(0, RBLOCK)[None, :]
    rmask = rindex < rnumel
    r1 = rindex
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0 + (52*r1)), rmask & xmask, other=0).to(tl.float32)
    tmp2 = tl.where(rmask & xmask, tmp0, 0)
    tmp3 = tl.sum(tmp2, 1)[:, None]
    tmp4 = tmp3.to(tl.float32)
    tl.store(out_ptr1 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp4, xmask)
def get_args():
    arg_0 = rand_strided((128, 52, 1, 1), (52, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((52,), (1,), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_per_fused__to_copy_convolution_backward_98.run(*args, 52, 128, grid=grid(52), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_per_fused__to_copy_convolution_backward_98.benchmark_all_configs(*args, 52, 128, grid=grid(52))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/tw/ctwfvbwslvyki3iv4byjhnpgwt4ujawez7njntlfraulnuse3ddu.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_580
# NOTE(review): TorchInductor-generated kernel; the triple-quoted string below is
# compiled by Triton and must remain byte-identical.
# Simple dtype cast: copies the 32,448-element (52, 624, 1, 1) conv weight from
# fp16 to fp32. Includes the standard self-benchmark harness.
triton_poi_fused__to_copy_99 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[32768], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_99(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 32448
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
    arg_0 = rand_strided((52, 624, 1, 1), (624, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((52, 624, 1, 1), (624, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_99.run(*args, 32448, grid=grid(32448), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_99.benchmark_all_configs(*args, 32448, grid=grid(32448))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/6w/c6wzolzrfmxblwjxnpxkgqezmro2wot3nkz62hajpunjmpkn6enk.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.add, aten.clone, aten.div, aten.fill, aten.mul, aten.native_batch_norm_backward, aten.sigmoid, aten.sub
# aten._native_batch_norm_legit_functional => convert_element_type_221
# aten.add => add_339, add_340
# aten.clone => clone_25
# aten.div => div_8
# aten.fill => full_like_22
# aten.mul => mul_769, mul_775, mul_776, mul_777
# aten.native_batch_norm_backward => convert_element_type_582, mul_778, mul_786, sub_181, sum_72, sum_73
# aten.sigmoid => sigmoid_35, sigmoid_86
# aten.sub => sub_180
# NOTE(review): TorchInductor-generated kernel; the triple-quoted string below is
# compiled by Triton and must remain byte-identical.
# Looped reduction, one program per channel (624), reducing over 25,088 elements
# (128 batch x 196 spatial): recomputes the upstream gradient through the SE
# block (grad * sigmoid(gate) + broadcast avg-pool grad / 196) and the SiLU
# derivative, then accumulates two per-channel sums for batch-norm backward:
# sum(grad) -> out_ptr0 and sum(grad * (x - mean)) -> out_ptr1 (mean from
# in_ptr5). out_ptr2 is out_ptr1 scaled by the per-channel factor in_ptr6.
# Includes the standard self-benchmark harness.
triton_red_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_100 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@reduction(
    size_hints=[1024, 32768],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp16', 4: '*fp16', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: '*fp32', 9: '*fp32', 10: 'i32', 11: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_100(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, in_ptr6, out_ptr0, out_ptr1, out_ptr2, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
    xnumel = 624
    rnumel = 25088
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rbase = tl.arange(0, RBLOCK)[None, :]
    x0 = xindex
    _tmp17 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
    tmp20 = tl.load(in_ptr5 + (x0), xmask)
    _tmp23 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
    for roffset in range(0, rnumel, RBLOCK):
        rindex = roffset + rbase
        rmask = rindex < rnumel
        r1 = rindex % 196
        r2 = (rindex // 196)
        tmp0 = tl.load(in_ptr0 + (r1 + (196*x0) + (122304*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp1 = tl.load(in_ptr1 + (x0 + (624*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp4 = tl.load(in_ptr2 + (x0 + (624*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp8 = tl.load(in_ptr3 + (r1 + (196*x0) + (122304*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp18 = tl.load(in_ptr4 + (r1 + (196*x0) + (122304*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp2 = tl.sigmoid(tmp1)
        tmp3 = tmp0 * tmp2
        tmp5 = 196.0
        tmp6 = tmp4 / tmp5
        tmp7 = tmp3 + tmp6
        tmp9 = tl.sigmoid(tmp8)
        tmp10 = 1.0
        tmp11 = tmp10 - tmp9
        tmp12 = tmp8 * tmp11
        tmp13 = tmp12 + tmp10
        tmp14 = tmp9 * tmp13
        tmp15 = tmp7 * tmp14
        tmp16 = tmp15.to(tl.float32)
        _tmp17 = tl.where(rmask & xmask, _tmp17 + tmp16, _tmp17)
        tmp19 = tmp18.to(tl.float32)
        tmp21 = tmp19 - tmp20
        tmp22 = tmp16 * tmp21
        _tmp23 = tl.where(rmask & xmask, _tmp23 + tmp22, _tmp23)
    tmp17 = tl.sum(_tmp17, 1)[:, None]
    tl.store(out_ptr0 + x0, tmp17, xmask)
    tmp23 = tl.sum(_tmp23, 1)[:, None]
    tl.store(out_ptr1 + x0, tmp23, xmask)
    tmp24 = tl.load(in_ptr6 + (x0), xmask)
    tmp25 = tmp23 * tmp24
    tl.store(out_ptr2 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp25, xmask)
def get_args():
    arg_0 = rand_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 624, 1, 1), (624, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    arg_2 = rand_strided((128, 624, 1, 1), (624, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    arg_3 = rand_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_4 = rand_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_5 = rand_strided((1, 624, 1, 1), (624, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    arg_6 = rand_strided((624,), (1,), device='cuda:0', dtype=torch.float32)
    arg_7 = rand_strided((624,), (1,), device='cuda:0', dtype=torch.float32)
    arg_8 = rand_strided((624,), (1,), device='cuda:0', dtype=torch.float32)
    arg_9 = rand_strided((624,), (1,), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7, arg_8, arg_9,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_red_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_100.run(*args, 624, 25088, grid=grid(624), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_red_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_100.benchmark_all_configs(*args, 624, 25088, grid=grid(624))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/d3/cd3muetrknv5a272355llgd7oce4qcl4nznkklo4quwqquswhe2k.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.add, aten.clone, aten.convolution_backward, aten.div, aten.fill, aten.mul, aten.native_batch_norm_backward, aten.sigmoid, aten.sub
# aten._native_batch_norm_legit_functional => convert_element_type_221
# aten.add => add_339, add_340
# aten.clone => clone_25
# aten.convolution_backward => convolution_backward_69
# aten.div => div_8
# aten.fill => full_like_22
# aten.mul => mul_769, mul_775, mul_776, mul_777
# aten.native_batch_norm_backward => convert_element_type_582, convert_element_type_584, mul_784, mul_785, sub_181, sub_183, sub_184
# aten.sigmoid => sigmoid_35, sigmoid_86
# aten.sub => sub_180
#
# NOTE(review): Inductor-generated pointwise kernel; do not hand-edit the source
# string below — it is compiled as-is by async_compile.triton.
# Fuses, per element of a (128, 624, 14, 14) fp16 tensor
# (xnumel = 128*624*196 = 15654912):
#   * the SE-style gate backward: grad * sigmoid(gate) + broadcast_grad / 196
#     (x4 indexes the per-(n, c) gate of shape (128, 624, 1, 1)),
#   * the SiLU/Swish backward factor sigmoid(x) * (1 + x * (1 - sigmoid(x))),
#   * batch-norm backward normalization using per-channel (x1) fp32 stats;
#     3.985969387755102e-05 == 1/25088 == 1/(128*196), the per-channel
#     reduction element count.
# Reads 10 input pointers, writes one fp16 output (out_ptr1); the channel-dim
# loads use mask None because xnumel is divisible by the launch block size.
# The trailing get_args/call/benchmark code in the string is a standalone
# benchmark harness used when the generated file is run directly.
triton_poi_fused__native_batch_norm_legit_functional_add_clone_convolution_backward_div_fill_mul_native_batch_norm_backward_sigmoid_sub_101 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[16777216], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp16', 4: '*fp16', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: '*fp32', 9: '*fp32', 10: '*fp16', 11: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11), equal_to_1=())]})
@triton.jit
def triton_poi_fused__native_batch_norm_legit_functional_add_clone_convolution_backward_div_fill_mul_native_batch_norm_backward_sigmoid_sub_101(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, in_ptr6, in_ptr7, in_ptr8, in_ptr9, out_ptr1, xnumel, XBLOCK : tl.constexpr):
    xnumel = 15654912
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x3 = xindex
    x4 = (xindex // 196)
    x1 = (xindex // 196) % 624
    tmp0 = tl.load(in_ptr0 + (x3), None).to(tl.float32)
    tmp1 = tl.load(in_ptr1 + (x4), None).to(tl.float32)
    tmp4 = tl.load(in_ptr2 + (x4), None).to(tl.float32)
    tmp8 = tl.load(in_ptr3 + (x3), None).to(tl.float32)
    tmp17 = tl.load(in_ptr4 + (x3), None).to(tl.float32)
    tmp19 = tl.load(in_ptr5 + (x1), None)
    tmp21 = tl.load(in_ptr6 + (x1), None)
    tmp24 = tl.load(in_ptr7 + (x1), None)
    tmp29 = tl.load(in_ptr8 + (x1), None)
    tmp32 = tl.load(in_ptr9 + (x1), None)
    tmp2 = tl.sigmoid(tmp1)
    tmp3 = tmp0 * tmp2
    tmp5 = 196.0
    tmp6 = tmp4 / tmp5
    tmp7 = tmp3 + tmp6
    tmp9 = tl.sigmoid(tmp8)
    tmp10 = 1.0
    tmp11 = tmp10 - tmp9
    tmp12 = tmp8 * tmp11
    tmp13 = tmp12 + tmp10
    tmp14 = tmp9 * tmp13
    tmp15 = tmp7 * tmp14
    tmp16 = tmp15.to(tl.float32)
    tmp18 = tmp17.to(tl.float32)
    tmp20 = tmp18 - tmp19
    tmp22 = 3.985969387755102e-05
    tmp23 = tmp21 * tmp22
    tmp25 = tmp24 * tmp24
    tmp26 = tmp23 * tmp25
    tmp27 = tmp20 * tmp26
    tmp28 = tmp16 - tmp27
    tmp30 = tmp29 * tmp22
    tmp31 = tmp28 - tmp30
    tmp33 = tmp24 * tmp32
    tmp34 = tmp31 * tmp33
    tmp35 = tmp34.to(tl.float32)
    tl.store(out_ptr1 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp35, None)
def get_args():
    arg_0 = rand_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 624, 1, 1), (624, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    arg_2 = rand_strided((128, 624, 1, 1), (624, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    arg_3 = rand_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_4 = rand_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_5 = rand_strided((1, 624, 1, 1), (624, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    arg_6 = rand_strided((624,), (1,), device='cuda:0', dtype=torch.float32)
    arg_7 = rand_strided((624,), (1,), device='cuda:0', dtype=torch.float32)
    arg_8 = rand_strided((624,), (1,), device='cuda:0', dtype=torch.float32)
    arg_9 = rand_strided((624,), (1,), device='cuda:0', dtype=torch.float32)
    arg_10 = rand_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7, arg_8, arg_9, arg_10,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__native_batch_norm_legit_functional_add_clone_convolution_backward_div_fill_mul_native_batch_norm_backward_sigmoid_sub_101.run(*args, 15654912, grid=grid(15654912), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__native_batch_norm_legit_functional_add_clone_convolution_backward_div_fill_mul_native_batch_norm_backward_sigmoid_sub_101.benchmark_all_configs(*args, 15654912, grid=grid(15654912))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/23/c23cf4zcm2zsqnix7cwevl5uqnq6e7k4nrleu5hxvvucflqj7ztk.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_585
#
# NOTE(review): Inductor-generated dtype-conversion kernel; the source string
# below is compiled as-is — do not hand-edit it.
# Copies a contiguous fp16 buffer of 5616 elements (a (624, 1, 3, 3)
# depthwise-conv weight per get_args) into an fp32 output. The double
# `.to(tl.float32)` is a generated-code artifact of Triton's fp16-load
# convention followed by the explicit _to_copy cast; it is benign.
triton_poi_fused__to_copy_102 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[8192], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_102(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 5616
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
    arg_0 = rand_strided((624, 1, 3, 3), (9, 9, 3, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((624, 1, 3, 3), (9, 9, 3, 1), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_102.run(*args, 5616, grid=grid(5616), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_102.benchmark_all_configs(*args, 5616, grid=grid(5616))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/xe/cxe5wowu7imk523mjj4gviz3cjwfkjjg7yenewa23cv4xlyi265g.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.mul, aten.native_batch_norm_backward
# aten._native_batch_norm_legit_functional => convert_element_type_216
# aten.mul => mul_789
# aten.native_batch_norm_backward => convert_element_type_586, mul_790, mul_798, sub_186, sum_74, sum_75
#
# NOTE(review): Inductor-generated reduction kernel; compiled from the string
# as-is — do not hand-edit it.
# Per channel x0 in [0, 624) it reduces over rnumel = 25088 = 128*196
# (batch x spatial) elements of (128, 624, 14, 14) fp16 inputs:
#   out_ptr0[c] = sum(grad * other)                       (grad_bias term)
#   out_ptr1[c] = sum(grad * other * (x - mean[c]))       (grad_weight term)
#   out_ptr2[c] = out_ptr1[c] * invstd[c]                 (scaled grad_weight)
# in_ptr3 is the per-channel fp32 mean ((1, 624, 1, 1)), in_ptr4 the
# per-channel fp32 scale loaded after the loop. Accumulation is in fp32.
triton_red_fused__native_batch_norm_legit_functional_mul_native_batch_norm_backward_103 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@reduction(
    size_hints=[1024, 32768],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp32', 4: '*fp32', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: 'i32', 9: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_mul_native_batch_norm_backward_103(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, out_ptr0, out_ptr1, out_ptr2, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
    xnumel = 624
    rnumel = 25088
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rbase = tl.arange(0, RBLOCK)[None, :]
    x0 = xindex
    _tmp4 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
    tmp7 = tl.load(in_ptr3 + (x0), xmask)
    _tmp10 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
    for roffset in range(0, rnumel, RBLOCK):
        rindex = roffset + rbase
        rmask = rindex < rnumel
        r1 = rindex % 196
        r2 = (rindex // 196)
        tmp0 = tl.load(in_ptr0 + (r1 + (196*x0) + (122304*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp1 = tl.load(in_ptr1 + (r1 + (196*x0) + (122304*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp5 = tl.load(in_ptr2 + (r1 + (196*x0) + (122304*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp2 = tmp0 * tmp1
        tmp3 = tmp2.to(tl.float32)
        _tmp4 = tl.where(rmask & xmask, _tmp4 + tmp3, _tmp4)
        tmp6 = tmp5.to(tl.float32)
        tmp8 = tmp6 - tmp7
        tmp9 = tmp3 * tmp8
        _tmp10 = tl.where(rmask & xmask, _tmp10 + tmp9, _tmp10)
    tmp4 = tl.sum(_tmp4, 1)[:, None]
    tl.store(out_ptr0 + x0, tmp4, xmask)
    tmp10 = tl.sum(_tmp10, 1)[:, None]
    tl.store(out_ptr1 + x0, tmp10, xmask)
    tmp11 = tl.load(in_ptr4 + (x0), xmask)
    tmp12 = tmp10 * tmp11
    tl.store(out_ptr2 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp12, xmask)
def get_args():
    arg_0 = rand_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_2 = rand_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_3 = rand_strided((1, 624, 1, 1), (624, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    arg_4 = rand_strided((624,), (1,), device='cuda:0', dtype=torch.float32)
    arg_5 = rand_strided((624,), (1,), device='cuda:0', dtype=torch.float32)
    arg_6 = rand_strided((624,), (1,), device='cuda:0', dtype=torch.float32)
    arg_7 = rand_strided((624,), (1,), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_red_fused__native_batch_norm_legit_functional_mul_native_batch_norm_backward_103.run(*args, 624, 25088, grid=grid(624), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_red_fused__native_batch_norm_legit_functional_mul_native_batch_norm_backward_103.benchmark_all_configs(*args, 624, 25088, grid=grid(624))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/n6/cn6vgxz3swb2xjokxek7wjtmjhdz6hdzplejuatnteeyvplenl3a.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.convolution_backward, aten.mul, aten.native_batch_norm_backward
# aten._native_batch_norm_legit_functional => convert_element_type_216
# aten.convolution_backward => convolution_backward_70
# aten.mul => mul_789
# aten.native_batch_norm_backward => convert_element_type_586, convert_element_type_588, mul_796, mul_797, sub_186, sub_188, sub_189
#
# NOTE(review): Inductor-generated pointwise kernel; compiled from the string
# as-is — do not hand-edit it.
# In-place (in_out_ptr0 is both read and written, hence
# mutated_arg_names=['in_out_ptr0'] and num_in_out_args=1 in the benchmark):
# multiplies grad by a mask/gate tensor (in_ptr0), then applies the
# batch-norm backward normalization per channel x1 in [0, 624), writing the
# fp16 grad-input back over the input buffer.
# 3.985969387755102e-05 == 1/25088 == 1/(128*196), the per-channel count.
triton_poi_fused__native_batch_norm_legit_functional_convolution_backward_mul_native_batch_norm_backward_104 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[16777216], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp32', 4: '*fp32', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8), equal_to_1=())]})
@triton.jit
def triton_poi_fused__native_batch_norm_legit_functional_convolution_backward_mul_native_batch_norm_backward_104(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, in_ptr6, xnumel, XBLOCK : tl.constexpr):
    xnumel = 15654912
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x3 = xindex
    x1 = (xindex // 196) % 624
    tmp0 = tl.load(in_out_ptr0 + (x3), None).to(tl.float32)
    tmp1 = tl.load(in_ptr0 + (x3), None).to(tl.float32)
    tmp4 = tl.load(in_ptr1 + (x3), None).to(tl.float32)
    tmp6 = tl.load(in_ptr2 + (x1), None)
    tmp8 = tl.load(in_ptr3 + (x1), None)
    tmp11 = tl.load(in_ptr4 + (x1), None)
    tmp16 = tl.load(in_ptr5 + (x1), None)
    tmp19 = tl.load(in_ptr6 + (x1), None)
    tmp2 = tmp0 * tmp1
    tmp3 = tmp2.to(tl.float32)
    tmp5 = tmp4.to(tl.float32)
    tmp7 = tmp5 - tmp6
    tmp9 = 3.985969387755102e-05
    tmp10 = tmp8 * tmp9
    tmp12 = tmp11 * tmp11
    tmp13 = tmp10 * tmp12
    tmp14 = tmp7 * tmp13
    tmp15 = tmp3 - tmp14
    tmp17 = tmp16 * tmp9
    tmp18 = tmp15 - tmp17
    tmp20 = tmp11 * tmp19
    tmp21 = tmp18 * tmp20
    tmp22 = tmp21.to(tl.float32)
    tl.store(in_out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp22, None)
def get_args():
    arg_0 = rand_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_2 = rand_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_3 = rand_strided((1, 624, 1, 1), (624, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    arg_4 = rand_strided((624,), (1,), device='cuda:0', dtype=torch.float32)
    arg_5 = rand_strided((624,), (1,), device='cuda:0', dtype=torch.float32)
    arg_6 = rand_strided((624,), (1,), device='cuda:0', dtype=torch.float32)
    arg_7 = rand_strided((624,), (1,), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__native_batch_norm_legit_functional_convolution_backward_mul_native_batch_norm_backward_104.run(*args, 15654912, grid=grid(15654912), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__native_batch_norm_legit_functional_convolution_backward_mul_native_batch_norm_backward_104.benchmark_all_configs(*args, 15654912, grid=grid(15654912))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/iv/civ2oft43774ffyg3v5kh23rmteh5qvbuafa2jrfla6exeuvk3sq.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_589
#
# NOTE(review): Inductor-generated dtype-conversion kernel; compiled from the
# string as-is — do not hand-edit it.
# Same pattern as triton_poi_fused__to_copy_102: fp16 -> fp32 copy of a
# contiguous buffer, here 64896 = 624*104 elements (a (624, 104, 1, 1)
# 1x1-conv weight per get_args).
triton_poi_fused__to_copy_105 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[65536], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_105(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 64896
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
    arg_0 = rand_strided((624, 104, 1, 1), (104, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((624, 104, 1, 1), (104, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_105.run(*args, 64896, grid=grid(64896), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_105.benchmark_all_configs(*args, 64896, grid=grid(64896))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/j7/cj7ui4cmg7zcaen4xj4vc2j54wf3j5vlszxmshcee37dzcpqxdcd.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.native_batch_norm_backward
# aten._native_batch_norm_legit_functional => convert_element_type_213
# aten.native_batch_norm_backward => convert_element_type_590, mul_799, sub_190, sum_76, sum_77
#
# NOTE(review): Inductor-generated split reduction; compiled from the string
# as-is — do not hand-edit it.
# First stage of a two-stage batch-norm backward reduction over a
# (128, 104, 14, 14) fp16 grad/input pair: xnumel = 416 = 104 channels x 4
# batch-splits (x0 = channel, x1 = split index), each split reducing
# rnumel = 6272 = 32*196 elements. Produces per-(channel, split) partials:
#   out_ptr0 = sum(grad), out_ptr1 = sum(grad * (x - mean[c]))
# stored as (104, 4) fp32; kernels 107/108 below combine the 4 partials.
triton_red_fused__native_batch_norm_legit_functional_native_batch_norm_backward_106 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@reduction(
    size_hints=[512, 8192],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: 'i32', 6: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_native_batch_norm_backward_106(in_ptr0, in_ptr1, in_ptr2, out_ptr0, out_ptr1, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
    xnumel = 416
    rnumel = 6272
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rbase = tl.arange(0, RBLOCK)[None, :]
    x0 = xindex % 104
    x1 = (xindex // 104)
    _tmp2 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
    x3 = xindex
    tmp5 = tl.load(in_ptr2 + (x0), xmask)
    _tmp8 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
    for roffset in range(0, rnumel, RBLOCK):
        rindex = roffset + rbase
        rmask = rindex < rnumel
        r2 = rindex
        tmp0 = tl.load(in_ptr0 + ((196*x0) + (20384*(r2 // 196)) + (652288*x1) + (r2 % 196)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp3 = tl.load(in_ptr1 + ((196*x0) + (20384*(r2 // 196)) + (652288*x1) + (r2 % 196)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp1 = tmp0.to(tl.float32)
        _tmp2 = tl.where(rmask & xmask, _tmp2 + tmp1, _tmp2)
        tmp4 = tmp3.to(tl.float32)
        tmp6 = tmp4 - tmp5
        tmp7 = tmp1 * tmp6
        _tmp8 = tl.where(rmask & xmask, _tmp8 + tmp7, _tmp8)
    tmp2 = tl.sum(_tmp2, 1)[:, None]
    tl.store(out_ptr0 + x3, tmp2, xmask)
    tmp8 = tl.sum(_tmp8, 1)[:, None]
    tl.store(out_ptr1 + x3, tmp8, xmask)
def get_args():
    arg_0 = rand_strided((128, 104, 14, 14), (20384, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 104, 14, 14), (20384, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_2 = rand_strided((1, 104, 1, 1), (104, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    arg_3 = rand_strided((104, 4), (1, 104), device='cuda:0', dtype=torch.float32)
    arg_4 = rand_strided((104, 4), (1, 104), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1, arg_2, arg_3, arg_4,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_red_fused__native_batch_norm_legit_functional_native_batch_norm_backward_106.run(*args, 416, 6272, grid=grid(416), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_red_fused__native_batch_norm_legit_functional_native_batch_norm_backward_106.benchmark_all_configs(*args, 416, 6272, grid=grid(416))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/at/catlmgmveca6eegfxgggc25rzm3umawxz52w4d3665iwc5nkqdhv.py
# Original ATen: aten.native_batch_norm_backward
# aten.native_batch_norm_backward => convert_element_type_590, sum_76
#
# NOTE(review): Inductor-generated persistent reduction; compiled from the
# string as-is — do not hand-edit it.
# Second stage of the split reduction started in kernel 106: sums the 4
# per-channel partials of the (104, 4) fp32 buffer (strides (1, 104), i.e.
# partials for one channel are 104 apart) into a (104,) fp32 grad_bias sum.
triton_per_fused_native_batch_norm_backward_107 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import persistent_reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@persistent_reduction(
    size_hints=[128, 4],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*fp32', 1: '*fp32', 2: 'i32', 3: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1), equal_to_1=())]}
)
@triton.jit
def triton_per_fused_native_batch_norm_backward_107(in_ptr0, out_ptr0, xnumel, rnumel, XBLOCK : tl.constexpr):
    xnumel = 104
    rnumel = 4
    RBLOCK: tl.constexpr = 4
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rindex = tl.arange(0, RBLOCK)[None, :]
    rmask = rindex < rnumel
    r1 = rindex
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0 + (104*r1)), rmask & xmask, other=0)
    tmp2 = tl.where(rmask & xmask, tmp0, 0)
    tmp3 = tl.sum(tmp2, 1)[:, None]
    tl.store(out_ptr0 + x0, tmp3, xmask)
def get_args():
    arg_0 = rand_strided((104, 4), (1, 104), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((104,), (1,), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_per_fused_native_batch_norm_backward_107.run(*args, 104, 4, grid=grid(104), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_per_fused_native_batch_norm_backward_107.benchmark_all_configs(*args, 104, 4, grid=grid(104))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/sw/cswvpkxc2jcvqnphnzcluqix5sdpoevu3y2llapfjncqthtvmuif.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.native_batch_norm_backward
# aten._native_batch_norm_legit_functional => convert_element_type_213
# aten.native_batch_norm_backward => convert_element_type_590, mul_799, mul_807, sub_190, sum_77
#
# NOTE(review): Inductor-generated persistent reduction; compiled from the
# string as-is — do not hand-edit it.
# Like kernel 107, combines the 4 per-channel partials from kernel 106
# ((104, 4) fp32, strides (1, 104)) into out_ptr0[c] = sum of partials,
# and additionally writes out_ptr1[c] = out_ptr0[c] * in_ptr1[c] (per-channel
# fp32 scale, e.g. invstd), producing the scaled grad_weight term.
triton_per_fused__native_batch_norm_legit_functional_native_batch_norm_backward_108 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import persistent_reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@persistent_reduction(
    size_hints=[128, 4],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*fp32', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: 'i32', 5: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3), equal_to_1=())]}
)
@triton.jit
def triton_per_fused__native_batch_norm_legit_functional_native_batch_norm_backward_108(in_ptr0, in_ptr1, out_ptr0, out_ptr1, xnumel, rnumel, XBLOCK : tl.constexpr):
    xnumel = 104
    rnumel = 4
    RBLOCK: tl.constexpr = 4
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rindex = tl.arange(0, RBLOCK)[None, :]
    rmask = rindex < rnumel
    r1 = rindex
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0 + (104*r1)), rmask & xmask, other=0)
    tmp4 = tl.load(in_ptr1 + (x0), xmask)
    tmp2 = tl.where(rmask & xmask, tmp0, 0)
    tmp3 = tl.sum(tmp2, 1)[:, None]
    tmp5 = tmp3 * tmp4
    tl.store(out_ptr1 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp5, xmask)
    tl.store(out_ptr0 + x0, tmp3, xmask)
def get_args():
    arg_0 = rand_strided((104, 4), (1, 104), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((104,), (1,), device='cuda:0', dtype=torch.float32)
    arg_2 = rand_strided((104,), (1,), device='cuda:0', dtype=torch.float32)
    arg_3 = rand_strided((104,), (1,), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1, arg_2, arg_3,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_per_fused__native_batch_norm_legit_functional_native_batch_norm_backward_108.run(*args, 104, 4, grid=grid(104), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_per_fused__native_batch_norm_legit_functional_native_batch_norm_backward_108.benchmark_all_configs(*args, 104, 4, grid=grid(104))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/6d/c6dicjfqwciclb5imlva4adfrsslgvcv4kzchyl4zcmedltgvjfk.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.native_batch_norm_backward
# aten._native_batch_norm_legit_functional => convert_element_type_213
# aten.native_batch_norm_backward => convert_element_type_590, convert_element_type_592, mul_805, mul_806, sub_190, sub_192, sub_193
# Pointwise batch-norm-backward kernel over 2609152 elements
# (= 128*104*14*14, matching the (128, 104, 14, 14) fp16 tensors in
# get_args). Per-channel fp32 statistics are indexed with
# x1 = (i // 196) % 104; the constant 3.985969387755102e-05 equals
# 1/25088 = 1/(128*14*14), i.e. the reciprocal of the per-channel
# reduction size. The embedded module string also defines
# get_args/call/benchmark_all_configs plus a __main__ harness that
# reports ms / GB moved / effective GB/s via triton.testing.do_bench.
triton_poi_fused__native_batch_norm_legit_functional_native_batch_norm_backward_109 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp32', 6: '*fp32', 7: '*fp16', 8: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8), equal_to_1=())]})
@triton.jit
def triton_poi_fused__native_batch_norm_legit_functional_native_batch_norm_backward_109(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, in_ptr6, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 2609152
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x3 = xindex
    x1 = (xindex // 196) % 104
    tmp0 = tl.load(in_ptr0 + (x3), None).to(tl.float32)
    tmp2 = tl.load(in_ptr1 + (x3), None).to(tl.float32)
    tmp4 = tl.load(in_ptr2 + (x1), None)
    tmp6 = tl.load(in_ptr3 + (x1), None)
    tmp9 = tl.load(in_ptr4 + (x1), None)
    tmp14 = tl.load(in_ptr5 + (x1), None)
    tmp17 = tl.load(in_ptr6 + (x1), None)
    tmp1 = tmp0.to(tl.float32)
    tmp3 = tmp2.to(tl.float32)
    tmp5 = tmp3 - tmp4
    tmp7 = 3.985969387755102e-05
    tmp8 = tmp6 * tmp7
    tmp10 = tmp9 * tmp9
    tmp11 = tmp8 * tmp10
    tmp12 = tmp5 * tmp11
    tmp13 = tmp1 - tmp12
    tmp15 = tmp14 * tmp7
    tmp16 = tmp13 - tmp15
    tmp18 = tmp9 * tmp17
    tmp19 = tmp16 * tmp18
    tmp20 = tmp19.to(tl.float32)
    tl.store(out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp20, None)
def get_args():
    arg_0 = rand_strided((128, 104, 14, 14), (20384, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 104, 14, 14), (20384, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_2 = rand_strided((1, 104, 1, 1), (104, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    arg_3 = rand_strided((104,), (1,), device='cuda:0', dtype=torch.float32)
    arg_4 = rand_strided((104,), (1,), device='cuda:0', dtype=torch.float32)
    arg_5 = rand_strided((104,), (1,), device='cuda:0', dtype=torch.float32)
    arg_6 = rand_strided((104,), (1,), device='cuda:0', dtype=torch.float32)
    arg_7 = rand_strided((128, 104, 14, 14), (20384, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__native_batch_norm_legit_functional_native_batch_norm_backward_109.run(*args, 2609152, grid=grid(2609152), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__native_batch_norm_legit_functional_native_batch_norm_backward_109.benchmark_all_configs(*args, 2609152, grid=grid(2609152))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/u6/cu6gj4lj3w4o4rbwgegqtcfzcryyabv5bfn4fymj6h576kucjrrk.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_593
# Pointwise fp16 -> fp32 cast of 16224 contiguous elements (= 52*312,
# the (52, 312, 1, 1) convolution weight shown in get_args). xmask
# guards the tail block since 16224 need not divide XBLOCK evenly.
triton_poi_fused__to_copy_110 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[16384], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_110(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 16224
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
    arg_0 = rand_strided((52, 312, 1, 1), (312, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((52, 312, 1, 1), (312, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_110.run(*args, 16224, grid=grid(16224), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_110.benchmark_all_configs(*args, 16224, grid=grid(16224))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/cq/ccq5bqvz5gysfuffc2onzn5q2itdz5cceokvaxh3r7x3536x7ffr.py
# Original ATen: aten.cat
# aten.cat => cat_57
# Pointwise copy implementing one branch of a concatenation: reads a
# contiguous (128, 312, 14, 14) fp16 tensor (7827456 = 128*61152
# elements) and writes each batch row of 61152 values at offset
# 122304*x1 in the output, whose batch stride 122304 is twice 61152 —
# i.e. this kernel fills the first half of each output row; the other
# half is presumably written by a sibling cat kernel (not in view).
triton_poi_fused_cat_111 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[8388608], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_cat_111(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 7827456
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x2 = xindex
    x0 = xindex % 61152
    x1 = (xindex // 61152)
    tmp0 = tl.load(in_ptr0 + (x2), None).to(tl.float32)
    tl.store(out_ptr0 + (x0 + (122304*x1) + tl.zeros([XBLOCK], tl.int32)), tmp0, None)
def get_args():
    arg_0 = rand_strided((128, 312, 14, 14), (61152, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 312, 14, 14), (122304, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused_cat_111.run(*args, 7827456, grid=grid(7827456), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused_cat_111.benchmark_all_configs(*args, 7827456, grid=grid(7827456))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/wo/cwolgc4dw6izbizjkqhmxp5s3e2v7ufjibf3admpysa3tcfcvbwi.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_598
# Pointwise fp16 -> fp32 cast of 16224 contiguous elements (= 624*26,
# the (624, 26, 1, 1) convolution weight shown in get_args); same
# shape/logic as kernel 110 but generated for a different weight.
triton_poi_fused__to_copy_112 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[16384], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_112(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 16224
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
    arg_0 = rand_strided((624, 26, 1, 1), (26, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((624, 26, 1, 1), (26, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_112.run(*args, 16224, grid=grid(16224), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_112.benchmark_all_configs(*args, 16224, grid=grid(16224))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/zs/czsqkpgzlfpo7vf22txwb4vktp6li35xdtdplz23vaooelx52mgz.py
# Original ATen: aten.add, aten.clone, aten.fill, aten.mul, aten.sigmoid, aten.sub
# aten.add => add_342
# aten.clone => clone_23
# aten.fill => full_like_24
# aten.mul => mul_812, mul_813, mul_814
# aten.sigmoid => sigmoid_88
# aten.sub => sub_195
# In-place pointwise kernel over 3328 elements (= 128*26, the
# (128, 26, 1, 1) fp16 tensors in get_args). Computes
#   out = g * sigmoid(p) * (1 + p * (1 - sigmoid(p)))
# with g read from (and written back to) in_out_ptr0 and p from
# in_ptr0 — algebraically the derivative of p*sigmoid(p) (SiLU/Swish)
# applied to the incoming gradient. mutated_arg_names marks
# in_out_ptr0 as read-modify-write.
triton_poi_fused_add_clone_fill_mul_sigmoid_sub_113 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4096], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_add_clone_fill_mul_sigmoid_sub_113(in_out_ptr0, in_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 3328
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_out_ptr0 + (x0), xmask).to(tl.float32)
    tmp1 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
    tmp2 = tl.sigmoid(tmp1)
    tmp3 = 1.0
    tmp4 = tmp3 - tmp2
    tmp5 = tmp1 * tmp4
    tmp6 = tmp5 + tmp3
    tmp7 = tmp2 * tmp6
    tmp8 = tmp0 * tmp7
    tl.store(in_out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp8, xmask)
def get_args():
    arg_0 = rand_strided((128, 26, 1, 1), (26, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 26, 1, 1), (26, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused_add_clone_fill_mul_sigmoid_sub_113.run(*args, 3328, grid=grid(3328), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused_add_clone_fill_mul_sigmoid_sub_113.benchmark_all_configs(*args, 3328, grid=grid(3328))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/br/cbrdby4wbj6lkywp7ihnm6qs7bmx7qo2e45i7oxtqq4xbl3oog4o.py
# Original ATen: aten._to_copy, aten.convolution_backward
# aten._to_copy => convert_element_type_601
# aten.convolution_backward => sum_80
# Persistent reduction: for each of 26 channels, sums the fp16 values
# across the 128-element batch dimension of a (128, 26, 1, 1) tensor
# (load at x0 + 26*r1) and stores the fp32 result into a (26,) output —
# the bias-gradient sum of a convolution backward. RBLOCK == rnumel, so
# the whole reduction happens in a single block pass.
triton_per_fused__to_copy_convolution_backward_114 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import persistent_reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@persistent_reduction(
    size_hints=[32, 128],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32', 3: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 3), equal_to_1=())]}
)
@triton.jit
def triton_per_fused__to_copy_convolution_backward_114(in_ptr0, out_ptr1, xnumel, rnumel, XBLOCK : tl.constexpr):
    xnumel = 26
    rnumel = 128
    RBLOCK: tl.constexpr = 128
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rindex = tl.arange(0, RBLOCK)[None, :]
    rmask = rindex < rnumel
    r1 = rindex
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0 + (26*r1)), rmask & xmask, other=0).to(tl.float32)
    tmp2 = tl.where(rmask & xmask, tmp0, 0)
    tmp3 = tl.sum(tmp2, 1)[:, None]
    tmp4 = tmp3.to(tl.float32)
    tl.store(out_ptr1 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp4, xmask)
def get_args():
    arg_0 = rand_strided((128, 26, 1, 1), (26, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((26,), (1,), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_per_fused__to_copy_convolution_backward_114.run(*args, 26, 128, grid=grid(26), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_per_fused__to_copy_convolution_backward_114.benchmark_all_configs(*args, 26, 128, grid=grid(26))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/4u/c4uahgnz2ykwhxsj7bueioargms6lhe32ffh6vzmxhridx2iurdm.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_600
# Pointwise fp16 -> fp32 cast of 16224 contiguous elements (= 26*624,
# the (26, 624, 1, 1) convolution weight shown in get_args); the
# transposed-shape counterpart of kernel 112.
triton_poi_fused__to_copy_115 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[16384], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_115(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 16224
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
    arg_0 = rand_strided((26, 624, 1, 1), (624, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((26, 624, 1, 1), (624, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_115.run(*args, 16224, grid=grid(16224), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_115.benchmark_all_configs(*args, 16224, grid=grid(16224))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/ke/cke6ikrfgpb6sjssa2vwqbbb346zboallywrrkyxxcehakhefz5y.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.add, aten.clone, aten.div, aten.fill, aten.mul, aten.native_batch_norm_backward, aten.sigmoid, aten.sub
# aten._native_batch_norm_legit_functional => convert_element_type_201
# aten.add => add_343, add_344
# aten.clone => clone_22
# aten.div => div_9
# aten.fill => full_like_25
# aten.mul => mul_809, mul_815, mul_816, mul_817
# aten.native_batch_norm_backward => convert_element_type_602, mul_824, sub_197, sub_199, sub_200
# aten.sigmoid => sigmoid_31, sigmoid_89
# aten.sub => sub_196
# Large fused pointwise backward over 15654912 elements
# (= 128*624*14*14). Per element it:
#   1. combines the upstream grad with a per-(batch,channel) sigmoid
#      gate (in_ptr1) plus a broadcast term divided by 196.0 (= 14*14,
#      the spatial size) — the gradient of a spatial mean;
#   2. multiplies by sigmoid(x)*(1 + x*(1-sigmoid(x))), the SiLU
#      derivative of in_ptr3;
#   3. applies batch-norm-backward normalization using per-channel
#      fp32 stats (624 channels; 3.985969387755102e-05 == 1/25088 ==
#      1/(128*14*14)), writing an fp32 result.
triton_poi_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_116 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[16777216], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp16', 4: '*fp16', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: '*fp32', 9: '*fp32', 10: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10), equal_to_1=())]})
@triton.jit
def triton_poi_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_116(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, in_ptr6, in_ptr7, in_ptr8, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 15654912
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x3 = xindex
    x4 = (xindex // 196)
    x1 = (xindex // 196) % 624
    tmp0 = tl.load(in_ptr0 + (x3), None).to(tl.float32)
    tmp1 = tl.load(in_ptr1 + (x4), None).to(tl.float32)
    tmp4 = tl.load(in_ptr2 + (x4), None).to(tl.float32)
    tmp8 = tl.load(in_ptr3 + (x3), None).to(tl.float32)
    tmp17 = tl.load(in_ptr4 + (x3), None).to(tl.float32)
    tmp19 = tl.load(in_ptr5 + (x1), None)
    tmp21 = tl.load(in_ptr6 + (x1), None)
    tmp24 = tl.load(in_ptr7 + (x1), None)
    tmp29 = tl.load(in_ptr8 + (x1), None)
    tmp2 = tl.sigmoid(tmp1)
    tmp3 = tmp0 * tmp2
    tmp5 = 196.0
    tmp6 = tmp4 / tmp5
    tmp7 = tmp3 + tmp6
    tmp9 = tl.sigmoid(tmp8)
    tmp10 = 1.0
    tmp11 = tmp10 - tmp9
    tmp12 = tmp8 * tmp11
    tmp13 = tmp12 + tmp10
    tmp14 = tmp9 * tmp13
    tmp15 = tmp7 * tmp14
    tmp16 = tmp15.to(tl.float32)
    tmp18 = tmp17.to(tl.float32)
    tmp20 = tmp18 - tmp19
    tmp22 = 3.985969387755102e-05
    tmp23 = tmp21 * tmp22
    tmp25 = tmp24 * tmp24
    tmp26 = tmp23 * tmp25
    tmp27 = tmp20 * tmp26
    tmp28 = tmp16 - tmp27
    tmp30 = tmp29 * tmp22
    tmp31 = tmp28 - tmp30
    tl.store(out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp31, None)
def get_args():
    arg_0 = rand_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 624, 1, 1), (624, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    arg_2 = rand_strided((128, 624, 1, 1), (624, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    arg_3 = rand_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_4 = rand_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_5 = rand_strided((1, 624, 1, 1), (624, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    arg_6 = rand_strided((624,), (1,), device='cuda:0', dtype=torch.float32)
    arg_7 = rand_strided((624,), (1,), device='cuda:0', dtype=torch.float32)
    arg_8 = rand_strided((624,), (1,), device='cuda:0', dtype=torch.float32)
    arg_9 = rand_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7, arg_8, arg_9,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_116.run(*args, 15654912, grid=grid(15654912), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_116.benchmark_all_configs(*args, 15654912, grid=grid(15654912))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/v7/cv7sl67sz2xaf4l6bv2yby2sj22h2574hqzu7lld6chl75bgzok7.py
# Original ATen: aten.convolution_backward
# aten.convolution_backward => convolution_backward_75
# Pointwise channel-slice kernel feeding a grouped convolution
# backward: reads channels 468..623 of a (128, 624, 14, 14) fp32 grad
# (base offset 91728 = 468*196), scales each value by the per-channel
# product in_ptr1[468+c] * in_ptr2[468+c], casts to fp16, and writes a
# contiguous (128, 156, 14, 14) output. Kernel 119 below is the same
# pattern for the 312..467 channel slice.
triton_poi_fused_convolution_backward_117 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp32', 2: '*fp32', 3: '*fp16', 4: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4), equal_to_1=())]})
@triton.jit
def triton_poi_fused_convolution_backward_117(in_ptr0, in_ptr1, in_ptr2, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 3913728
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x2 = (xindex // 30576)
    x3 = xindex % 30576
    x1 = (xindex // 196) % 156
    x4 = xindex
    tmp0 = tl.load(in_ptr0 + (91728 + x3 + (122304*x2)), None)
    tmp1 = tl.load(in_ptr1 + (468 + x1), None)
    tmp2 = tl.load(in_ptr2 + (468 + x1), None)
    tmp3 = tmp1 * tmp2
    tmp4 = tmp0 * tmp3
    tmp5 = tmp4.to(tl.float32)
    tl.store(out_ptr0 + (x4 + tl.zeros([XBLOCK], tl.int32)), tmp5, None)
def get_args():
    arg_0 = rand_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((624,), (1,), device='cuda:0', dtype=torch.float32)
    arg_2 = rand_strided((624,), (1,), device='cuda:0', dtype=torch.float32)
    arg_3 = rand_strided((128, 156, 14, 14), (30576, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1, arg_2, arg_3,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused_convolution_backward_117.run(*args, 3913728, grid=grid(3913728), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused_convolution_backward_117.benchmark_all_configs(*args, 3913728, grid=grid(3913728))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/sn/csnxk2aehntwbtuxvtyjfxxx5pcnnxto57kxv3m57bgn74rxyzdo.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_605
# Pointwise fp16 -> fp32 cast of 12636 contiguous elements (= 156*81,
# the (156, 1, 9, 9) depthwise-style conv weight in get_args). Note
# divisible_by_16 only covers args (0, 1) here since 12636 is not a
# multiple of 16.
triton_poi_fused__to_copy_118 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[16384], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_118(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 12636
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
    arg_0 = rand_strided((156, 1, 9, 9), (81, 81, 9, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((156, 1, 9, 9), (81, 81, 9, 1), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_118.run(*args, 12636, grid=grid(12636), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_118.benchmark_all_configs(*args, 12636, grid=grid(12636))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/b2/cb2zxjnunwnxsrlbgr5rnqihgpyuvgs44v74hreykirhvteg3imr.py
# Original ATen: aten.convolution_backward
# aten.convolution_backward => convolution_backward_76
# Same channel-slice pattern as kernel 117, but for channels 312..467
# of the (128, 624, 14, 14) fp32 grad (base offset 61152 = 312*196):
# scale by in_ptr1[312+c] * in_ptr2[312+c], cast to fp16, write a
# contiguous (128, 156, 14, 14) output.
triton_poi_fused_convolution_backward_119 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp32', 2: '*fp32', 3: '*fp16', 4: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4), equal_to_1=())]})
@triton.jit
def triton_poi_fused_convolution_backward_119(in_ptr0, in_ptr1, in_ptr2, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 3913728
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x2 = (xindex // 30576)
    x3 = xindex % 30576
    x1 = (xindex // 196) % 156
    x4 = xindex
    tmp0 = tl.load(in_ptr0 + (61152 + x3 + (122304*x2)), None)
    tmp1 = tl.load(in_ptr1 + (312 + x1), None)
    tmp2 = tl.load(in_ptr2 + (312 + x1), None)
    tmp3 = tmp1 * tmp2
    tmp4 = tmp0 * tmp3
    tmp5 = tmp4.to(tl.float32)
    tl.store(out_ptr0 + (x4 + tl.zeros([XBLOCK], tl.int32)), tmp5, None)
def get_args():
    arg_0 = rand_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((624,), (1,), device='cuda:0', dtype=torch.float32)
    arg_2 = rand_strided((624,), (1,), device='cuda:0', dtype=torch.float32)
    arg_3 = rand_strided((128, 156, 14, 14), (30576, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1, arg_2, arg_3,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused_convolution_backward_119.run(*args, 3913728, grid=grid(3913728), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused_convolution_backward_119.benchmark_all_configs(*args, 3913728, grid=grid(3913728))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/mq/cmqf6tn342yiwkdctvgld7rocxgxgv3azqztra5gqfjonbr5lfkj.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_606
# Runtime-compiled Triton kernel: elementwise fp16 -> fp32 cast of a
# depthwise conv weight, 7644 = 156*7*7 elements ((156, 1, 7, 7)).
triton_poi_fused__to_copy_120 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[8192], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_120(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 7644
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)

def get_args():
    arg_0 = rand_strided((156, 1, 7, 7), (49, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((156, 1, 7, 7), (49, 49, 7, 1), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1,

def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_120.run(*args, 7644, grid=grid(7644), stream=stream0)

def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_120.benchmark_all_configs(*args, 7644, grid=grid(7644))

if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/c6/cc6x33tgwrw2tamqam7mfhx5ek6rlk7b3nnl7psxxxxvdr23nguf.py
# Original ATen: aten.convolution_backward
# aten.convolution_backward => convolution_backward_77
# Runtime-compiled Triton kernel: same pattern as kernel 119 but for channels
# [156, 312) of the (128, 624, 14, 14) fp32 gradient (offset 30576 = 156*196);
# scales by in_ptr1[156+c] * in_ptr2[156+c] and stores fp16.
triton_poi_fused_convolution_backward_121 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp32', 2: '*fp32', 3: '*fp16', 4: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4), equal_to_1=())]})
@triton.jit
def triton_poi_fused_convolution_backward_121(in_ptr0, in_ptr1, in_ptr2, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 3913728
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x2 = (xindex // 30576)
    x3 = xindex % 30576
    x1 = (xindex // 196) % 156
    x4 = xindex
    tmp0 = tl.load(in_ptr0 + (30576 + x3 + (122304*x2)), None)
    tmp1 = tl.load(in_ptr1 + (156 + x1), None)
    tmp2 = tl.load(in_ptr2 + (156 + x1), None)
    tmp3 = tmp1 * tmp2
    tmp4 = tmp0 * tmp3
    tmp5 = tmp4.to(tl.float32)
    tl.store(out_ptr0 + (x4 + tl.zeros([XBLOCK], tl.int32)), tmp5, None)

def get_args():
    arg_0 = rand_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((624,), (1,), device='cuda:0', dtype=torch.float32)
    arg_2 = rand_strided((624,), (1,), device='cuda:0', dtype=torch.float32)
    arg_3 = rand_strided((128, 156, 14, 14), (30576, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1, arg_2, arg_3,

def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused_convolution_backward_121.run(*args, 3913728, grid=grid(3913728), stream=stream0)

def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused_convolution_backward_121.benchmark_all_configs(*args, 3913728, grid=grid(3913728))

if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/bo/cbod67uuatalyrpqcdu3oy7sbucan6ecl2ia4zk35wtdojhhxonj.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_607
# Runtime-compiled Triton kernel: elementwise fp16 -> fp32 cast,
# 3900 = 156*5*5 elements ((156, 1, 5, 5) depthwise weight).
triton_poi_fused__to_copy_122 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[4096], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_122(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 3900
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)

def get_args():
    arg_0 = rand_strided((156, 1, 5, 5), (25, 25, 5, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((156, 1, 5, 5), (25, 25, 5, 1), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1,

def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_122.run(*args, 3900, grid=grid(3900), stream=stream0)

def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_122.benchmark_all_configs(*args, 3900, grid=grid(3900))

if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/rc/crclip5jefet3ymndwzega4qujbberu5gvtz4mjx4deqjbqd4sg3.py
# Original ATen: aten.convolution_backward
# aten.convolution_backward => convolution_backward_78
# Runtime-compiled Triton kernel: same pattern as kernels 119/121 but for
# channels [0, 156) of the (128, 624, 14, 14) fp32 gradient (no channel
# offset); scales by in_ptr1[c] * in_ptr2[c] and stores fp16.
triton_poi_fused_convolution_backward_123 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp32', 2: '*fp32', 3: '*fp16', 4: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4), equal_to_1=())]})
@triton.jit
def triton_poi_fused_convolution_backward_123(in_ptr0, in_ptr1, in_ptr2, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 3913728
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x2 = (xindex // 30576)
    x3 = xindex % 30576
    x1 = (xindex // 196) % 156
    x4 = xindex
    tmp0 = tl.load(in_ptr0 + (x3 + (122304*x2)), None)
    tmp1 = tl.load(in_ptr1 + (x1), None)
    tmp2 = tl.load(in_ptr2 + (x1), None)
    tmp3 = tmp1 * tmp2
    tmp4 = tmp0 * tmp3
    tmp5 = tmp4.to(tl.float32)
    tl.store(out_ptr0 + (x4 + tl.zeros([XBLOCK], tl.int32)), tmp5, None)

def get_args():
    arg_0 = rand_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((624,), (1,), device='cuda:0', dtype=torch.float32)
    arg_2 = rand_strided((624,), (1,), device='cuda:0', dtype=torch.float32)
    arg_3 = rand_strided((128, 156, 14, 14), (30576, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1, arg_2, arg_3,

def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused_convolution_backward_123.run(*args, 3913728, grid=grid(3913728), stream=stream0)

def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused_convolution_backward_123.benchmark_all_configs(*args, 3913728, grid=grid(3913728))

if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/eu/ceukfvbcwohfoz2lavfjfb2ycp454er54mrt4pq2hnohxkqujrr3.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_608
# Runtime-compiled Triton kernel: elementwise fp16 -> fp32 cast,
# 1404 = 156*3*3 elements ((156, 1, 3, 3) depthwise weight).
triton_poi_fused__to_copy_124 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[2048], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_124(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 1404
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)

def get_args():
    arg_0 = rand_strided((156, 1, 3, 3), (9, 9, 3, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((156, 1, 3, 3), (9, 9, 3, 1), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1,

def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_124.run(*args, 1404, grid=grid(1404), stream=stream0)

def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_124.benchmark_all_configs(*args, 1404, grid=grid(1404))

if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/mj/cmjohiwxqtsa56w5lbbo3aubkq7zy3joom3fttzn7hijt3v2ims3.py
# Original ATen: aten.cat
# aten.cat => cat_58
# Runtime-compiled Triton kernel implementing one branch of a channel concat:
# copies a contiguous (128, 156, 14, 14) fp16 tensor into a strided slice of
# a wider buffer (output batch stride 122304 = 624*196, i.e. a 156-channel
# view of a 624-channel tensor).
triton_poi_fused_cat_125 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_cat_125(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 3913728
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x2 = xindex
    x0 = xindex % 30576
    x1 = (xindex // 30576)
    tmp0 = tl.load(in_ptr0 + (x2), None).to(tl.float32)
    tl.store(out_ptr0 + (x0 + (122304*x1) + tl.zeros([XBLOCK], tl.int32)), tmp0, None)

def get_args():
    arg_0 = rand_strided((128, 156, 14, 14), (30576, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 156, 14, 14), (122304, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,

def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused_cat_125.run(*args, 3913728, grid=grid(3913728), stream=stream0)

def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused_cat_125.benchmark_all_configs(*args, 3913728, grid=grid(3913728))

if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/rn/crn2e5wpsvghyj2fewhvokcjgyf5y4tcls4p5a3zgbxt3jo63qp5.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_612
# Runtime-compiled Triton kernel: elementwise fp16 -> fp32 cast,
# 16224 = 312*52 elements ((312, 52, 1, 1) pointwise conv weight).
triton_poi_fused__to_copy_126 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[16384], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_126(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 16224
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)

def get_args():
    arg_0 = rand_strided((312, 52, 1, 1), (52, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((312, 52, 1, 1), (52, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1,

def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_126.run(*args, 16224, grid=grid(16224), stream=stream0)

def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_126.benchmark_all_configs(*args, 16224, grid=grid(16224))

if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/if/cifaxtccprphcf3f7mkgsgusdq7va54imzb4ozr5t5rqz26r4xz3.py
# Original ATen: aten.cat
# aten.cat => cat_59
# Runtime-compiled Triton kernel implementing one branch of a channel concat:
# copies a contiguous (128, 52, 14, 14) fp16 tensor into a strided slice of a
# wider buffer (output batch stride 20384 = 104*196, i.e. a 52-channel view
# of a 104-channel tensor).
triton_poi_fused_cat_127 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[2097152], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_cat_127(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 1304576
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x2 = xindex
    x0 = xindex % 10192
    x1 = (xindex // 10192)
    tmp0 = tl.load(in_ptr0 + (x2), None).to(tl.float32)
    tl.store(out_ptr0 + (x0 + (20384*x1) + tl.zeros([XBLOCK], tl.int32)), tmp0, None)

def get_args():
    arg_0 = rand_strided((128, 52, 14, 14), (10192, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 52, 14, 14), (20384, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,

def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused_cat_127.run(*args, 1304576, grid=grid(1304576), stream=stream0)

def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused_cat_127.benchmark_all_configs(*args, 1304576, grid=grid(1304576))

if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/qf/cqfc2p7t5thx2bnsyx6ihjsqv65lizzzkgfndnkunfne6ahlxrbx.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.add, aten.native_batch_norm_backward
# aten._native_batch_norm_legit_functional => convert_element_type_189
# aten.add => add_346
# aten.native_batch_norm_backward => convert_element_type_614, mul_839, sub_206, sum_85, sum_86
# Runtime-compiled Triton reduction for batch-norm backward partial sums.
# 416 = 104 channels x 4 batch splits; rnumel 6272 = 32*196 per split.
# grad = in_ptr0 + in_ptr1 (two fp16 gradients added); out_ptr0 = sum(grad),
# out_ptr1 = sum(grad * (x - mean)) where in_ptr3 is the per-channel mean —
# both written as (104, 4) partials, presumably combined by a later kernel.
triton_red_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_128 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@reduction(
    size_hints=[512, 8192],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp32', 4: '*fp32', 5: '*fp32', 6: 'i32', 7: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_128(in_ptr0, in_ptr1, in_ptr2, in_ptr3, out_ptr0, out_ptr1, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
    xnumel = 416
    rnumel = 6272
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rbase = tl.arange(0, RBLOCK)[None, :]
    x0 = xindex % 104
    x1 = (xindex // 104)
    _tmp4 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
    x3 = xindex
    tmp7 = tl.load(in_ptr3 + (x0), xmask)
    _tmp10 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
    for roffset in range(0, rnumel, RBLOCK):
        rindex = roffset + rbase
        rmask = rindex < rnumel
        r2 = rindex
        tmp0 = tl.load(in_ptr0 + ((196*x0) + (20384*(r2 // 196)) + (652288*x1) + (r2 % 196)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp1 = tl.load(in_ptr1 + ((196*x0) + (20384*(r2 // 196)) + (652288*x1) + (r2 % 196)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp5 = tl.load(in_ptr2 + ((196*x0) + (20384*(r2 // 196)) + (652288*x1) + (r2 % 196)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp2 = tmp0 + tmp1
        tmp3 = tmp2.to(tl.float32)
        _tmp4 = tl.where(rmask & xmask, _tmp4 + tmp3, _tmp4)
        tmp6 = tmp5.to(tl.float32)
        tmp8 = tmp6 - tmp7
        tmp9 = tmp3 * tmp8
        _tmp10 = tl.where(rmask & xmask, _tmp10 + tmp9, _tmp10)
    tmp4 = tl.sum(_tmp4, 1)[:, None]
    tl.store(out_ptr0 + x3, tmp4, xmask)
    tmp10 = tl.sum(_tmp10, 1)[:, None]
    tl.store(out_ptr1 + x3, tmp10, xmask)

def get_args():
    arg_0 = rand_strided((128, 104, 14, 14), (20384, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 104, 14, 14), (20384, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_2 = rand_strided((128, 104, 14, 14), (20384, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_3 = rand_strided((1, 104, 1, 1), (104, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    arg_4 = rand_strided((104, 4), (1, 104), device='cuda:0', dtype=torch.float32)
    arg_5 = rand_strided((104, 4), (1, 104), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5,

def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_red_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_128.run(*args, 416, 6272, grid=grid(416), stream=stream0)

def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_red_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_128.benchmark_all_configs(*args, 416, 6272, grid=grid(416))

if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/55/c55duaucwd5mqi2jcpis5psliz6udgc3zunl4l6gsysumyukuodb.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.add, aten.native_batch_norm_backward
# aten._native_batch_norm_legit_functional => convert_element_type_189
# aten.add => add_346
# aten.native_batch_norm_backward => convert_element_type_614, convert_element_type_616, mul_845, mul_846, sub_206, sub_208, sub_209
# Runtime-compiled Triton kernel: final pointwise stage of batch-norm
# backward. grad = in_ptr0 + in_ptr1; x = in_ptr2; in_ptr3 = per-channel
# mean; the 3.985969387755102e-05 constant is 1/25088 = 1/(128*196), the
# reciprocal of the per-channel element count. in_ptr5 appears squared
# (invstd-like) and tmp20 = in_ptr5 * in_ptr7 scales the result — presumably
# invstd * weight; verify against the generating graph.
triton_poi_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_129 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp32', 4: '*fp32', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: '*fp16', 9: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9), equal_to_1=())]})
@triton.jit
def triton_poi_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_129(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, in_ptr6, in_ptr7, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 2609152
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x3 = xindex
    x1 = (xindex // 196) % 104
    tmp0 = tl.load(in_ptr0 + (x3), None).to(tl.float32)
    tmp1 = tl.load(in_ptr1 + (x3), None).to(tl.float32)
    tmp4 = tl.load(in_ptr2 + (x3), None).to(tl.float32)
    tmp6 = tl.load(in_ptr3 + (x1), None)
    tmp8 = tl.load(in_ptr4 + (x1), None)
    tmp11 = tl.load(in_ptr5 + (x1), None)
    tmp16 = tl.load(in_ptr6 + (x1), None)
    tmp19 = tl.load(in_ptr7 + (x1), None)
    tmp2 = tmp0 + tmp1
    tmp3 = tmp2.to(tl.float32)
    tmp5 = tmp4.to(tl.float32)
    tmp7 = tmp5 - tmp6
    tmp9 = 3.985969387755102e-05
    tmp10 = tmp8 * tmp9
    tmp12 = tmp11 * tmp11
    tmp13 = tmp10 * tmp12
    tmp14 = tmp7 * tmp13
    tmp15 = tmp3 - tmp14
    tmp17 = tmp16 * tmp9
    tmp18 = tmp15 - tmp17
    tmp20 = tmp11 * tmp19
    tmp21 = tmp18 * tmp20
    tmp22 = tmp21.to(tl.float32)
    tl.store(out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp22, None)

def get_args():
    arg_0 = rand_strided((128, 104, 14, 14), (20384, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 104, 14, 14), (20384, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_2 = rand_strided((128, 104, 14, 14), (20384, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_3 = rand_strided((1, 104, 1, 1), (104, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    arg_4 = rand_strided((104,), (1,), device='cuda:0', dtype=torch.float32)
    arg_5 = rand_strided((104,), (1,), device='cuda:0', dtype=torch.float32)
    arg_6 = rand_strided((104,), (1,), device='cuda:0', dtype=torch.float32)
    arg_7 = rand_strided((104,), (1,), device='cuda:0', dtype=torch.float32)
    arg_8 = rand_strided((128, 104, 14, 14), (20384, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7, arg_8,

def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_129.run(*args, 2609152, grid=grid(2609152), stream=stream0)

def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_129.benchmark_all_configs(*args, 2609152, grid=grid(2609152))

if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/gi/cgidg4l74j3mxppzfsjtpi42hq4ihs47f6epftki5i5ywvfqu6wv.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.add, aten.native_batch_norm_backward
# aten._native_batch_norm_legit_functional => convert_element_type_165
# aten.add => add_346, add_351
# aten.native_batch_norm_backward => convert_element_type_638, mul_879, sub_222, sum_94, sum_95
# Inductor-generated reduction kernel (source kept as a string, compiled
# asynchronously).  Computes per-(channel, batch-group) partial sums for the
# batch-norm backward pass over a (128, 104, 14, 14) activation:
#   out_ptr0[x] = sum(grad)
#   out_ptr1[x] = sum(grad * (x_in - mean))
# where grad = in_ptr0 + in_ptr1 + in_ptr2 (three fp16 gradients added).
triton_red_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_130 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@reduction(
    size_hints=[512, 8192],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp16', 4: '*fp32', 5: '*fp32', 6: '*fp32', 7: 'i32', 8: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_130(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, out_ptr0, out_ptr1, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
    # xnumel = 416 = 104 channels * 4 batch groups; each x reduces over
    # rnumel = 6272 = 32 images * 196 (14*14) spatial positions.
    xnumel = 416
    rnumel = 6272
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rbase = tl.arange(0, RBLOCK)[None, :]
    x0 = xindex % 104          # channel index
    x1 = (xindex // 104)       # batch group (32 images each)
    _tmp6 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
    x3 = xindex
    # per-channel mean, broadcast over the whole reduction
    tmp9 = tl.load(in_ptr4 + (x0), xmask)
    _tmp12 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
    for roffset in range(0, rnumel, RBLOCK):
        rindex = roffset + rbase
        rmask = rindex < rnumel
        r2 = rindex
        # offset = pixel + 196*channel + 20384*image: image = 32*x1 + r2//196,
        # strides (20384, 196, 14, 1) of the (128, 104, 14, 14) fp16 tensors
        tmp0 = tl.load(in_ptr0 + ((196*x0) + (20384*(r2 // 196)) + (652288*x1) + (r2 % 196)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp1 = tl.load(in_ptr1 + ((196*x0) + (20384*(r2 // 196)) + (652288*x1) + (r2 % 196)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp3 = tl.load(in_ptr2 + ((196*x0) + (20384*(r2 // 196)) + (652288*x1) + (r2 % 196)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp7 = tl.load(in_ptr3 + ((196*x0) + (20384*(r2 // 196)) + (652288*x1) + (r2 % 196)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp2 = tmp0 + tmp1
        tmp4 = tmp2 + tmp3
        tmp5 = tmp4.to(tl.float32)       # accumulate the summed gradient in fp32
        _tmp6 = tl.where(rmask & xmask, _tmp6 + tmp5, _tmp6)
        tmp8 = tmp7.to(tl.float32)
        tmp10 = tmp8 - tmp9              # x_in - mean
        tmp11 = tmp5 * tmp10
        _tmp12 = tl.where(rmask & xmask, _tmp12 + tmp11, _tmp12)
    tmp6 = tl.sum(_tmp6, 1)[:, None]
    tl.store(out_ptr0 + x3, tmp6, xmask)
    tmp12 = tl.sum(_tmp12, 1)[:, None]
    tl.store(out_ptr1 + x3, tmp12, xmask)
def get_args():
    # Standalone argument factory used by the benchmarking harness below.
    arg_0 = rand_strided((128, 104, 14, 14), (20384, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 104, 14, 14), (20384, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_2 = rand_strided((128, 104, 14, 14), (20384, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_3 = rand_strided((128, 104, 14, 14), (20384, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_4 = rand_strided((1, 104, 1, 1), (104, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    arg_5 = rand_strided((104, 4), (1, 104), device='cuda:0', dtype=torch.float32)
    arg_6 = rand_strided((104, 4), (1, 104), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6,
def call(args):
    # Launch with xnumel=416, rnumel=6272 on device 0's current stream.
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_red_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_130.run(*args, 416, 6272, grid=grid(416), stream=stream0)
def benchmark_all_configs(args):
    # Time every autotuning config for this kernel (harness helper).
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_red_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_130.benchmark_all_configs(*args, 416, 6272, grid=grid(416))
if __name__ == '__main__':
    # Micro-benchmark entry point: report runtime and achieved bandwidth.
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/nf/cnfybinrqgwpzwkognoekbtqzfldbno4q6jduhaywjxls5lwqbsd.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.add, aten.native_batch_norm_backward
# aten._native_batch_norm_legit_functional => convert_element_type_165
# aten.add => add_346, add_351
# aten.native_batch_norm_backward => convert_element_type_638, mul_885, mul_886, sub_222, sub_224, sub_225
# Pointwise batch-norm backward: combines the summed gradient (three fp16
# inputs added) with the per-channel statistics produced by the matching
# reduction kernel to form grad_input in fp32:
#   grad_in = (grad - (x - mean) * inv_var^2 * sum_dy_xmu / N - sum_dy / N) * (inv_var * weight)
triton_poi_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_131 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp16', 4: '*fp32', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: '*fp32', 9: '*fp32', 10: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10), equal_to_1=())]})
@triton.jit
def triton_poi_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_131(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, in_ptr6, in_ptr7, in_ptr8, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    # xnumel = 2609152 = 128 * 104 * 14 * 14 elements.
    xnumel = 2609152
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    # NOTE(review): xmask is computed but the loads/stores below pass no mask;
    # inductor appears to drop it because xnumel divides the launch size evenly.
    xmask = xindex < xnumel
    x3 = xindex
    x1 = (xindex // 196) % 104   # channel index of the (128, 104, 14, 14) layout
    tmp0 = tl.load(in_ptr0 + (x3), None).to(tl.float32)
    tmp1 = tl.load(in_ptr1 + (x3), None).to(tl.float32)
    tmp3 = tl.load(in_ptr2 + (x3), None).to(tl.float32)
    tmp6 = tl.load(in_ptr3 + (x3), None).to(tl.float32)
    # per-channel scalars: mean, sum(dy*(x-mean)), inv_std, sum(dy), weight
    tmp8 = tl.load(in_ptr4 + (x1), None)
    tmp10 = tl.load(in_ptr5 + (x1), None)
    tmp13 = tl.load(in_ptr6 + (x1), None)
    tmp18 = tl.load(in_ptr7 + (x1), None)
    tmp21 = tl.load(in_ptr8 + (x1), None)
    tmp2 = tmp0 + tmp1
    tmp4 = tmp2 + tmp3
    tmp5 = tmp4.to(tl.float32)
    tmp7 = tmp6.to(tl.float32)
    tmp9 = tmp7 - tmp8
    # 3.985969387755102e-05 == 1 / 25088 == 1 / (128 * 14 * 14) reduction count
    tmp11 = 3.985969387755102e-05
    tmp12 = tmp10 * tmp11
    tmp14 = tmp13 * tmp13
    tmp15 = tmp12 * tmp14
    tmp16 = tmp9 * tmp15
    tmp17 = tmp5 - tmp16
    tmp19 = tmp18 * tmp11
    tmp20 = tmp17 - tmp19
    tmp22 = tmp13 * tmp21
    tmp23 = tmp20 * tmp22
    tl.store(out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp23, None)
def get_args():
    # Standalone argument factory used by the benchmarking harness below.
    arg_0 = rand_strided((128, 104, 14, 14), (20384, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 104, 14, 14), (20384, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_2 = rand_strided((128, 104, 14, 14), (20384, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_3 = rand_strided((128, 104, 14, 14), (20384, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_4 = rand_strided((1, 104, 1, 1), (104, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    arg_5 = rand_strided((104,), (1,), device='cuda:0', dtype=torch.float32)
    arg_6 = rand_strided((104,), (1,), device='cuda:0', dtype=torch.float32)
    arg_7 = rand_strided((104,), (1,), device='cuda:0', dtype=torch.float32)
    arg_8 = rand_strided((104,), (1,), device='cuda:0', dtype=torch.float32)
    arg_9 = rand_strided((128, 104, 14, 14), (20384, 196, 14, 1), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7, arg_8, arg_9,
def call(args):
    # Launch over all 2609152 elements on device 0's current stream.
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_131.run(*args, 2609152, grid=grid(2609152), stream=stream0)
def benchmark_all_configs(args):
    # Time every autotuning config for this kernel (harness helper).
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_131.benchmark_all_configs(*args, 2609152, grid=grid(2609152))
if __name__ == '__main__':
    # Micro-benchmark entry point: report runtime and achieved bandwidth.
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/e6/ce6moc5iqgt57e7eqjuf5zhxhh5tsl3aixs4uw2xn3bhmzxi4yko.py
# Original ATen: aten.convolution_backward
# aten.convolution_backward => convolution_backward_91
# Channel-slice copy feeding a grouped convolution backward: extracts the
# SECOND half of the channels (52..103, element offset 10192 = 52*196) from
# the fp32 (128, 104, 14, 14) gradient into a contiguous (128, 52, 14, 14)
# fp16 tensor (the cast happens on store to the fp16 pointer).
triton_poi_fused_convolution_backward_132 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[2097152], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_convolution_backward_132(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    # xnumel = 1304576 = 128 * 52 * 14 * 14 output elements.
    xnumel = 1304576
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex % 10192      # offset within one image's 52-channel slice
    x1 = (xindex // 10192)   # image index
    x2 = xindex
    # +10192 skips the first 52 channels of each 104-channel image
    tmp0 = tl.load(in_ptr0 + (10192 + x0 + (20384*x1)), None)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp1, None)
def get_args():
    # Standalone argument factory used by the benchmarking harness below.
    arg_0 = rand_strided((128, 104, 14, 14), (20384, 196, 14, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((128, 52, 14, 14), (10192, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,
def call(args):
    # Launch over all 1304576 output elements on device 0's current stream.
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused_convolution_backward_132.run(*args, 1304576, grid=grid(1304576), stream=stream0)
def benchmark_all_configs(args):
    # Time every autotuning config for this kernel (harness helper).
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused_convolution_backward_132.benchmark_all_configs(*args, 1304576, grid=grid(1304576))
if __name__ == '__main__':
    # Micro-benchmark entry point: report runtime and achieved bandwidth.
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/dk/cdkheqxbj4lk4wbn4mfiw2m45ufmxv76zkupgnoqujmqz7cf3flk.py
# Original ATen: aten.convolution_backward
# aten.convolution_backward => convolution_backward_92
# Companion of kernel 132: extracts the FIRST half of the channels (0..51,
# no base offset) from the fp32 (128, 104, 14, 14) gradient into a contiguous
# (128, 52, 14, 14) fp16 tensor (cast happens on store to the fp16 pointer).
triton_poi_fused_convolution_backward_133 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[2097152], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_convolution_backward_133(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    # xnumel = 1304576 = 128 * 52 * 14 * 14 output elements.
    xnumel = 1304576
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex % 10192      # offset within one image's 52-channel slice
    x1 = (xindex // 10192)   # image index
    x2 = xindex
    # no channel offset: reads channels 0..51 of each 104-channel image
    tmp0 = tl.load(in_ptr0 + (x0 + (20384*x1)), None)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp1, None)
def get_args():
    # Standalone argument factory used by the benchmarking harness below.
    arg_0 = rand_strided((128, 104, 14, 14), (20384, 196, 14, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((128, 52, 14, 14), (10192, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,
def call(args):
    # Launch over all 1304576 output elements on device 0's current stream.
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused_convolution_backward_133.run(*args, 1304576, grid=grid(1304576), stream=stream0)
def benchmark_all_configs(args):
    # Time every autotuning config for this kernel (harness helper).
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused_convolution_backward_133.benchmark_all_configs(*args, 1304576, grid=grid(1304576))
if __name__ == '__main__':
    # Micro-benchmark entry point: report runtime and achieved bandwidth.
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/w3/cw3zbb3x7kiu4ifygfhy3eaz6tyephtwgkxcx7rkmsrsvtrbrcix.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.add, aten.native_batch_norm_backward
# aten._native_batch_norm_legit_functional => convert_element_type_141
# aten.add => add_346, add_351, add_356
# aten.native_batch_norm_backward => convert_element_type_662, mul_919, sub_238, sum_103, sum_104
# Same shape as kernel 130 but with FOUR fp16 gradient inputs added together
# (one extra residual branch).  Produces the two per-(channel, batch-group)
# partial sums needed by batch-norm backward:
#   out_ptr0[x] = sum(grad), out_ptr1[x] = sum(grad * (x_in - mean)).
triton_red_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_134 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@reduction(
    size_hints=[512, 8192],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp16', 4: '*fp16', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: 'i32', 9: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_134(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, out_ptr0, out_ptr1, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
    # xnumel = 416 = 104 channels * 4 batch groups; each x reduces over
    # rnumel = 6272 = 32 images * 196 (14*14) spatial positions.
    xnumel = 416
    rnumel = 6272
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rbase = tl.arange(0, RBLOCK)[None, :]
    x0 = xindex % 104          # channel index
    x1 = (xindex // 104)       # batch group (32 images each)
    _tmp8 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
    x3 = xindex
    # per-channel mean, broadcast over the whole reduction
    tmp11 = tl.load(in_ptr5 + (x0), xmask)
    _tmp14 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
    for roffset in range(0, rnumel, RBLOCK):
        rindex = roffset + rbase
        rmask = rindex < rnumel
        r2 = rindex
        # offset = pixel + 196*channel + 20384*image: image = 32*x1 + r2//196
        tmp0 = tl.load(in_ptr0 + ((196*x0) + (20384*(r2 // 196)) + (652288*x1) + (r2 % 196)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp1 = tl.load(in_ptr1 + ((196*x0) + (20384*(r2 // 196)) + (652288*x1) + (r2 % 196)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp3 = tl.load(in_ptr2 + ((196*x0) + (20384*(r2 // 196)) + (652288*x1) + (r2 % 196)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp5 = tl.load(in_ptr3 + ((196*x0) + (20384*(r2 // 196)) + (652288*x1) + (r2 % 196)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp9 = tl.load(in_ptr4 + ((196*x0) + (20384*(r2 // 196)) + (652288*x1) + (r2 % 196)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp2 = tmp0 + tmp1
        tmp4 = tmp2 + tmp3
        tmp6 = tmp4 + tmp5
        tmp7 = tmp6.to(tl.float32)       # accumulate the summed gradient in fp32
        _tmp8 = tl.where(rmask & xmask, _tmp8 + tmp7, _tmp8)
        tmp10 = tmp9.to(tl.float32)
        tmp12 = tmp10 - tmp11            # x_in - mean
        tmp13 = tmp7 * tmp12
        _tmp14 = tl.where(rmask & xmask, _tmp14 + tmp13, _tmp14)
    tmp8 = tl.sum(_tmp8, 1)[:, None]
    tl.store(out_ptr0 + x3, tmp8, xmask)
    tmp14 = tl.sum(_tmp14, 1)[:, None]
    tl.store(out_ptr1 + x3, tmp14, xmask)
def get_args():
    # Standalone argument factory used by the benchmarking harness below.
    arg_0 = rand_strided((128, 104, 14, 14), (20384, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 104, 14, 14), (20384, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_2 = rand_strided((128, 104, 14, 14), (20384, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_3 = rand_strided((128, 104, 14, 14), (20384, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_4 = rand_strided((128, 104, 14, 14), (20384, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_5 = rand_strided((1, 104, 1, 1), (104, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    arg_6 = rand_strided((104, 4), (1, 104), device='cuda:0', dtype=torch.float32)
    arg_7 = rand_strided((104, 4), (1, 104), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7,
def call(args):
    # Launch with xnumel=416, rnumel=6272 on device 0's current stream.
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_red_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_134.run(*args, 416, 6272, grid=grid(416), stream=stream0)
def benchmark_all_configs(args):
    # Time every autotuning config for this kernel (harness helper).
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_red_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_134.benchmark_all_configs(*args, 416, 6272, grid=grid(416))
if __name__ == '__main__':
    # Micro-benchmark entry point: report runtime and achieved bandwidth.
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/ej/cejoyk2b6v7darhrxgzp23ovzotirkownmjt7wscn7uwlctg5oxk.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.add, aten.convolution_backward, aten.native_batch_norm_backward
# aten._native_batch_norm_legit_functional => convert_element_type_141
# aten.add => add_346, add_351, add_356
# aten.convolution_backward => convolution_backward_101
# aten.native_batch_norm_backward => convert_element_type_662, convert_element_type_664, mul_925, mul_926, sub_238, sub_240, sub_241
# Pointwise companion of reduction kernel 134: combines the four-way summed
# gradient with per-channel statistics to form grad_input, then casts the
# result back to fp16 for the following convolution backward.
triton_poi_fused__native_batch_norm_legit_functional_add_convolution_backward_native_batch_norm_backward_135 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp16', 4: '*fp16', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: '*fp32', 9: '*fp32', 10: '*fp16', 11: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11), equal_to_1=())]})
@triton.jit
def triton_poi_fused__native_batch_norm_legit_functional_add_convolution_backward_native_batch_norm_backward_135(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, in_ptr6, in_ptr7, in_ptr8, in_ptr9, out_ptr1, xnumel, XBLOCK : tl.constexpr):
    # xnumel = 2609152 = 128 * 104 * 14 * 14 elements.
    xnumel = 2609152
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    # NOTE(review): xmask is computed but the loads/stores below pass no mask;
    # inductor appears to drop it because xnumel divides the launch size evenly.
    xmask = xindex < xnumel
    x3 = xindex
    x1 = (xindex // 196) % 104   # channel index of the (128, 104, 14, 14) layout
    tmp0 = tl.load(in_ptr0 + (x3), None).to(tl.float32)
    tmp1 = tl.load(in_ptr1 + (x3), None).to(tl.float32)
    tmp3 = tl.load(in_ptr2 + (x3), None).to(tl.float32)
    tmp5 = tl.load(in_ptr3 + (x3), None).to(tl.float32)
    tmp8 = tl.load(in_ptr4 + (x3), None).to(tl.float32)
    # per-channel scalars: mean, sum(dy*(x-mean)), inv_std, sum(dy), weight
    tmp10 = tl.load(in_ptr5 + (x1), None)
    tmp12 = tl.load(in_ptr6 + (x1), None)
    tmp15 = tl.load(in_ptr7 + (x1), None)
    tmp20 = tl.load(in_ptr8 + (x1), None)
    tmp23 = tl.load(in_ptr9 + (x1), None)
    tmp2 = tmp0 + tmp1
    tmp4 = tmp2 + tmp3
    tmp6 = tmp4 + tmp5           # four incoming gradient branches summed
    tmp7 = tmp6.to(tl.float32)
    tmp9 = tmp8.to(tl.float32)
    tmp11 = tmp9 - tmp10
    # 3.985969387755102e-05 == 1 / 25088 == 1 / (128 * 14 * 14) reduction count
    tmp13 = 3.985969387755102e-05
    tmp14 = tmp12 * tmp13
    tmp16 = tmp15 * tmp15
    tmp17 = tmp14 * tmp16
    tmp18 = tmp11 * tmp17
    tmp19 = tmp7 - tmp18
    tmp21 = tmp20 * tmp13
    tmp22 = tmp19 - tmp21
    tmp24 = tmp15 * tmp23
    tmp25 = tmp22 * tmp24
    tmp26 = tmp25.to(tl.float32)  # downcast result for the fp16 output buffer
    tl.store(out_ptr1 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp26, None)
def get_args():
    # Standalone argument factory used by the benchmarking harness below.
    arg_0 = rand_strided((128, 104, 14, 14), (20384, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 104, 14, 14), (20384, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_2 = rand_strided((128, 104, 14, 14), (20384, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_3 = rand_strided((128, 104, 14, 14), (20384, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_4 = rand_strided((128, 104, 14, 14), (20384, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_5 = rand_strided((1, 104, 1, 1), (104, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    arg_6 = rand_strided((104,), (1,), device='cuda:0', dtype=torch.float32)
    arg_7 = rand_strided((104,), (1,), device='cuda:0', dtype=torch.float32)
    arg_8 = rand_strided((104,), (1,), device='cuda:0', dtype=torch.float32)
    arg_9 = rand_strided((104,), (1,), device='cuda:0', dtype=torch.float32)
    arg_10 = rand_strided((128, 104, 14, 14), (20384, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7, arg_8, arg_9, arg_10,
def call(args):
    # Launch over all 2609152 elements on device 0's current stream.
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__native_batch_norm_legit_functional_add_convolution_backward_native_batch_norm_backward_135.run(*args, 2609152, grid=grid(2609152), stream=stream0)
def benchmark_all_configs(args):
    # Time every autotuning config for this kernel (harness helper).
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__native_batch_norm_legit_functional_add_convolution_backward_native_batch_norm_backward_135.benchmark_all_configs(*args, 2609152, grid=grid(2609152))
if __name__ == '__main__':
    # Micro-benchmark entry point: report runtime and achieved bandwidth.
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/5h/c5hpdahtkfn4l3dhqtwm7jlr542h6muhvdgkqrxb44tec4smg4fm.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_665
# Plain dtype-conversion copy: upcasts a (104, 336, 1, 1) fp16 weight tensor
# to fp32 (34944 = 104 * 336 elements).
triton_poi_fused__to_copy_136 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[65536], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_136(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 34944
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
    # Standalone argument factory used by the benchmarking harness below.
    arg_0 = rand_strided((104, 336, 1, 1), (336, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((104, 336, 1, 1), (336, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1,
def call(args):
    # Launch over all 34944 elements on device 0's current stream.
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_136.run(*args, 34944, grid=grid(34944), stream=stream0)
def benchmark_all_configs(args):
    # Time every autotuning config for this kernel (harness helper).
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_136.benchmark_all_configs(*args, 34944, grid=grid(34944))
if __name__ == '__main__':
    # Micro-benchmark entry point: report runtime and achieved bandwidth.
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/bc/cbch6pezjexqee4mvwdnar33u6mvs47umwhwas6d2blv7733qelg.py
# Original ATen: aten.mul, aten.sigmoid, aten.sigmoid_backward, aten.silu, aten.sum
# aten.mul => mul_928
# aten.sigmoid => sigmoid_19
# aten.sigmoid_backward => convert_element_type_666, convert_element_type_667, convert_element_type_668, mul_930, mul_931, sub_242
# aten.silu => convert_element_type_132, convert_element_type_133, mul_178, sigmoid_17
# aten.sum => sum_105
# Persistent reduction: for each of the 43008 = 128*336 (image, channel)
# pairs, sums grad * silu(x) over the 196 spatial positions, then multiplies
# by the sigmoid derivative s*(1-s) of the per-pair gate value (classic
# squeeze-excitation gate backward).  Writes the result in place into
# in_out_ptr0 (the only mutated argument).
triton_per_fused_mul_sigmoid_sigmoid_backward_silu_sum_137 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import persistent_reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@persistent_reduction(
    size_hints=[65536, 256],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp16', 4: 'i32', 5: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4), equal_to_1=())]}
)
@triton.jit
def triton_per_fused_mul_sigmoid_sigmoid_backward_silu_sum_137(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, xnumel, rnumel, XBLOCK : tl.constexpr):
    # rnumel = 196 (14*14) fits entirely in RBLOCK=256, so the whole
    # reduction is done in one pass (persistent reduction, no r-loop).
    xnumel = 43008
    rnumel = 196
    RBLOCK: tl.constexpr = 256
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rindex = tl.arange(0, RBLOCK)[None, :]
    rmask = rindex < rnumel
    r1 = rindex
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (r1 + (196*x0)), rmask, other=0).to(tl.float32)   # incoming gradient
    tmp1 = tl.load(in_ptr1 + (r1 + (196*x0)), rmask, other=0).to(tl.float32)   # pre-activation x
    tmp11 = tl.load(in_ptr2 + (x0), None).to(tl.float32)                        # gate logit per (image, channel)
    tmp2 = tmp1.to(tl.float32)
    tmp3 = tl.sigmoid(tmp2)
    tmp4 = tmp2 * tmp3           # silu(x) = x * sigmoid(x)
    tmp5 = tmp4.to(tl.float32)
    tmp6 = tmp0 * tmp5
    tmp8 = tl.where(rmask, tmp6, 0)
    tmp9 = tl.sum(tmp8, 1)[:, None]
    tmp10 = tmp9.to(tl.float32)
    tmp12 = tl.sigmoid(tmp11)
    tmp13 = tmp12.to(tl.float32)
    tmp14 = 1.0
    tmp15 = tmp14 - tmp13
    tmp16 = tmp13 * tmp15        # sigmoid'(z) = s * (1 - s)
    tmp17 = tmp10 * tmp16
    tmp18 = tmp17.to(tl.float32)
    tl.store(in_out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp18, None)
def get_args():
    # Standalone argument factory used by the benchmarking harness below.
    arg_0 = rand_strided((128, 336, 1, 1), (336, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 336, 14, 14), (65856, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_2 = rand_strided((128, 336, 14, 14), (65856, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_3 = rand_strided((128, 336, 1, 1), (336, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1, arg_2, arg_3,
def call(args):
    # Launch with xnumel=43008, rnumel=196 on device 0's current stream.
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_per_fused_mul_sigmoid_sigmoid_backward_silu_sum_137.run(*args, 43008, 196, grid=grid(43008), stream=stream0)
def benchmark_all_configs(args):
    # Time every autotuning config for this kernel (harness helper).
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_per_fused_mul_sigmoid_sigmoid_backward_silu_sum_137.benchmark_all_configs(*args, 43008, 196, grid=grid(43008))
if __name__ == '__main__':
    # Micro-benchmark entry point; num_in_out_args=1 because in_out_ptr0 is
    # both read and written, so its bytes count twice for bandwidth.
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/lw/clwggjp2glb6lt7ovt2lc3pzpsocchmesscnc7d3a6zsnanmt43m.py
# Original ATen: aten._to_copy, aten.convolution_backward
# aten._to_copy => convert_element_type_670
# aten.convolution_backward => sum_106
# NOTE(review): TorchInductor-generated persistent reduction. Sums an fp16
# (128, 336, 1, 1) gradient over the batch dimension (rnumel=128) into an
# fp32 (336,) conv-bias gradient; the whole reduction fits in one RBLOCK.
triton_per_fused__to_copy_convolution_backward_138 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import persistent_reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@persistent_reduction(
    size_hints=[512, 128],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32', 3: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3), equal_to_1=())]}
)
@triton.jit
def triton_per_fused__to_copy_convolution_backward_138(in_ptr0, out_ptr1, xnumel, rnumel, XBLOCK : tl.constexpr):
    xnumel = 336
    rnumel = 128
    RBLOCK: tl.constexpr = 128
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rindex = tl.arange(0, RBLOCK)[None, :]
    rmask = rindex < rnumel
    r1 = rindex
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0 + (336*r1)), rmask & xmask, other=0).to(tl.float32)
    tmp2 = tl.where(rmask & xmask, tmp0, 0)
    tmp3 = tl.sum(tmp2, 1)[:, None]
    tmp4 = tmp3.to(tl.float32)
    tl.store(out_ptr1 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp4, xmask)
def get_args():
    arg_0 = rand_strided((128, 336, 1, 1), (336, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_per_fused__to_copy_convolution_backward_138.run(*args, 336, 128, grid=grid(336), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_per_fused__to_copy_convolution_backward_138.benchmark_all_configs(*args, 336, 128, grid=grid(336))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/ny/cnyioznughbctzhfqmivqbazouomcdvadukxtk2rxq6q22lk25nc.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_669
# NOTE(review): TorchInductor-generated pointwise fp16 -> fp32 dtype cast of
# a 4704-element weight tensor (336, 14, 1, 1).
triton_poi_fused__to_copy_139 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[8192], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_139(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 4704
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
    arg_0 = rand_strided((336, 14, 1, 1), (14, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((336, 14, 1, 1), (14, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_139.run(*args, 4704, grid=grid(4704), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_139.benchmark_all_configs(*args, 4704, grid=grid(4704))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/xy/cxy6onpt27zwbftqiiz3rmsdtwpb6ifqj55hnehr2kljkwmm2una.py
# Original ATen: aten.add, aten.clone, aten.fill, aten.mul, aten.sigmoid, aten.sub
# aten.add => add_357
# aten.clone => clone_14
# aten.fill => full_like_33
# aten.mul => mul_932, mul_933, mul_934
# aten.sigmoid => sigmoid_97
# aten.sub => sub_243
# NOTE(review): in-place SiLU backward: grad *= sigmoid(x) * (1 + x*(1 - sigmoid(x)))
# over 1792 elements; in_out_ptr0 is read and overwritten (mutated arg).
triton_poi_fused_add_clone_fill_mul_sigmoid_sub_140 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[2048], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_add_clone_fill_mul_sigmoid_sub_140(in_out_ptr0, in_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 1792
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_out_ptr0 + (x0), xmask).to(tl.float32)
    tmp1 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
    tmp2 = tl.sigmoid(tmp1)
    tmp3 = 1.0
    tmp4 = tmp3 - tmp2
    tmp5 = tmp1 * tmp4
    tmp6 = tmp5 + tmp3
    tmp7 = tmp2 * tmp6
    tmp8 = tmp0 * tmp7
    tl.store(in_out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp8, xmask)
def get_args():
    arg_0 = rand_strided((128, 14, 1, 1), (14, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 14, 1, 1), (14, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused_add_clone_fill_mul_sigmoid_sub_140.run(*args, 1792, grid=grid(1792), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused_add_clone_fill_mul_sigmoid_sub_140.benchmark_all_configs(*args, 1792, grid=grid(1792))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/af/cafrvpfoaadhdlxhldphg5leeyz2enwfb64ifixaudfchrcwkgns.py
# Original ATen: aten._to_copy, aten.convolution_backward
# aten._to_copy => convert_element_type_672
# aten.convolution_backward => sum_107
# NOTE(review): persistent reduction summing an fp16 (128, 14, 1, 1) gradient
# over the batch dimension into an fp32 (14,) conv-bias gradient.
triton_per_fused__to_copy_convolution_backward_141 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import persistent_reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@persistent_reduction(
    size_hints=[16, 128],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32', 3: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 3), equal_to_1=())]}
)
@triton.jit
def triton_per_fused__to_copy_convolution_backward_141(in_ptr0, out_ptr1, xnumel, rnumel, XBLOCK : tl.constexpr):
    xnumel = 14
    rnumel = 128
    RBLOCK: tl.constexpr = 128
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rindex = tl.arange(0, RBLOCK)[None, :]
    rmask = rindex < rnumel
    r1 = rindex
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0 + (14*r1)), rmask & xmask, other=0).to(tl.float32)
    tmp2 = tl.where(rmask & xmask, tmp0, 0)
    tmp3 = tl.sum(tmp2, 1)[:, None]
    tmp4 = tmp3.to(tl.float32)
    tl.store(out_ptr1 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp4, xmask)
def get_args():
    arg_0 = rand_strided((128, 14, 1, 1), (14, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((14,), (1,), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_per_fused__to_copy_convolution_backward_141.run(*args, 14, 128, grid=grid(14), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_per_fused__to_copy_convolution_backward_141.benchmark_all_configs(*args, 14, 128, grid=grid(14))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/ub/cubt2wopxldea56edxrzorl54nnvgwjendwqgv23wj6it7xjbynx.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_671
# NOTE(review): pointwise fp16 -> fp32 dtype cast of a 4704-element weight
# tensor (14, 336, 1, 1); same pattern as kernel 139 with transposed shape.
triton_poi_fused__to_copy_142 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[8192], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_142(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 4704
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
    arg_0 = rand_strided((14, 336, 1, 1), (336, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((14, 336, 1, 1), (336, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_142.run(*args, 4704, grid=grid(4704), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_142.benchmark_all_configs(*args, 4704, grid=grid(4704))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/pw/cpwcwf6uhmujsg7jk3lvfx67kggnjin7oj2locjyrnecqe2jebap.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.add, aten.clone, aten.div, aten.fill, aten.mul, aten.native_batch_norm_backward, aten.sigmoid, aten.sub
# aten._native_batch_norm_legit_functional => convert_element_type_130
# aten.add => add_358, add_359
# aten.clone => clone_13
# aten.div => div_12
# aten.fill => full_like_34
# aten.mul => mul_929, mul_935, mul_936, mul_937
# aten.native_batch_norm_backward => convert_element_type_673, mul_938, mul_946, sub_245, sum_108, sum_109
# aten.sigmoid => sigmoid_19, sigmoid_98
# aten.sub => sub_244
# NOTE(review): per-channel (xnumel=336) reduction over batch*spatial
# (rnumel=128*196=25088). Reconstructs the upstream gradient through an
# SE-style gate (grad*sigmoid(gate) + avgpool-grad/196) and SiLU backward,
# then accumulates sum(grad) and sum(grad*(x-mean)) for BN backward; out_ptr2
# additionally scales the latter by in_ptr6 (presumably inv_std*weight —
# TODO confirm against the caller).
triton_red_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_143 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@reduction(
    size_hints=[512, 32768],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp16', 4: '*fp16', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: '*fp32', 9: '*fp32', 10: 'i32', 11: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_143(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, in_ptr6, out_ptr0, out_ptr1, out_ptr2, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
    xnumel = 336
    rnumel = 25088
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rbase = tl.arange(0, RBLOCK)[None, :]
    x0 = xindex
    _tmp17 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
    tmp20 = tl.load(in_ptr5 + (x0), xmask)
    _tmp23 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
    for roffset in range(0, rnumel, RBLOCK):
        rindex = roffset + rbase
        rmask = rindex < rnumel
        r1 = rindex % 196
        r2 = (rindex // 196)
        tmp0 = tl.load(in_ptr0 + (r1 + (196*x0) + (65856*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp1 = tl.load(in_ptr1 + (x0 + (336*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp4 = tl.load(in_ptr2 + (x0 + (336*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp8 = tl.load(in_ptr3 + (r1 + (196*x0) + (65856*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp18 = tl.load(in_ptr4 + (r1 + (196*x0) + (65856*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp2 = tl.sigmoid(tmp1)
        tmp3 = tmp0 * tmp2
        tmp5 = 196.0
        tmp6 = tmp4 / tmp5
        tmp7 = tmp3 + tmp6
        tmp9 = tl.sigmoid(tmp8)
        tmp10 = 1.0
        tmp11 = tmp10 - tmp9
        tmp12 = tmp8 * tmp11
        tmp13 = tmp12 + tmp10
        tmp14 = tmp9 * tmp13
        tmp15 = tmp7 * tmp14
        tmp16 = tmp15.to(tl.float32)
        _tmp17 = tl.where(rmask & xmask, _tmp17 + tmp16, _tmp17)
        tmp19 = tmp18.to(tl.float32)
        tmp21 = tmp19 - tmp20
        tmp22 = tmp16 * tmp21
        _tmp23 = tl.where(rmask & xmask, _tmp23 + tmp22, _tmp23)
    tmp17 = tl.sum(_tmp17, 1)[:, None]
    tl.store(out_ptr0 + x0, tmp17, xmask)
    tmp23 = tl.sum(_tmp23, 1)[:, None]
    tl.store(out_ptr1 + x0, tmp23, xmask)
    tmp24 = tl.load(in_ptr6 + (x0), xmask)
    tmp25 = tmp23 * tmp24
    tl.store(out_ptr2 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp25, xmask)
def get_args():
    arg_0 = rand_strided((128, 336, 14, 14), (65856, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 336, 1, 1), (336, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    arg_2 = rand_strided((128, 336, 1, 1), (336, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    arg_3 = rand_strided((128, 336, 14, 14), (65856, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_4 = rand_strided((128, 336, 14, 14), (65856, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_5 = rand_strided((1, 336, 1, 1), (336, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    arg_6 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32)
    arg_7 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32)
    arg_8 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32)
    arg_9 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7, arg_8, arg_9,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_red_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_143.run(*args, 336, 25088, grid=grid(336), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_red_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_143.benchmark_all_configs(*args, 336, 25088, grid=grid(336))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/7p/c7psulr34da3bohr63ebglp7ja7h35fmmewyuanlwy56v67tyr3o.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.add, aten.clone, aten.div, aten.fill, aten.mul, aten.native_batch_norm_backward, aten.sigmoid, aten.sub
# aten._native_batch_norm_legit_functional => convert_element_type_130
# aten.add => add_358, add_359
# aten.clone => clone_13
# aten.div => div_12
# aten.fill => full_like_34
# aten.mul => mul_929, mul_935, mul_936, mul_937
# aten.native_batch_norm_backward => convert_element_type_673, mul_944, sub_245, sub_247, sub_248
# aten.sigmoid => sigmoid_19, sigmoid_98
# aten.sub => sub_244
# NOTE(review): pointwise companion of kernel 143 — recomputes the same
# gated/SiLU-backward gradient per element, then applies the BN-backward
# input-gradient formula using the per-channel stats (in_ptr5..in_ptr8).
# 3.985969387755102e-05 == 1/25088, the reciprocal of batch*spatial count.
triton_poi_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_144 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[16777216], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp16', 4: '*fp16', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: '*fp32', 9: '*fp32', 10: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10), equal_to_1=())]})
@triton.jit
def triton_poi_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_144(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, in_ptr6, in_ptr7, in_ptr8, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 8429568
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x3 = xindex
    x4 = (xindex // 196)
    x1 = (xindex // 196) % 336
    tmp0 = tl.load(in_ptr0 + (x3), None).to(tl.float32)
    tmp1 = tl.load(in_ptr1 + (x4), None).to(tl.float32)
    tmp4 = tl.load(in_ptr2 + (x4), None).to(tl.float32)
    tmp8 = tl.load(in_ptr3 + (x3), None).to(tl.float32)
    tmp17 = tl.load(in_ptr4 + (x3), None).to(tl.float32)
    tmp19 = tl.load(in_ptr5 + (x1), None)
    tmp21 = tl.load(in_ptr6 + (x1), None)
    tmp24 = tl.load(in_ptr7 + (x1), None)
    tmp29 = tl.load(in_ptr8 + (x1), None)
    tmp2 = tl.sigmoid(tmp1)
    tmp3 = tmp0 * tmp2
    tmp5 = 196.0
    tmp6 = tmp4 / tmp5
    tmp7 = tmp3 + tmp6
    tmp9 = tl.sigmoid(tmp8)
    tmp10 = 1.0
    tmp11 = tmp10 - tmp9
    tmp12 = tmp8 * tmp11
    tmp13 = tmp12 + tmp10
    tmp14 = tmp9 * tmp13
    tmp15 = tmp7 * tmp14
    tmp16 = tmp15.to(tl.float32)
    tmp18 = tmp17.to(tl.float32)
    tmp20 = tmp18 - tmp19
    tmp22 = 3.985969387755102e-05
    tmp23 = tmp21 * tmp22
    tmp25 = tmp24 * tmp24
    tmp26 = tmp23 * tmp25
    tmp27 = tmp20 * tmp26
    tmp28 = tmp16 - tmp27
    tmp30 = tmp29 * tmp22
    tmp31 = tmp28 - tmp30
    tl.store(out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp31, None)
def get_args():
    arg_0 = rand_strided((128, 336, 14, 14), (65856, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 336, 1, 1), (336, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    arg_2 = rand_strided((128, 336, 1, 1), (336, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    arg_3 = rand_strided((128, 336, 14, 14), (65856, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_4 = rand_strided((128, 336, 14, 14), (65856, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_5 = rand_strided((1, 336, 1, 1), (336, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    arg_6 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32)
    arg_7 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32)
    arg_8 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32)
    arg_9 = rand_strided((128, 336, 14, 14), (65856, 196, 14, 1), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7, arg_8, arg_9,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_144.run(*args, 8429568, grid=grid(8429568), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_144.benchmark_all_configs(*args, 8429568, grid=grid(8429568))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/id/cidvqsl7tausuxdym44zowgtyle4qgcqwkmvhutx6ld6q33q6idp.py
# Original ATen: aten.convolution_backward
# aten.convolution_backward => convolution_backward_104
# NOTE(review): slices channels [224, 336) of the fp32 (128, 336, 14, 14)
# gradient (43904 = 224*196 element offset), scales each channel by
# in_ptr1*in_ptr2 at the same +224 channel offset, and writes the fp16
# (128, 112, 14, 14) slice for the grouped conv backward.
triton_poi_fused_convolution_backward_145 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp32', 2: '*fp32', 3: '*fp16', 4: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4), equal_to_1=())]})
@triton.jit
def triton_poi_fused_convolution_backward_145(in_ptr0, in_ptr1, in_ptr2, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 2809856
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x2 = (xindex // 21952)
    x3 = xindex % 21952
    x1 = (xindex // 196) % 112
    x4 = xindex
    tmp0 = tl.load(in_ptr0 + (43904 + x3 + (65856*x2)), None)
    tmp1 = tl.load(in_ptr1 + (224 + x1), None)
    tmp2 = tl.load(in_ptr2 + (224 + x1), None)
    tmp3 = tmp1 * tmp2
    tmp4 = tmp0 * tmp3
    tmp5 = tmp4.to(tl.float32)
    tl.store(out_ptr0 + (x4 + tl.zeros([XBLOCK], tl.int32)), tmp5, None)
def get_args():
    arg_0 = rand_strided((128, 336, 14, 14), (65856, 196, 14, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32)
    arg_2 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32)
    arg_3 = rand_strided((128, 112, 14, 14), (21952, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1, arg_2, arg_3,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused_convolution_backward_145.run(*args, 2809856, grid=grid(2809856), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused_convolution_backward_145.benchmark_all_configs(*args, 2809856, grid=grid(2809856))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/fd/cfdopkndut6mz6wq6bvbwuuulhnrmhdrv5rzyqs6re2kufkzwbyi.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_676
# NOTE(review): pointwise fp16 -> fp32 dtype cast of a 5488-element depthwise
# conv weight (112, 1, 7, 7).
triton_poi_fused__to_copy_146 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[8192], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_146(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 5488
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
    arg_0 = rand_strided((112, 1, 7, 7), (49, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((112, 1, 7, 7), (49, 49, 7, 1), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_146.run(*args, 5488, grid=grid(5488), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_146.benchmark_all_configs(*args, 5488, grid=grid(5488))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/th/cthuea2iprfudqgjdcjb4l6mtcfp43ogq3yml625vwzkim7u4s2l.py
# Original ATen: aten.convolution_backward
# aten.convolution_backward => convolution_backward_105
# NOTE(review): same pattern as kernel 145 but for channels [112, 224)
# (21952 = 112*196 element offset): slice, scale by in_ptr1*in_ptr2 at the
# +112 channel offset, and emit the fp16 (128, 112, 14, 14) slice.
triton_poi_fused_convolution_backward_147 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp32', 2: '*fp32', 3: '*fp16', 4: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4), equal_to_1=())]})
@triton.jit
def triton_poi_fused_convolution_backward_147(in_ptr0, in_ptr1, in_ptr2, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 2809856
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x2 = (xindex // 21952)
    x3 = xindex % 21952
    x1 = (xindex // 196) % 112
    x4 = xindex
    tmp0 = tl.load(in_ptr0 + (21952 + x3 + (65856*x2)), None)
    tmp1 = tl.load(in_ptr1 + (112 + x1), None)
    tmp2 = tl.load(in_ptr2 + (112 + x1), None)
    tmp3 = tmp1 * tmp2
    tmp4 = tmp0 * tmp3
    tmp5 = tmp4.to(tl.float32)
    tl.store(out_ptr0 + (x4 + tl.zeros([XBLOCK], tl.int32)), tmp5, None)
def get_args():
    arg_0 = rand_strided((128, 336, 14, 14), (65856, 196, 14, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32)
    arg_2 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32)
    arg_3 = rand_strided((128, 112, 14, 14), (21952, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1, arg_2, arg_3,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused_convolution_backward_147.run(*args, 2809856, grid=grid(2809856), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused_convolution_backward_147.benchmark_all_configs(*args, 2809856, grid=grid(2809856))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/3x/c3xs7plvafsbnfj7oqnbllamz4ftcowca37hxufdgln6b4kvegly.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_677
#
# NOTE(review): auto-generated TorchInductor kernel source, compiled lazily by
# async_compile.triton().  Plain dtype-conversion copy of 2800 elements —
# a (112, 1, 5, 5) fp16 tensor (looks like a 5x5 depthwise-conv weight — TODO
# confirm against the enclosing graph) copied to fp32.
triton_poi_fused__to_copy_148 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4096], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_148(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 2800
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
    arg_0 = rand_strided((112, 1, 5, 5), (25, 25, 5, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((112, 1, 5, 5), (25, 25, 5, 1), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_148.run(*args, 2800, grid=grid(2800), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_148.benchmark_all_configs(*args, 2800, grid=grid(2800))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/zr/czrwimvnigaum3atz3dnqh3vpqefjj27el4vvkm52xud7pku2qdu.py
# Original ATen: aten.convolution_backward
# aten.convolution_backward => convolution_backward_106
#
# NOTE(review): auto-generated TorchInductor kernel source, compiled lazily by
# async_compile.triton().  Companion to ..._147 above but for the FIRST
# 112-channel slice (no `+21952` / `+112` offsets): reads channels [0, 112) of
# a (128, 336, 14, 14) fp32 input, scales per channel by
# in_ptr1[c] * in_ptr2[c], and stores to a contiguous (128, 112, 14, 14)
# fp16 output.
triton_poi_fused_convolution_backward_149 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp32', 2: '*fp32', 3: '*fp16', 4: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4), equal_to_1=())]})
@triton.jit
def triton_poi_fused_convolution_backward_149(in_ptr0, in_ptr1, in_ptr2, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 2809856
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x2 = (xindex // 21952)
    x3 = xindex % 21952
    x1 = (xindex // 196) % 112
    x4 = xindex
    tmp0 = tl.load(in_ptr0 + (x3 + (65856*x2)), None)
    tmp1 = tl.load(in_ptr1 + (x1), None)
    tmp2 = tl.load(in_ptr2 + (x1), None)
    tmp3 = tmp1 * tmp2
    tmp4 = tmp0 * tmp3
    tmp5 = tmp4.to(tl.float32)
    tl.store(out_ptr0 + (x4 + tl.zeros([XBLOCK], tl.int32)), tmp5, None)
def get_args():
    arg_0 = rand_strided((128, 336, 14, 14), (65856, 196, 14, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32)
    arg_2 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32)
    arg_3 = rand_strided((128, 112, 14, 14), (21952, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1, arg_2, arg_3,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused_convolution_backward_149.run(*args, 2809856, grid=grid(2809856), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused_convolution_backward_149.benchmark_all_configs(*args, 2809856, grid=grid(2809856))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/2v/c2volwa6uwcetttnruu5jjo66mjrobvup6mcg6k7n4aknrx3iggt.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_678
#
# NOTE(review): auto-generated TorchInductor kernel source, compiled lazily by
# async_compile.triton().  Plain dtype-conversion copy of 1008 elements —
# a (112, 1, 3, 3) fp16 tensor (presumably a 3x3 depthwise-conv weight — TODO
# confirm) copied to fp32.
triton_poi_fused__to_copy_150 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[1024], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_150(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 1008
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
    arg_0 = rand_strided((112, 1, 3, 3), (9, 9, 3, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((112, 1, 3, 3), (9, 9, 3, 1), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_150.run(*args, 1008, grid=grid(1008), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_150.benchmark_all_configs(*args, 1008, grid=grid(1008))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/we/cwefssxf5bvvy2y5bt7gv2qzizpcib7seqbqn2voklitwa4hshmd.py
# Original ATen: aten.cat
# aten.cat => cat_66
#
# NOTE(review): auto-generated TorchInductor kernel source, compiled lazily by
# async_compile.triton().  Implements one branch of a channel concatenation:
# copies a contiguous (128, 112, 28, 28) fp16 tensor (87808 = 112*28*28
# elements per batch) into the first channel slice of a (128, 336, 28, 28)
# fp16 destination whose batch stride is 263424 = 336*28*28.  Sibling kernels
# (not shown here) presumably fill the remaining channel slices.
triton_poi_fused_cat_151 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[16777216], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_cat_151(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 11239424
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x2 = xindex
    x0 = xindex % 87808
    x1 = (xindex // 87808)
    tmp0 = tl.load(in_ptr0 + (x2), None).to(tl.float32)
    tl.store(out_ptr0 + (x0 + (263424*x1) + tl.zeros([XBLOCK], tl.int32)), tmp0, None)
def get_args():
    arg_0 = rand_strided((128, 112, 28, 28), (87808, 784, 28, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 112, 28, 28), (263424, 784, 28, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused_cat_151.run(*args, 11239424, grid=grid(11239424), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused_cat_151.benchmark_all_configs(*args, 11239424, grid=grid(11239424))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/qf/cqfo4ck6nf3trupktl4ixfnhezf4tcvnlhrfn63fekgvbdyrr6da.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.mul, aten.native_batch_norm_backward
# aten._native_batch_norm_legit_functional => convert_element_type_123
# aten.mul => mul_949
# aten.native_batch_norm_backward => convert_element_type_679, mul_950, mul_958, sub_250, sum_110, sum_111
#
# NOTE(review): auto-generated TorchInductor reduction kernel, compiled lazily
# by async_compile.triton().  One program per channel (xnumel=336); each
# reduces over the 100352 = 128*28*28 batch*spatial positions of a
# (128, 336, 28, 28) layout:
#   out_ptr0[c] = sum(in_ptr0 * in_ptr1)                     (grad-bias term)
#   out_ptr1[c] = sum(in_ptr0 * in_ptr1 * (in_ptr2 - mean))  (grad-weight term,
#                 mean per channel from in_ptr3)
#   out_ptr2[c] = out_ptr1[c] * in_ptr4[c]  — in_ptr4 is presumably the
#                 per-channel inv-std; confirm against the producing graph.
triton_red_fused__native_batch_norm_legit_functional_mul_native_batch_norm_backward_152 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@reduction(
    size_hints=[512, 131072],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp32', 4: '*fp32', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: 'i32', 9: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_mul_native_batch_norm_backward_152(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, out_ptr0, out_ptr1, out_ptr2, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
    xnumel = 336
    rnumel = 100352
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rbase = tl.arange(0, RBLOCK)[None, :]
    x0 = xindex
    _tmp4 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
    tmp7 = tl.load(in_ptr3 + (x0), xmask)
    _tmp10 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
    for roffset in range(0, rnumel, RBLOCK):
        rindex = roffset + rbase
        rmask = rindex < rnumel
        r1 = rindex % 784
        r2 = (rindex // 784)
        tmp0 = tl.load(in_ptr0 + (r1 + (784*x0) + (263424*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp1 = tl.load(in_ptr1 + (r1 + (784*x0) + (263424*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp5 = tl.load(in_ptr2 + (r1 + (784*x0) + (263424*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp2 = tmp0 * tmp1
        tmp3 = tmp2.to(tl.float32)
        _tmp4 = tl.where(rmask & xmask, _tmp4 + tmp3, _tmp4)
        tmp6 = tmp5.to(tl.float32)
        tmp8 = tmp6 - tmp7
        tmp9 = tmp3 * tmp8
        _tmp10 = tl.where(rmask & xmask, _tmp10 + tmp9, _tmp10)
    tmp4 = tl.sum(_tmp4, 1)[:, None]
    tl.store(out_ptr0 + x0, tmp4, xmask)
    tmp10 = tl.sum(_tmp10, 1)[:, None]
    tl.store(out_ptr1 + x0, tmp10, xmask)
    tmp11 = tl.load(in_ptr4 + (x0), xmask)
    tmp12 = tmp10 * tmp11
    tl.store(out_ptr2 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp12, xmask)
def get_args():
    arg_0 = rand_strided((128, 336, 28, 28), (263424, 784, 28, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 336, 28, 28), (263424, 784, 28, 1), device='cuda:0', dtype=torch.float16)
    arg_2 = rand_strided((128, 336, 28, 28), (263424, 784, 28, 1), device='cuda:0', dtype=torch.float16)
    arg_3 = rand_strided((1, 336, 1, 1), (336, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    arg_4 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32)
    arg_5 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32)
    arg_6 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32)
    arg_7 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_red_fused__native_batch_norm_legit_functional_mul_native_batch_norm_backward_152.run(*args, 336, 100352, grid=grid(336), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_red_fused__native_batch_norm_legit_functional_mul_native_batch_norm_backward_152.benchmark_all_configs(*args, 336, 100352, grid=grid(336))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/qc/cqcwwcxya4t45mgfdz46w2lkm3l3dt6ws7mawrdyrpvgbmqxgblv.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.convolution_backward, aten.mul, aten.native_batch_norm_backward
# aten._native_batch_norm_legit_functional => convert_element_type_123
# aten.convolution_backward => convolution_backward_107
# aten.mul => mul_949
# aten.native_batch_norm_backward => convert_element_type_679, convert_element_type_681, mul_956, mul_957, sub_250, sub_252, sub_253
#
# NOTE(review): auto-generated TorchInductor kernel source, compiled lazily by
# async_compile.triton().  Elementwise batch-norm-backward grad_input over
# 33718272 = 128*336*784 elements, computed IN PLACE (in_out_ptr0 is both the
# incoming grad and the result; note 'mutated_arg_names').  Per element:
#   g   = in_out * in_ptr0                      (grad * fused multiplier)
#   out = (g - (x - mean) * (sum_dy_xmu/N) * invstd^2 - sum_dy/N) * invstd * w
# where 9.964923469387754e-06 == 1/100352 is the 1/N factor baked in by the
# compiler, and in_ptr2..in_ptr6 carry the per-channel mean / reduced sums /
# invstd / weight (exact role of each pointer inferred from the standard BN
# backward formula — confirm against the producing graph if it matters).
triton_poi_fused__native_batch_norm_legit_functional_convolution_backward_mul_native_batch_norm_backward_153 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[67108864], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp32', 4: '*fp32', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8), equal_to_1=())]})
@triton.jit
def triton_poi_fused__native_batch_norm_legit_functional_convolution_backward_mul_native_batch_norm_backward_153(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, in_ptr6, xnumel, XBLOCK : tl.constexpr):
    xnumel = 33718272
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x3 = xindex
    x1 = (xindex // 784) % 336
    tmp0 = tl.load(in_out_ptr0 + (x3), None).to(tl.float32)
    tmp1 = tl.load(in_ptr0 + (x3), None).to(tl.float32)
    tmp4 = tl.load(in_ptr1 + (x3), None).to(tl.float32)
    tmp6 = tl.load(in_ptr2 + (x1), None)
    tmp8 = tl.load(in_ptr3 + (x1), None)
    tmp11 = tl.load(in_ptr4 + (x1), None)
    tmp16 = tl.load(in_ptr5 + (x1), None)
    tmp19 = tl.load(in_ptr6 + (x1), None)
    tmp2 = tmp0 * tmp1
    tmp3 = tmp2.to(tl.float32)
    tmp5 = tmp4.to(tl.float32)
    tmp7 = tmp5 - tmp6
    tmp9 = 9.964923469387754e-06
    tmp10 = tmp8 * tmp9
    tmp12 = tmp11 * tmp11
    tmp13 = tmp10 * tmp12
    tmp14 = tmp7 * tmp13
    tmp15 = tmp3 - tmp14
    tmp17 = tmp16 * tmp9
    tmp18 = tmp15 - tmp17
    tmp20 = tmp11 * tmp19
    tmp21 = tmp18 * tmp20
    tmp22 = tmp21.to(tl.float32)
    tl.store(in_out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp22, None)
def get_args():
    arg_0 = rand_strided((128, 336, 28, 28), (263424, 784, 28, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 336, 28, 28), (263424, 784, 28, 1), device='cuda:0', dtype=torch.float16)
    arg_2 = rand_strided((128, 336, 28, 28), (263424, 784, 28, 1), device='cuda:0', dtype=torch.float16)
    arg_3 = rand_strided((1, 336, 1, 1), (336, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    arg_4 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32)
    arg_5 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32)
    arg_6 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32)
    arg_7 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__native_batch_norm_legit_functional_convolution_backward_mul_native_batch_norm_backward_153.run(*args, 33718272, grid=grid(33718272), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__native_batch_norm_legit_functional_convolution_backward_mul_native_batch_norm_backward_153.benchmark_all_configs(*args, 33718272, grid=grid(33718272))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/xp/cxpyb2d7tgj3ajjcfk5ruyhc5u2uszkeh76gef4njgyf5kz5ukth.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_682
#
# NOTE(review): auto-generated TorchInductor kernel source, compiled lazily by
# async_compile.triton().  Plain dtype-conversion copy of 18816 elements —
# a (336, 56, 1, 1) fp16 tensor (looks like a 1x1 pointwise-conv weight — TODO
# confirm) copied to fp32.
triton_poi_fused__to_copy_154 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[32768], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_154(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 18816
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
    arg_0 = rand_strided((336, 56, 1, 1), (56, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((336, 56, 1, 1), (56, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_154.run(*args, 18816, grid=grid(18816), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_154.benchmark_all_configs(*args, 18816, grid=grid(18816))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/ku/cku7ek7gtwfrqcc34nfk6i7wvwqtzc57tln743w3jpx2qsiyexs7.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.native_batch_norm_backward
# aten._native_batch_norm_legit_functional => convert_element_type_120
# aten.native_batch_norm_backward => convert_element_type_683, mul_959, sub_254, sum_112, sum_113
#
# NOTE(review): auto-generated TorchInductor SPLIT reduction for batch-norm
# backward on a (128, 56, 28, 28) fp16 pair.  The 100352-long per-channel
# reduction is split into 13 chunks of 7720 (xnumel = 728 = 56 channels * 13
# splits); the `tmp0 < 100352` guard masks the over-allocated tail of the last
# split.  Produces (56, 13) partial sums of grad and of grad * (x - mean)
# into out_ptr0/out_ptr1; kernels _156/_157 below finish the reduction over
# the 13 splits.
triton_red_fused__native_batch_norm_legit_functional_native_batch_norm_backward_155 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@reduction(
    size_hints=[1024, 8192],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: 'i32', 6: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_native_batch_norm_backward_155(in_ptr0, in_ptr1, in_ptr2, out_ptr0, out_ptr1, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
    xnumel = 728
    rnumel = 7720
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rbase = tl.arange(0, RBLOCK)[None, :]
    x1 = (xindex // 56)
    x0 = xindex % 56
    _tmp6 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
    x3 = xindex
    _tmp13 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
    for roffset in range(0, rnumel, RBLOCK):
        rindex = roffset + rbase
        rmask = rindex < rnumel
        r2 = rindex
        tmp0 = r2 + (7720*x1)
        tmp1 = 100352
        tmp2 = tmp0 < tmp1
        tmp3 = tl.load(in_ptr0 + ((784*x0) + (43904*(((r2 + (7720*x1)) // 784) % 128)) + ((r2 + (7720*x1)) % 784) + tl.zeros([XBLOCK, RBLOCK], tl.int32)), rmask & tmp2 & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp4 = tmp3.to(tl.float32)
        tmp5 = tl.where(tmp2, tmp4, 0)
        _tmp6 = tl.where(rmask & xmask, _tmp6 + tmp5, _tmp6)
        tmp7 = tl.load(in_ptr1 + ((784*x0) + (43904*(((r2 + (7720*x1)) // 784) % 128)) + ((r2 + (7720*x1)) % 784) + tl.zeros([XBLOCK, RBLOCK], tl.int32)), rmask & tmp2 & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp8 = tmp7.to(tl.float32)
        tmp9 = tl.load(in_ptr2 + (x0 + tl.zeros([XBLOCK, RBLOCK], tl.int32)), rmask & tmp2 & xmask, eviction_policy='evict_last', other=0)
        tmp10 = tmp8 - tmp9
        tmp11 = tmp4 * tmp10
        tmp12 = tl.where(tmp2, tmp11, 0)
        _tmp13 = tl.where(rmask & xmask, _tmp13 + tmp12, _tmp13)
    tmp6 = tl.sum(_tmp6, 1)[:, None]
    tl.store(out_ptr0 + x3, tmp6, xmask)
    tmp13 = tl.sum(_tmp13, 1)[:, None]
    tl.store(out_ptr1 + x3, tmp13, xmask)
def get_args():
    arg_0 = rand_strided((128, 56, 28, 28), (43904, 784, 28, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 56, 28, 28), (43904, 784, 28, 1), device='cuda:0', dtype=torch.float16)
    arg_2 = rand_strided((1, 56, 1, 1), (56, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    arg_3 = rand_strided((56, 13), (1, 56), device='cuda:0', dtype=torch.float32)
    arg_4 = rand_strided((56, 13), (1, 56), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1, arg_2, arg_3, arg_4,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_red_fused__native_batch_norm_legit_functional_native_batch_norm_backward_155.run(*args, 728, 7720, grid=grid(728), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_red_fused__native_batch_norm_legit_functional_native_batch_norm_backward_155.benchmark_all_configs(*args, 728, 7720, grid=grid(728))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/uy/cuyabiwct4xwna22y54u6ef6hgcysgudgvw53xgcxhoetvhdud3c.py
# Original ATen: aten.native_batch_norm_backward
# aten.native_batch_norm_backward => convert_element_type_683, sum_112
#
# NOTE(review): auto-generated TorchInductor kernel source, compiled lazily by
# async_compile.triton().  Second stage of the split reduction produced by
# kernel _155: sums the (56, 13) fp32 partials over the 13-split axis
# (persistent reduction, RBLOCK=16 padded past rnumel=13 and masked) to a
# (56,) per-channel total — the grad-bias term of BN backward.
triton_per_fused_native_batch_norm_backward_156 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import persistent_reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@persistent_reduction(
    size_hints=[64, 16],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*fp32', 1: '*fp32', 2: 'i32', 3: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1), equal_to_1=())]}
)
@triton.jit
def triton_per_fused_native_batch_norm_backward_156(in_ptr0, out_ptr0, xnumel, rnumel, XBLOCK : tl.constexpr):
    xnumel = 56
    rnumel = 13
    RBLOCK: tl.constexpr = 16
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rindex = tl.arange(0, RBLOCK)[None, :]
    rmask = rindex < rnumel
    r1 = rindex
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0 + (56*r1)), rmask & xmask, other=0)
    tmp2 = tl.where(rmask & xmask, tmp0, 0)
    tmp3 = tl.sum(tmp2, 1)[:, None]
    tl.store(out_ptr0 + x0, tmp3, xmask)
def get_args():
    arg_0 = rand_strided((56, 13), (1, 56), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((56,), (1,), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_per_fused_native_batch_norm_backward_156.run(*args, 56, 13, grid=grid(56), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_per_fused_native_batch_norm_backward_156.benchmark_all_configs(*args, 56, 13, grid=grid(56))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/on/convc5p74ddudqk46e52onvmx3rxjfktebgzjuhxneits4eo7nyc.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.native_batch_norm_backward
# aten._native_batch_norm_legit_functional => convert_element_type_120
# aten.native_batch_norm_backward => convert_element_type_683, mul_959, mul_967, sub_254, sum_113
#
# NOTE(review): TorchInductor-generated persistent-reduction kernel for the
# batch-norm backward pass.  It reduces a (56, 13) fp32 partial buffer over its
# 13 reduction rows (stored to out_ptr0) and also writes the same sum scaled by
# a per-channel factor loaded from in_ptr1 (stored to out_ptr1).  The kernel
# source is kept verbatim inside the triple-quoted string and compiled lazily
# by async_compile (defined at the top of this file).
triton_per_fused__native_batch_norm_legit_functional_native_batch_norm_backward_157 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import persistent_reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@persistent_reduction(
    size_hints=[64, 16],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*fp32', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: 'i32', 5: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3), equal_to_1=())]}
)
@triton.jit
def triton_per_fused__native_batch_norm_legit_functional_native_batch_norm_backward_157(in_ptr0, in_ptr1, out_ptr0, out_ptr1, xnumel, rnumel, XBLOCK : tl.constexpr):
    xnumel = 56
    rnumel = 13
    RBLOCK: tl.constexpr = 16
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rindex = tl.arange(0, RBLOCK)[None, :]
    rmask = rindex < rnumel
    r1 = rindex
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0 + (56*r1)), rmask & xmask, other=0)
    tmp4 = tl.load(in_ptr1 + (x0), xmask)
    tmp2 = tl.where(rmask & xmask, tmp0, 0)
    tmp3 = tl.sum(tmp2, 1)[:, None]
    tmp5 = tmp3 * tmp4
    tl.store(out_ptr1 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp5, xmask)
    tl.store(out_ptr0 + x0, tmp3, xmask)

def get_args():
    arg_0 = rand_strided((56, 13), (1, 56), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((56,), (1,), device='cuda:0', dtype=torch.float32)
    arg_2 = rand_strided((56,), (1,), device='cuda:0', dtype=torch.float32)
    arg_3 = rand_strided((56,), (1,), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1, arg_2, arg_3,

def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_per_fused__native_batch_norm_legit_functional_native_batch_norm_backward_157.run(*args, 56, 13, grid=grid(56), stream=stream0)

def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_per_fused__native_batch_norm_legit_functional_native_batch_norm_backward_157.benchmark_all_configs(*args, 56, 13, grid=grid(56))

if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/bj/cbjvyothjvn62eovj2qvbnhqdnxtnyarsgyu3rshwon5wrxfm6me.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.native_batch_norm_backward
# aten._native_batch_norm_legit_functional => convert_element_type_120
# aten.native_batch_norm_backward => convert_element_type_683, convert_element_type_685, mul_965, mul_966, sub_254, sub_256, sub_257
#
# NOTE(review): TorchInductor-generated pointwise kernel computing the
# input-gradient of a batch-norm backward over a (128, 56, 28, 28) fp16
# tensor (5619712 elements).  Per-channel statistics (mean/sum/scale terms,
# in_ptr2..in_ptr6) are broadcast along x1 = (xindex // 784) % 56; the literal
# 9.964923469387754e-06 is presumably 1/N for the reduction — verify against
# the forward graph.  Output is cast back to fp16.
triton_poi_fused__native_batch_norm_legit_functional_native_batch_norm_backward_158 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[8388608], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp32', 6: '*fp32', 7: '*fp16', 8: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8), equal_to_1=())]})
@triton.jit
def triton_poi_fused__native_batch_norm_legit_functional_native_batch_norm_backward_158(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, in_ptr6, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 5619712
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x3 = xindex
    x1 = (xindex // 784) % 56
    tmp0 = tl.load(in_ptr0 + (x3), None).to(tl.float32)
    tmp2 = tl.load(in_ptr1 + (x3), None).to(tl.float32)
    tmp4 = tl.load(in_ptr2 + (x1), None)
    tmp6 = tl.load(in_ptr3 + (x1), None)
    tmp9 = tl.load(in_ptr4 + (x1), None)
    tmp14 = tl.load(in_ptr5 + (x1), None)
    tmp17 = tl.load(in_ptr6 + (x1), None)
    tmp1 = tmp0.to(tl.float32)
    tmp3 = tmp2.to(tl.float32)
    tmp5 = tmp3 - tmp4
    tmp7 = 9.964923469387754e-06
    tmp8 = tmp6 * tmp7
    tmp10 = tmp9 * tmp9
    tmp11 = tmp8 * tmp10
    tmp12 = tmp5 * tmp11
    tmp13 = tmp1 - tmp12
    tmp15 = tmp14 * tmp7
    tmp16 = tmp13 - tmp15
    tmp18 = tmp9 * tmp17
    tmp19 = tmp16 * tmp18
    tmp20 = tmp19.to(tl.float32)
    tl.store(out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp20, None)

def get_args():
    arg_0 = rand_strided((128, 56, 28, 28), (43904, 784, 28, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 56, 28, 28), (43904, 784, 28, 1), device='cuda:0', dtype=torch.float16)
    arg_2 = rand_strided((1, 56, 1, 1), (56, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    arg_3 = rand_strided((56,), (1,), device='cuda:0', dtype=torch.float32)
    arg_4 = rand_strided((56,), (1,), device='cuda:0', dtype=torch.float32)
    arg_5 = rand_strided((56,), (1,), device='cuda:0', dtype=torch.float32)
    arg_6 = rand_strided((56,), (1,), device='cuda:0', dtype=torch.float32)
    arg_7 = rand_strided((128, 56, 28, 28), (43904, 784, 28, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7,

def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__native_batch_norm_legit_functional_native_batch_norm_backward_158.run(*args, 5619712, grid=grid(5619712), stream=stream0)

def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__native_batch_norm_legit_functional_native_batch_norm_backward_158.benchmark_all_configs(*args, 5619712, grid=grid(5619712))

if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/ip/cipdkjyk2onxl3m3jynzjqksmm63qjqd3qbbbq47kgdhlig5ktev.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_686
#
# NOTE(review): TorchInductor-generated pointwise dtype-conversion kernel:
# copies 4704 elements (a (28, 168, 1, 1) weight tensor per get_args) from
# fp16 to fp32.
triton_poi_fused__to_copy_159 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[8192], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_159(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 4704
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)

def get_args():
    arg_0 = rand_strided((28, 168, 1, 1), (168, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((28, 168, 1, 1), (168, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1,

def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_159.run(*args, 4704, grid=grid(4704), stream=stream0)

def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_159.benchmark_all_configs(*args, 4704, grid=grid(4704))

if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/hk/chko7h4vzts7m5h2r3fcixp646xti7wwu6pppgto4mg3xqc3ieno.py
# Original ATen: aten.cat
# aten.cat => cat_67
#
# NOTE(review): TorchInductor-generated copy kernel implementing one branch of
# a channel concatenation: a contiguous (128, 168, 28, 28) fp16 tensor is
# written into an output whose batch stride is doubled (263424 vs 131712),
# i.e. into one half of the concatenated result.
triton_poi_fused_cat_160 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[33554432], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_cat_160(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 16859136
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x2 = xindex
    x0 = xindex % 131712
    x1 = (xindex // 131712)
    tmp0 = tl.load(in_ptr0 + (x2), None).to(tl.float32)
    tl.store(out_ptr0 + (x0 + (263424*x1) + tl.zeros([XBLOCK], tl.int32)), tmp0, None)

def get_args():
    arg_0 = rand_strided((128, 168, 28, 28), (131712, 784, 28, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 168, 28, 28), (263424, 784, 28, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,

def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused_cat_160.run(*args, 16859136, grid=grid(16859136), stream=stream0)

def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused_cat_160.benchmark_all_configs(*args, 16859136, grid=grid(16859136))

if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/wf/cwfwcfiqllapezhy4wsh7mb3i7a2codkip2cnbeie26v347spvpq.py
# Original ATen: aten.mul, aten.sigmoid, aten.sigmoid_backward, aten.silu, aten.sum
# aten.mul => mul_968
# aten.sigmoid => sigmoid_15
# aten.sigmoid_backward => convert_element_type_688, convert_element_type_689, convert_element_type_690, mul_970, mul_971, sub_258
# aten.silu => convert_element_type_110, convert_element_type_111, mul_153, sigmoid_13
# aten.sum => sum_114
#
# NOTE(review): TorchInductor-generated persistent reduction: for each of the
# 43008 (batch x channel) rows it sums grad * silu(x) over the 784 spatial
# positions, then multiplies the row sum by sigmoid(s)*(1 - sigmoid(s)) of a
# per-row scalar (in_ptr2) — a fused sigmoid_backward.  The result overwrites
# in_out_ptr0 in place ('mutated_arg_names': ['in_out_ptr0']).
triton_per_fused_mul_sigmoid_sigmoid_backward_silu_sum_161 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import persistent_reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@persistent_reduction(
    size_hints=[65536, 1024],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp16', 4: 'i32', 5: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5), equal_to_1=())]}
)
@triton.jit
def triton_per_fused_mul_sigmoid_sigmoid_backward_silu_sum_161(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, xnumel, rnumel, XBLOCK : tl.constexpr):
    xnumel = 43008
    rnumel = 784
    RBLOCK: tl.constexpr = 1024
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rindex = tl.arange(0, RBLOCK)[None, :]
    rmask = rindex < rnumel
    r1 = rindex
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (r1 + (784*x0)), rmask, other=0).to(tl.float32)
    tmp1 = tl.load(in_ptr1 + (r1 + (784*x0)), rmask, other=0).to(tl.float32)
    tmp11 = tl.load(in_ptr2 + (x0), None).to(tl.float32)
    tmp2 = tmp1.to(tl.float32)
    tmp3 = tl.sigmoid(tmp2)
    tmp4 = tmp2 * tmp3
    tmp5 = tmp4.to(tl.float32)
    tmp6 = tmp0 * tmp5
    tmp8 = tl.where(rmask, tmp6, 0)
    tmp9 = tl.sum(tmp8, 1)[:, None]
    tmp10 = tmp9.to(tl.float32)
    tmp12 = tl.sigmoid(tmp11)
    tmp13 = tmp12.to(tl.float32)
    tmp14 = 1.0
    tmp15 = tmp14 - tmp13
    tmp16 = tmp13 * tmp15
    tmp17 = tmp10 * tmp16
    tmp18 = tmp17.to(tl.float32)
    tl.store(in_out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp18, None)

def get_args():
    arg_0 = rand_strided((128, 336, 1, 1), (336, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 336, 28, 28), (263424, 784, 28, 1), device='cuda:0', dtype=torch.float16)
    arg_2 = rand_strided((128, 336, 28, 28), (263424, 784, 28, 1), device='cuda:0', dtype=torch.float16)
    arg_3 = rand_strided((128, 336, 1, 1), (336, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1, arg_2, arg_3,

def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_per_fused_mul_sigmoid_sigmoid_backward_silu_sum_161.run(*args, 43008, 784, grid=grid(43008), stream=stream0)

def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_per_fused_mul_sigmoid_sigmoid_backward_silu_sum_161.benchmark_all_configs(*args, 43008, 784, grid=grid(43008))

if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/h6/ch66mav6cpspgg34bc2ckyogaizco43c7rwo2n4rkshr5666vpid.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_691
#
# NOTE(review): TorchInductor-generated pointwise dtype-conversion kernel:
# copies 9408 elements (a (336, 28, 1, 1) weight tensor per get_args) from
# fp16 to fp32.
triton_poi_fused__to_copy_162 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[16384], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_162(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 9408
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)

def get_args():
    arg_0 = rand_strided((336, 28, 1, 1), (28, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((336, 28, 1, 1), (28, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1,

def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_162.run(*args, 9408, grid=grid(9408), stream=stream0)

def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_162.benchmark_all_configs(*args, 9408, grid=grid(9408))

if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/fi/cfibawhd6jznqxl7q2atcugp7zx43kt5jtkntbghy42ljr2ytlun.py
# Original ATen: aten.add, aten.clone, aten.fill, aten.mul, aten.sigmoid, aten.sub
# aten.add => add_361
# aten.clone => clone_11
# aten.fill => full_like_36
# aten.mul => mul_972, mul_973, mul_974
# aten.sigmoid => sigmoid_100
# aten.sub => sub_259
#
# NOTE(review): TorchInductor-generated in-place pointwise kernel over 3584
# elements ((128, 28, 1, 1) fp16): multiplies the incoming gradient in
# in_out_ptr0 by sigmoid(x) * (1 + x * (1 - sigmoid(x))) — the derivative of
# SiLU evaluated at x = in_ptr0 — and writes the product back to in_out_ptr0.
triton_poi_fused_add_clone_fill_mul_sigmoid_sub_163 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[4096], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_add_clone_fill_mul_sigmoid_sub_163(in_out_ptr0, in_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 3584
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_out_ptr0 + (x0), xmask).to(tl.float32)
    tmp1 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
    tmp2 = tl.sigmoid(tmp1)
    tmp3 = 1.0
    tmp4 = tmp3 - tmp2
    tmp5 = tmp1 * tmp4
    tmp6 = tmp5 + tmp3
    tmp7 = tmp2 * tmp6
    tmp8 = tmp0 * tmp7
    tl.store(in_out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp8, xmask)

def get_args():
    arg_0 = rand_strided((128, 28, 1, 1), (28, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 28, 1, 1), (28, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,

def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused_add_clone_fill_mul_sigmoid_sub_163.run(*args, 3584, grid=grid(3584), stream=stream0)

def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused_add_clone_fill_mul_sigmoid_sub_163.benchmark_all_configs(*args, 3584, grid=grid(3584))

if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/yv/cyvint5v6fejjcxz54v5axwxdzytfpjmq7eoqsppyu45zsqim3l2.py
# Original ATen: aten._to_copy, aten.convolution_backward
# aten._to_copy => convert_element_type_694
# aten.convolution_backward => sum_116
#
# NOTE(review): TorchInductor-generated persistent reduction computing a
# per-channel bias gradient: sums a (128, 28, 1, 1) fp16 gradient over the
# batch dimension (128) and stores the result as fp32 into a (28,) output.
triton_per_fused__to_copy_convolution_backward_164 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import persistent_reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@persistent_reduction(
    size_hints=[32, 128],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32', 3: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 3), equal_to_1=())]}
)
@triton.jit
def triton_per_fused__to_copy_convolution_backward_164(in_ptr0, out_ptr1, xnumel, rnumel, XBLOCK : tl.constexpr):
    xnumel = 28
    rnumel = 128
    RBLOCK: tl.constexpr = 128
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rindex = tl.arange(0, RBLOCK)[None, :]
    rmask = rindex < rnumel
    r1 = rindex
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0 + (28*r1)), rmask & xmask, other=0).to(tl.float32)
    tmp2 = tl.where(rmask & xmask, tmp0, 0)
    tmp3 = tl.sum(tmp2, 1)[:, None]
    tmp4 = tmp3.to(tl.float32)
    tl.store(out_ptr1 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp4, xmask)

def get_args():
    arg_0 = rand_strided((128, 28, 1, 1), (28, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((28,), (1,), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1,

def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_per_fused__to_copy_convolution_backward_164.run(*args, 28, 128, grid=grid(28), stream=stream0)

def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_per_fused__to_copy_convolution_backward_164.benchmark_all_configs(*args, 28, 128, grid=grid(28))

if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/pk/cpkfpj7moawggcqkv5ok3cwxbz2nv6myxthuduj53xnw6qw3dwig.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_693
#
# NOTE(review): TorchInductor-generated pointwise dtype-conversion kernel:
# copies 9408 elements (a (28, 336, 1, 1) weight tensor per get_args) from
# fp16 to fp32.
triton_poi_fused__to_copy_165 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[16384], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_165(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 9408
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)

def get_args():
    arg_0 = rand_strided((28, 336, 1, 1), (336, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((28, 336, 1, 1), (336, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1,

def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_165.run(*args, 9408, grid=grid(9408), stream=stream0)

def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_165.benchmark_all_configs(*args, 9408, grid=grid(9408))

if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/q7/cq7zl54kvzo64w2eq2dfuf5wrsv6grvd6fo3lgq77a5kcc7cs6ek.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.add, aten.clone, aten.div, aten.fill, aten.mul, aten.native_batch_norm_backward, aten.sigmoid, aten.sub
# aten._native_batch_norm_legit_functional => convert_element_type_108
# aten.add => add_362, add_363
# aten.clone => clone_10
# aten.div => div_13
# aten.fill => full_like_37
# aten.mul => mul_969, mul_975, mul_976, mul_977
# aten.native_batch_norm_backward => convert_element_type_695, mul_978, mul_986, sub_261, sum_117, sum_118
# aten.sigmoid => sigmoid_101, sigmoid_15
# aten.sub => sub_260
#
# NOTE(review): TorchInductor-generated looped reduction over 100352
# (batch * spatial) elements per channel (336 channels).  Inside the loop it
# reconstructs the upstream gradient (SE-style: grad * sigmoid(gate) +
# broadcast/784, then times the SiLU derivative of in_ptr3) and accumulates
# two channel reductions: the plain gradient sum (_tmp17 -> out_ptr0) and the
# gradient-times-(x - mean) sum (_tmp23 -> out_ptr1).  out_ptr2 receives the
# second sum scaled by a per-channel factor from in_ptr6.
triton_red_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_166 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@reduction(
    size_hints=[512, 131072],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp16', 4: '*fp16', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: '*fp32', 9: '*fp32', 10: 'i32', 11: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_166(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, in_ptr6, out_ptr0, out_ptr1, out_ptr2, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
    xnumel = 336
    rnumel = 100352
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rbase = tl.arange(0, RBLOCK)[None, :]
    x0 = xindex
    _tmp17 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
    tmp20 = tl.load(in_ptr5 + (x0), xmask)
    _tmp23 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
    for roffset in range(0, rnumel, RBLOCK):
        rindex = roffset + rbase
        rmask = rindex < rnumel
        r1 = rindex % 784
        r2 = (rindex // 784)
        tmp0 = tl.load(in_ptr0 + (r1 + (784*x0) + (263424*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp1 = tl.load(in_ptr1 + (x0 + (336*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp4 = tl.load(in_ptr2 + (x0 + (336*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp8 = tl.load(in_ptr3 + (r1 + (784*x0) + (263424*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp18 = tl.load(in_ptr4 + (r1 + (784*x0) + (263424*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp2 = tl.sigmoid(tmp1)
        tmp3 = tmp0 * tmp2
        tmp5 = 784.0
        tmp6 = tmp4 / tmp5
        tmp7 = tmp3 + tmp6
        tmp9 = tl.sigmoid(tmp8)
        tmp10 = 1.0
        tmp11 = tmp10 - tmp9
        tmp12 = tmp8 * tmp11
        tmp13 = tmp12 + tmp10
        tmp14 = tmp9 * tmp13
        tmp15 = tmp7 * tmp14
        tmp16 = tmp15.to(tl.float32)
        _tmp17 = tl.where(rmask & xmask, _tmp17 + tmp16, _tmp17)
        tmp19 = tmp18.to(tl.float32)
        tmp21 = tmp19 - tmp20
        tmp22 = tmp16 * tmp21
        _tmp23 = tl.where(rmask & xmask, _tmp23 + tmp22, _tmp23)
    tmp17 = tl.sum(_tmp17, 1)[:, None]
    tl.store(out_ptr0 + x0, tmp17, xmask)
    tmp23 = tl.sum(_tmp23, 1)[:, None]
    tl.store(out_ptr1 + x0, tmp23, xmask)
    tmp24 = tl.load(in_ptr6 + (x0), xmask)
    tmp25 = tmp23 * tmp24
    tl.store(out_ptr2 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp25, xmask)

def get_args():
    arg_0 = rand_strided((128, 336, 28, 28), (263424, 784, 28, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 336, 1, 1), (336, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    arg_2 = rand_strided((128, 336, 1, 1), (336, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    arg_3 = rand_strided((128, 336, 28, 28), (263424, 784, 28, 1), device='cuda:0', dtype=torch.float16)
    arg_4 = rand_strided((128, 336, 28, 28), (263424, 784, 28, 1), device='cuda:0', dtype=torch.float16)
    arg_5 = rand_strided((1, 336, 1, 1), (336, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    arg_6 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32)
    arg_7 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32)
    arg_8 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32)
    arg_9 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7, arg_8, arg_9,

def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_red_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_166.run(*args, 336, 100352, grid=grid(336), stream=stream0)

def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_red_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_166.benchmark_all_configs(*args, 336, 100352, grid=grid(336))

if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
| # kernel path: /tmp/torchinductor_shunting/rc/crcc3il2qglpofbslbuwlevrp5dihafxcr4wxlvjfxaasqpi56hu.py | |
| # Original ATen: aten._native_batch_norm_legit_functional, aten.add, aten.clone, aten.div, aten.fill, aten.mul, aten.native_batch_norm_backward, aten.sigmoid, aten.sub | |
| # aten._native_batch_norm_legit_functional => convert_element_type_108 | |
| # aten.add => add_362, add_363 | |
| # aten.clone => clone_10 | |
| # aten.div => div_13 | |
| # aten.fill => full_like_37 | |
| # aten.mul => mul_969, mul_975, mul_976, mul_977 | |
| # aten.native_batch_norm_backward => convert_element_type_695, mul_984, sub_261, sub_263, sub_264 | |
| # aten.sigmoid => sigmoid_101, sigmoid_15 | |
| # aten.sub => sub_260 | |
| # NOTE(review): auto-generated TorchInductor kernel.  The triton source is kept | |
| # as a *string* (compiled/cached by async_compile), so it must stay byte-identical; | |
| # only these surrounding comments are added.  Pointwise fusion of the batch-norm | |
| # backward and sigmoid/SiLU backward terms over a (128, 336, 28, 28) fp16 | |
| # activation (xnumel = 128*336*28*28 = 33718272); per-channel fp32 stats enter via | |
| # in_ptr5..in_ptr8 and the result is written as fp32 (see get_args() for layouts). | |
| # The constant 9.964923469387754e-06 inside the kernel is 1/(128*784) = 1/100352, | |
| # i.e. the reciprocal of the per-channel reduction count. | |
| triton_poi_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_167 = async_compile.triton(''' | |
| import triton | |
| import triton.language as tl | |
| from torch._inductor.ir import ReductionHint | |
| from torch._inductor.ir import TileHint | |
| from torch._inductor.triton_heuristics import pointwise | |
| from torch._inductor.utils import instance_descriptor | |
| from torch._dynamo.testing import rand_strided | |
| from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
| import torch | |
| from torch._inductor.triton_heuristics import grid | |
| @pointwise(size_hints=[67108864], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp16', 4: '*fp16', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: '*fp32', 9: '*fp32', 10: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10), equal_to_1=())]}) | |
| @triton.jit | |
| def triton_poi_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_167(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, in_ptr6, in_ptr7, in_ptr8, out_ptr0, xnumel, XBLOCK : tl.constexpr): | |
| xnumel = 33718272 | |
| xoffset = tl.program_id(0) * XBLOCK | |
| xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
| xmask = xindex < xnumel | |
| x3 = xindex | |
| x4 = (xindex // 784) | |
| x1 = (xindex // 784) % 336 | |
| tmp0 = tl.load(in_ptr0 + (x3), None).to(tl.float32) | |
| tmp1 = tl.load(in_ptr1 + (x4), None).to(tl.float32) | |
| tmp4 = tl.load(in_ptr2 + (x4), None).to(tl.float32) | |
| tmp8 = tl.load(in_ptr3 + (x3), None).to(tl.float32) | |
| tmp17 = tl.load(in_ptr4 + (x3), None).to(tl.float32) | |
| tmp19 = tl.load(in_ptr5 + (x1), None) | |
| tmp21 = tl.load(in_ptr6 + (x1), None) | |
| tmp24 = tl.load(in_ptr7 + (x1), None) | |
| tmp29 = tl.load(in_ptr8 + (x1), None) | |
| tmp2 = tl.sigmoid(tmp1) | |
| tmp3 = tmp0 * tmp2 | |
| tmp5 = 784.0 | |
| tmp6 = tmp4 / tmp5 | |
| tmp7 = tmp3 + tmp6 | |
| tmp9 = tl.sigmoid(tmp8) | |
| tmp10 = 1.0 | |
| tmp11 = tmp10 - tmp9 | |
| tmp12 = tmp8 * tmp11 | |
| tmp13 = tmp12 + tmp10 | |
| tmp14 = tmp9 * tmp13 | |
| tmp15 = tmp7 * tmp14 | |
| tmp16 = tmp15.to(tl.float32) | |
| tmp18 = tmp17.to(tl.float32) | |
| tmp20 = tmp18 - tmp19 | |
| tmp22 = 9.964923469387754e-06 | |
| tmp23 = tmp21 * tmp22 | |
| tmp25 = tmp24 * tmp24 | |
| tmp26 = tmp23 * tmp25 | |
| tmp27 = tmp20 * tmp26 | |
| tmp28 = tmp16 - tmp27 | |
| tmp30 = tmp29 * tmp22 | |
| tmp31 = tmp28 - tmp30 | |
| tl.store(out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp31, None) | |
| def get_args(): | |
| arg_0 = rand_strided((128, 336, 28, 28), (263424, 784, 28, 1), device='cuda:0', dtype=torch.float16) | |
| arg_1 = rand_strided((128, 336, 1, 1), (336, 1, 1, 1), device='cuda:0', dtype=torch.float16) | |
| arg_2 = rand_strided((128, 336, 1, 1), (336, 1, 1, 1), device='cuda:0', dtype=torch.float16) | |
| arg_3 = rand_strided((128, 336, 28, 28), (263424, 784, 28, 1), device='cuda:0', dtype=torch.float16) | |
| arg_4 = rand_strided((128, 336, 28, 28), (263424, 784, 28, 1), device='cuda:0', dtype=torch.float16) | |
| arg_5 = rand_strided((1, 336, 1, 1), (336, 1, 1, 1), device='cuda:0', dtype=torch.float32) | |
| arg_6 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32) | |
| arg_7 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32) | |
| arg_8 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32) | |
| arg_9 = rand_strided((128, 336, 28, 28), (263424, 784, 28, 1), device='cuda:0', dtype=torch.float32) | |
| return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7, arg_8, arg_9, | |
| def call(args): | |
| with torch.cuda._DeviceGuard(0): | |
| torch.cuda.set_device(0) | |
| stream0 = get_cuda_stream(0) | |
| triton_poi_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_167.run(*args, 33718272, grid=grid(33718272), stream=stream0) | |
| def benchmark_all_configs(args): | |
| with torch.cuda._DeviceGuard(0): | |
| torch.cuda.set_device(0) | |
| return triton_poi_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_167.benchmark_all_configs(*args, 33718272, grid=grid(33718272)) | |
| if __name__ == '__main__': | |
| from torch._inductor.utils import get_num_bytes | |
| from triton.testing import do_bench | |
| args = get_args() | |
| ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
| num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9 | |
| gb_per_s = num_gb / (ms / 1e3) | |
| print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
| ''') | |
| # kernel path: /tmp/torchinductor_shunting/7y/c7y3vo54x6g4hqnvyjz4k3ngokc66wmd3u7zsbjrmhorsy53o4sb.py | |
| # Original ATen: aten.convolution_backward | |
| # aten.convolution_backward => convolution_backward_112 | |
| # NOTE(review): auto-generated TorchInductor kernel source string -- kept | |
| # byte-identical (it is runtime data for async_compile).  Channel-split for a | |
| # grouped-conv backward: reads the *second* half of the 336 channels (the input | |
| # offset 131712 = 168*784 and the scale offset 168+x1 select channels 168..335), | |
| # scales by the product of two per-channel fp32 vectors, and casts to fp16 into | |
| # a (128, 168, 28, 28) output.  Companion kernel _170 below handles channels | |
| # 0..167 with the same math at offset 0. | |
| triton_poi_fused_convolution_backward_168 = async_compile.triton(''' | |
| import triton | |
| import triton.language as tl | |
| from torch._inductor.ir import ReductionHint | |
| from torch._inductor.ir import TileHint | |
| from torch._inductor.triton_heuristics import pointwise | |
| from torch._inductor.utils import instance_descriptor | |
| from torch._dynamo.testing import rand_strided | |
| from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
| import torch | |
| from torch._inductor.triton_heuristics import grid | |
| @pointwise(size_hints=[33554432], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp32', 2: '*fp32', 3: '*fp16', 4: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4), equal_to_1=())]}) | |
| @triton.jit | |
| def triton_poi_fused_convolution_backward_168(in_ptr0, in_ptr1, in_ptr2, out_ptr0, xnumel, XBLOCK : tl.constexpr): | |
| xnumel = 16859136 | |
| xoffset = tl.program_id(0) * XBLOCK | |
| xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
| xmask = xindex < xnumel | |
| x2 = (xindex // 131712) | |
| x3 = xindex % 131712 | |
| x1 = (xindex // 784) % 168 | |
| x4 = xindex | |
| tmp0 = tl.load(in_ptr0 + (131712 + x3 + (263424*x2)), None) | |
| tmp1 = tl.load(in_ptr1 + (168 + x1), None) | |
| tmp2 = tl.load(in_ptr2 + (168 + x1), None) | |
| tmp3 = tmp1 * tmp2 | |
| tmp4 = tmp0 * tmp3 | |
| tmp5 = tmp4.to(tl.float32) | |
| tl.store(out_ptr0 + (x4 + tl.zeros([XBLOCK], tl.int32)), tmp5, None) | |
| def get_args(): | |
| arg_0 = rand_strided((128, 336, 28, 28), (263424, 784, 28, 1), device='cuda:0', dtype=torch.float32) | |
| arg_1 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32) | |
| arg_2 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32) | |
| arg_3 = rand_strided((128, 168, 28, 28), (131712, 784, 28, 1), device='cuda:0', dtype=torch.float16) | |
| return arg_0, arg_1, arg_2, arg_3, | |
| def call(args): | |
| with torch.cuda._DeviceGuard(0): | |
| torch.cuda.set_device(0) | |
| stream0 = get_cuda_stream(0) | |
| triton_poi_fused_convolution_backward_168.run(*args, 16859136, grid=grid(16859136), stream=stream0) | |
| def benchmark_all_configs(args): | |
| with torch.cuda._DeviceGuard(0): | |
| torch.cuda.set_device(0) | |
| return triton_poi_fused_convolution_backward_168.benchmark_all_configs(*args, 16859136, grid=grid(16859136)) | |
| if __name__ == '__main__': | |
| from torch._inductor.utils import get_num_bytes | |
| from triton.testing import do_bench | |
| args = get_args() | |
| ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
| num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9 | |
| gb_per_s = num_gb / (ms / 1e3) | |
| print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
| ''') | |
| # kernel path: /tmp/torchinductor_shunting/6n/c6nredjp54hkor7tbo72xhpqqat3vpewekmkr4kkyypw6cgqm6ai.py | |
| # Original ATen: aten._to_copy | |
| # aten._to_copy => convert_element_type_698 | |
| # NOTE(review): auto-generated kernel string, left byte-identical.  Simple | |
| # elementwise fp16 -> fp32 cast of a 4200-element (168, 1, 5, 5) depthwise | |
| # conv weight gradient buffer. | |
| triton_poi_fused__to_copy_169 = async_compile.triton(''' | |
| import triton | |
| import triton.language as tl | |
| from torch._inductor.ir import ReductionHint | |
| from torch._inductor.ir import TileHint | |
| from torch._inductor.triton_heuristics import pointwise | |
| from torch._inductor.utils import instance_descriptor | |
| from torch._dynamo.testing import rand_strided | |
| from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
| import torch | |
| from torch._inductor.triton_heuristics import grid | |
| @pointwise(size_hints=[8192], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1), equal_to_1=())]}) | |
| @triton.jit | |
| def triton_poi_fused__to_copy_169(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr): | |
| xnumel = 4200 | |
| xoffset = tl.program_id(0) * XBLOCK | |
| xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
| xmask = xindex < xnumel | |
| x0 = xindex | |
| tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32) | |
| tmp1 = tmp0.to(tl.float32) | |
| tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask) | |
| def get_args(): | |
| arg_0 = rand_strided((168, 1, 5, 5), (25, 25, 5, 1), device='cuda:0', dtype=torch.float16) | |
| arg_1 = rand_strided((168, 1, 5, 5), (25, 25, 5, 1), device='cuda:0', dtype=torch.float32) | |
| return arg_0, arg_1, | |
| def call(args): | |
| with torch.cuda._DeviceGuard(0): | |
| torch.cuda.set_device(0) | |
| stream0 = get_cuda_stream(0) | |
| triton_poi_fused__to_copy_169.run(*args, 4200, grid=grid(4200), stream=stream0) | |
| def benchmark_all_configs(args): | |
| with torch.cuda._DeviceGuard(0): | |
| torch.cuda.set_device(0) | |
| return triton_poi_fused__to_copy_169.benchmark_all_configs(*args, 4200, grid=grid(4200)) | |
| if __name__ == '__main__': | |
| from torch._inductor.utils import get_num_bytes | |
| from triton.testing import do_bench | |
| args = get_args() | |
| ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
| num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9 | |
| gb_per_s = num_gb / (ms / 1e3) | |
| print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
| ''') | |
| # kernel path: /tmp/torchinductor_shunting/ia/ciabi2inw7dvbjd5nathz3xfqaodfqjylvjwgsdeimxatolwwqks.py | |
| # Original ATen: aten.convolution_backward | |
| # aten.convolution_backward => convolution_backward_113 | |
| # NOTE(review): auto-generated kernel string, left byte-identical.  Twin of | |
| # kernel _168 above but for the *first* half of the channel split: the input is | |
| # read at offset 0 (batch stride 263424 = 336*784) and the per-channel scales at | |
| # x1 directly, selecting channels 0..167; result is scaled, cast to fp16, and | |
| # written to a (128, 168, 28, 28) output. | |
| triton_poi_fused_convolution_backward_170 = async_compile.triton(''' | |
| import triton | |
| import triton.language as tl | |
| from torch._inductor.ir import ReductionHint | |
| from torch._inductor.ir import TileHint | |
| from torch._inductor.triton_heuristics import pointwise | |
| from torch._inductor.utils import instance_descriptor | |
| from torch._dynamo.testing import rand_strided | |
| from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
| import torch | |
| from torch._inductor.triton_heuristics import grid | |
| @pointwise(size_hints=[33554432], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp32', 2: '*fp32', 3: '*fp16', 4: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4), equal_to_1=())]}) | |
| @triton.jit | |
| def triton_poi_fused_convolution_backward_170(in_ptr0, in_ptr1, in_ptr2, out_ptr0, xnumel, XBLOCK : tl.constexpr): | |
| xnumel = 16859136 | |
| xoffset = tl.program_id(0) * XBLOCK | |
| xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
| xmask = xindex < xnumel | |
| x2 = (xindex // 131712) | |
| x3 = xindex % 131712 | |
| x1 = (xindex // 784) % 168 | |
| x4 = xindex | |
| tmp0 = tl.load(in_ptr0 + (x3 + (263424*x2)), None) | |
| tmp1 = tl.load(in_ptr1 + (x1), None) | |
| tmp2 = tl.load(in_ptr2 + (x1), None) | |
| tmp3 = tmp1 * tmp2 | |
| tmp4 = tmp0 * tmp3 | |
| tmp5 = tmp4.to(tl.float32) | |
| tl.store(out_ptr0 + (x4 + tl.zeros([XBLOCK], tl.int32)), tmp5, None) | |
| def get_args(): | |
| arg_0 = rand_strided((128, 336, 28, 28), (263424, 784, 28, 1), device='cuda:0', dtype=torch.float32) | |
| arg_1 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32) | |
| arg_2 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32) | |
| arg_3 = rand_strided((128, 168, 28, 28), (131712, 784, 28, 1), device='cuda:0', dtype=torch.float16) | |
| return arg_0, arg_1, arg_2, arg_3, | |
| def call(args): | |
| with torch.cuda._DeviceGuard(0): | |
| torch.cuda.set_device(0) | |
| stream0 = get_cuda_stream(0) | |
| triton_poi_fused_convolution_backward_170.run(*args, 16859136, grid=grid(16859136), stream=stream0) | |
| def benchmark_all_configs(args): | |
| with torch.cuda._DeviceGuard(0): | |
| torch.cuda.set_device(0) | |
| return triton_poi_fused_convolution_backward_170.benchmark_all_configs(*args, 16859136, grid=grid(16859136)) | |
| if __name__ == '__main__': | |
| from torch._inductor.utils import get_num_bytes | |
| from triton.testing import do_bench | |
| args = get_args() | |
| ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
| num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9 | |
| gb_per_s = num_gb / (ms / 1e3) | |
| print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
| ''') | |
| # kernel path: /tmp/torchinductor_shunting/c4/cc4n2v6wqwhmmrx6e37v63wrmsqntez34qcciaujxy6fxsa2gk3o.py | |
| # Original ATen: aten._to_copy | |
| # aten._to_copy => convert_element_type_699 | |
| # NOTE(review): auto-generated kernel string, left byte-identical.  Elementwise | |
| # fp16 -> fp32 cast of a 1512-element (168, 1, 3, 3) depthwise conv weight | |
| # gradient buffer (same pattern as _169, smaller kernel window). | |
| triton_poi_fused__to_copy_171 = async_compile.triton(''' | |
| import triton | |
| import triton.language as tl | |
| from torch._inductor.ir import ReductionHint | |
| from torch._inductor.ir import TileHint | |
| from torch._inductor.triton_heuristics import pointwise | |
| from torch._inductor.utils import instance_descriptor | |
| from torch._dynamo.testing import rand_strided | |
| from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
| import torch | |
| from torch._inductor.triton_heuristics import grid | |
| @pointwise(size_hints=[2048], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1), equal_to_1=())]}) | |
| @triton.jit | |
| def triton_poi_fused__to_copy_171(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr): | |
| xnumel = 1512 | |
| xoffset = tl.program_id(0) * XBLOCK | |
| xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
| xmask = xindex < xnumel | |
| x0 = xindex | |
| tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32) | |
| tmp1 = tmp0.to(tl.float32) | |
| tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask) | |
| def get_args(): | |
| arg_0 = rand_strided((168, 1, 3, 3), (9, 9, 3, 1), device='cuda:0', dtype=torch.float16) | |
| arg_1 = rand_strided((168, 1, 3, 3), (9, 9, 3, 1), device='cuda:0', dtype=torch.float32) | |
| return arg_0, arg_1, | |
| def call(args): | |
| with torch.cuda._DeviceGuard(0): | |
| torch.cuda.set_device(0) | |
| stream0 = get_cuda_stream(0) | |
| triton_poi_fused__to_copy_171.run(*args, 1512, grid=grid(1512), stream=stream0) | |
| def benchmark_all_configs(args): | |
| with torch.cuda._DeviceGuard(0): | |
| torch.cuda.set_device(0) | |
| return triton_poi_fused__to_copy_171.benchmark_all_configs(*args, 1512, grid=grid(1512)) | |
| if __name__ == '__main__': | |
| from torch._inductor.utils import get_num_bytes | |
| from triton.testing import do_bench | |
| args = get_args() | |
| ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
| num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9 | |
| gb_per_s = num_gb / (ms / 1e3) | |
| print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
| ''') | |
| # kernel path: /tmp/torchinductor_shunting/ak/cakbaw2uk6hhpcroelbhhfo57dmqlxwcgkogm2m6ds3sv57t2vqz.py | |
| # Original ATen: aten._to_copy | |
| # aten._to_copy => convert_element_type_703 | |
| # NOTE(review): auto-generated kernel string, left byte-identical.  Elementwise | |
| # fp16 -> fp32 cast of a 4704-element (168, 28, 1, 1) pointwise-conv weight | |
| # gradient buffer. | |
| triton_poi_fused__to_copy_172 = async_compile.triton(''' | |
| import triton | |
| import triton.language as tl | |
| from torch._inductor.ir import ReductionHint | |
| from torch._inductor.ir import TileHint | |
| from torch._inductor.triton_heuristics import pointwise | |
| from torch._inductor.utils import instance_descriptor | |
| from torch._dynamo.testing import rand_strided | |
| from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
| import torch | |
| from torch._inductor.triton_heuristics import grid | |
| @pointwise(size_hints=[8192], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]}) | |
| @triton.jit | |
| def triton_poi_fused__to_copy_172(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr): | |
| xnumel = 4704 | |
| xoffset = tl.program_id(0) * XBLOCK | |
| xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
| xmask = xindex < xnumel | |
| x0 = xindex | |
| tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32) | |
| tmp1 = tmp0.to(tl.float32) | |
| tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask) | |
| def get_args(): | |
| arg_0 = rand_strided((168, 28, 1, 1), (28, 1, 1, 1), device='cuda:0', dtype=torch.float16) | |
| arg_1 = rand_strided((168, 28, 1, 1), (28, 1, 1, 1), device='cuda:0', dtype=torch.float32) | |
| return arg_0, arg_1, | |
| def call(args): | |
| with torch.cuda._DeviceGuard(0): | |
| torch.cuda.set_device(0) | |
| stream0 = get_cuda_stream(0) | |
| triton_poi_fused__to_copy_172.run(*args, 4704, grid=grid(4704), stream=stream0) | |
| def benchmark_all_configs(args): | |
| with torch.cuda._DeviceGuard(0): | |
| torch.cuda.set_device(0) | |
| return triton_poi_fused__to_copy_172.benchmark_all_configs(*args, 4704, grid=grid(4704)) | |
| if __name__ == '__main__': | |
| from torch._inductor.utils import get_num_bytes | |
| from triton.testing import do_bench | |
| args = get_args() | |
| ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
| num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9 | |
| gb_per_s = num_gb / (ms / 1e3) | |
| print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
| ''') | |
| # kernel path: /tmp/torchinductor_shunting/ow/cowu2kdmjh4mnuy3o2jt5hyxyjhq6tnzxq4b3gah2vdhgpih4okk.py | |
| # Original ATen: aten.cat | |
| # aten.cat => cat_69 | |
| # NOTE(review): auto-generated kernel string, left byte-identical.  One half of | |
| # a channel concat: copies a contiguous (128, 28, 28, 28) fp16 tensor (21952 | |
| # elements per batch item) into an output whose batch stride is 43904 = 2*21952, | |
| # i.e. it fills the first channel-half of each batch slice of the cat result. | |
| triton_poi_fused_cat_173 = async_compile.triton(''' | |
| import triton | |
| import triton.language as tl | |
| from torch._inductor.ir import ReductionHint | |
| from torch._inductor.ir import TileHint | |
| from torch._inductor.triton_heuristics import pointwise | |
| from torch._inductor.utils import instance_descriptor | |
| from torch._dynamo.testing import rand_strided | |
| from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
| import torch | |
| from torch._inductor.triton_heuristics import grid | |
| @pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]}) | |
| @triton.jit | |
| def triton_poi_fused_cat_173(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr): | |
| xnumel = 2809856 | |
| xoffset = tl.program_id(0) * XBLOCK | |
| xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
| xmask = xindex < xnumel | |
| x2 = xindex | |
| x0 = xindex % 21952 | |
| x1 = (xindex // 21952) | |
| tmp0 = tl.load(in_ptr0 + (x2), None).to(tl.float32) | |
| tl.store(out_ptr0 + (x0 + (43904*x1) + tl.zeros([XBLOCK], tl.int32)), tmp0, None) | |
| def get_args(): | |
| arg_0 = rand_strided((128, 28, 28, 28), (21952, 784, 28, 1), device='cuda:0', dtype=torch.float16) | |
| arg_1 = rand_strided((128, 28, 28, 28), (43904, 784, 28, 1), device='cuda:0', dtype=torch.float16) | |
| return arg_0, arg_1, | |
| def call(args): | |
| with torch.cuda._DeviceGuard(0): | |
| torch.cuda.set_device(0) | |
| stream0 = get_cuda_stream(0) | |
| triton_poi_fused_cat_173.run(*args, 2809856, grid=grid(2809856), stream=stream0) | |
| def benchmark_all_configs(args): | |
| with torch.cuda._DeviceGuard(0): | |
| torch.cuda.set_device(0) | |
| return triton_poi_fused_cat_173.benchmark_all_configs(*args, 2809856, grid=grid(2809856)) | |
| if __name__ == '__main__': | |
| from torch._inductor.utils import get_num_bytes | |
| from triton.testing import do_bench | |
| args = get_args() | |
| ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
| num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9 | |
| gb_per_s = num_gb / (ms / 1e3) | |
| print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
| ''') | |
| # kernel path: /tmp/torchinductor_shunting/tu/ctuwkh7mmjcoxpqhlozf53j6kx7nu6pcnpqqmwor4d57vrqdsosn.py | |
| # Original ATen: aten._native_batch_norm_legit_functional, aten.add, aten.native_batch_norm_backward | |
| # aten._native_batch_norm_legit_functional => convert_element_type_98 | |
| # aten.add => add_365 | |
| # aten.native_batch_norm_backward => convert_element_type_705, mul_999, sub_270, sum_121, sum_122 | |
| # NOTE(review): auto-generated split-reduction kernel string, left byte-identical. | |
| # Batch-norm backward partial sums over 56 channels: xnumel = 728 = 56 channels | |
| # x 13 reduction splits, each split covering rnumel = 7720 of the 100352 | |
| # (= 128*28*28) per-channel elements; the tmp2 guard masks the tail of the last | |
| # split.  out_ptr0 accumulates sum(grad0+grad1), out_ptr1 accumulates | |
| # sum((grad0+grad1) * (x - mean)) per (channel, split); a later kernel | |
| # presumably reduces the 13 partials -- confirm against the caller. | |
| triton_red_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_174 = async_compile.triton(''' | |
| import triton | |
| import triton.language as tl | |
| from torch._inductor.ir import ReductionHint | |
| from torch._inductor.ir import TileHint | |
| from torch._inductor.triton_heuristics import reduction | |
| from torch._inductor.utils import instance_descriptor | |
| from torch._dynamo.testing import rand_strided | |
| from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
| import torch | |
| from torch._inductor.triton_heuristics import grid | |
| @reduction( | |
| size_hints=[1024, 8192], | |
| reduction_hint=ReductionHint.INNER, | |
| filename=__file__, | |
| meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp32', 4: '*fp32', 5: '*fp32', 6: 'i32', 7: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5), equal_to_1=())]} | |
| ) | |
| @triton.jit | |
| def triton_red_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_174(in_ptr0, in_ptr1, in_ptr2, in_ptr3, out_ptr0, out_ptr1, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr): | |
| xnumel = 728 | |
| rnumel = 7720 | |
| xoffset = tl.program_id(0) * XBLOCK | |
| xindex = xoffset + tl.arange(0, XBLOCK)[:, None] | |
| xmask = xindex < xnumel | |
| rbase = tl.arange(0, RBLOCK)[None, :] | |
| x1 = (xindex // 56) | |
| x0 = xindex % 56 | |
| _tmp8 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0 | |
| x3 = xindex | |
| _tmp15 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0 | |
| for roffset in range(0, rnumel, RBLOCK): | |
| rindex = roffset + rbase | |
| rmask = rindex < rnumel | |
| r2 = rindex | |
| tmp0 = r2 + (7720*x1) | |
| tmp1 = 100352 | |
| tmp2 = tmp0 < tmp1 | |
| tmp3 = tl.load(in_ptr0 + ((784*x0) + (43904*(((r2 + (7720*x1)) // 784) % 128)) + ((r2 + (7720*x1)) % 784) + tl.zeros([XBLOCK, RBLOCK], tl.int32)), rmask & tmp2 & xmask, eviction_policy='evict_last', other=0).to(tl.float32) | |
| tmp4 = tl.load(in_ptr1 + ((784*x0) + (43904*(((r2 + (7720*x1)) // 784) % 128)) + ((r2 + (7720*x1)) % 784) + tl.zeros([XBLOCK, RBLOCK], tl.int32)), rmask & tmp2 & xmask, eviction_policy='evict_last', other=0).to(tl.float32) | |
| tmp5 = tmp3 + tmp4 | |
| tmp6 = tmp5.to(tl.float32) | |
| tmp7 = tl.where(tmp2, tmp6, 0) | |
| _tmp8 = tl.where(rmask & xmask, _tmp8 + tmp7, _tmp8) | |
| tmp9 = tl.load(in_ptr2 + ((784*x0) + (43904*(((r2 + (7720*x1)) // 784) % 128)) + ((r2 + (7720*x1)) % 784) + tl.zeros([XBLOCK, RBLOCK], tl.int32)), rmask & tmp2 & xmask, eviction_policy='evict_last', other=0).to(tl.float32) | |
| tmp10 = tmp9.to(tl.float32) | |
| tmp11 = tl.load(in_ptr3 + (x0 + tl.zeros([XBLOCK, RBLOCK], tl.int32)), rmask & tmp2 & xmask, eviction_policy='evict_last', other=0) | |
| tmp12 = tmp10 - tmp11 | |
| tmp13 = tmp6 * tmp12 | |
| tmp14 = tl.where(tmp2, tmp13, 0) | |
| _tmp15 = tl.where(rmask & xmask, _tmp15 + tmp14, _tmp15) | |
| tmp8 = tl.sum(_tmp8, 1)[:, None] | |
| tl.store(out_ptr0 + x3, tmp8, xmask) | |
| tmp15 = tl.sum(_tmp15, 1)[:, None] | |
| tl.store(out_ptr1 + x3, tmp15, xmask) | |
| def get_args(): | |
| arg_0 = rand_strided((128, 56, 28, 28), (43904, 784, 28, 1), device='cuda:0', dtype=torch.float16) | |
| arg_1 = rand_strided((128, 56, 28, 28), (43904, 784, 28, 1), device='cuda:0', dtype=torch.float16) | |
| arg_2 = rand_strided((128, 56, 28, 28), (43904, 784, 28, 1), device='cuda:0', dtype=torch.float16) | |
| arg_3 = rand_strided((1, 56, 1, 1), (56, 1, 1, 1), device='cuda:0', dtype=torch.float32) | |
| arg_4 = rand_strided((56, 13), (1, 56), device='cuda:0', dtype=torch.float32) | |
| arg_5 = rand_strided((56, 13), (1, 56), device='cuda:0', dtype=torch.float32) | |
| return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, | |
| def call(args): | |
| with torch.cuda._DeviceGuard(0): | |
| torch.cuda.set_device(0) | |
| stream0 = get_cuda_stream(0) | |
| triton_red_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_174.run(*args, 728, 7720, grid=grid(728), stream=stream0) | |
| def benchmark_all_configs(args): | |
| with torch.cuda._DeviceGuard(0): | |
| torch.cuda.set_device(0) | |
| return triton_red_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_174.benchmark_all_configs(*args, 728, 7720, grid=grid(728)) | |
| if __name__ == '__main__': | |
| from torch._inductor.utils import get_num_bytes | |
| from triton.testing import do_bench | |
| args = get_args() | |
| ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
| num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9 | |
| gb_per_s = num_gb / (ms / 1e3) | |
| print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
| ''') | |
| # kernel path: /tmp/torchinductor_shunting/3q/c3qu2jno53qez6kspx5ta57fq4vavwnpqfdqbctsz2h5gfllev2o.py | |
| # Original ATen: aten._native_batch_norm_legit_functional, aten.add, aten.native_batch_norm_backward | |
| # aten._native_batch_norm_legit_functional => convert_element_type_98 | |
| # aten.add => add_365 | |
| # aten.native_batch_norm_backward => convert_element_type_705, convert_element_type_707, mul_1005, mul_1006, sub_270, sub_272, sub_273 | |
| # NOTE(review): auto-generated kernel string, left byte-identical.  Final | |
| # pointwise stage of batch-norm backward over a (128, 56, 28, 28) fp16 pair of | |
| # gradients (summed element-wise), using the per-channel mean (in_ptr3), the | |
| # reduced sums from kernel _174 (in_ptr4/in_ptr6), the saved invstd (in_ptr5) | |
| # and the BN weight (in_ptr7); 9.964923469387754e-06 is 1/(128*784) = 1/100352. | |
| # Result is cast back to fp16 into a (128, 56, 28, 28) output. | |
| triton_poi_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_175 = async_compile.triton(''' | |
| import triton | |
| import triton.language as tl | |
| from torch._inductor.ir import ReductionHint | |
| from torch._inductor.ir import TileHint | |
| from torch._inductor.triton_heuristics import pointwise | |
| from torch._inductor.utils import instance_descriptor | |
| from torch._dynamo.testing import rand_strided | |
| from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
| import torch | |
| from torch._inductor.triton_heuristics import grid | |
| @pointwise(size_hints=[8388608], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp32', 4: '*fp32', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: '*fp16', 9: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9), equal_to_1=())]}) | |
| @triton.jit | |
| def triton_poi_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_175(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, in_ptr6, in_ptr7, out_ptr0, xnumel, XBLOCK : tl.constexpr): | |
| xnumel = 5619712 | |
| xoffset = tl.program_id(0) * XBLOCK | |
| xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
| xmask = xindex < xnumel | |
| x3 = xindex | |
| x1 = (xindex // 784) % 56 | |
| tmp0 = tl.load(in_ptr0 + (x3), None).to(tl.float32) | |
| tmp1 = tl.load(in_ptr1 + (x3), None).to(tl.float32) | |
| tmp4 = tl.load(in_ptr2 + (x3), None).to(tl.float32) | |
| tmp6 = tl.load(in_ptr3 + (x1), None) | |
| tmp8 = tl.load(in_ptr4 + (x1), None) | |
| tmp11 = tl.load(in_ptr5 + (x1), None) | |
| tmp16 = tl.load(in_ptr6 + (x1), None) | |
| tmp19 = tl.load(in_ptr7 + (x1), None) | |
| tmp2 = tmp0 + tmp1 | |
| tmp3 = tmp2.to(tl.float32) | |
| tmp5 = tmp4.to(tl.float32) | |
| tmp7 = tmp5 - tmp6 | |
| tmp9 = 9.964923469387754e-06 | |
| tmp10 = tmp8 * tmp9 | |
| tmp12 = tmp11 * tmp11 | |
| tmp13 = tmp10 * tmp12 | |
| tmp14 = tmp7 * tmp13 | |
| tmp15 = tmp3 - tmp14 | |
| tmp17 = tmp16 * tmp9 | |
| tmp18 = tmp15 - tmp17 | |
| tmp20 = tmp11 * tmp19 | |
| tmp21 = tmp18 * tmp20 | |
| tmp22 = tmp21.to(tl.float32) | |
| tl.store(out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp22, None) | |
| def get_args(): | |
| arg_0 = rand_strided((128, 56, 28, 28), (43904, 784, 28, 1), device='cuda:0', dtype=torch.float16) | |
| arg_1 = rand_strided((128, 56, 28, 28), (43904, 784, 28, 1), device='cuda:0', dtype=torch.float16) | |
| arg_2 = rand_strided((128, 56, 28, 28), (43904, 784, 28, 1), device='cuda:0', dtype=torch.float16) | |
| arg_3 = rand_strided((1, 56, 1, 1), (56, 1, 1, 1), device='cuda:0', dtype=torch.float32) | |
| arg_4 = rand_strided((56,), (1,), device='cuda:0', dtype=torch.float32) | |
| arg_5 = rand_strided((56,), (1,), device='cuda:0', dtype=torch.float32) | |
| arg_6 = rand_strided((56,), (1,), device='cuda:0', dtype=torch.float32) | |
| arg_7 = rand_strided((56,), (1,), device='cuda:0', dtype=torch.float32) | |
| arg_8 = rand_strided((128, 56, 28, 28), (43904, 784, 28, 1), device='cuda:0', dtype=torch.float16) | |
| return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7, arg_8, | |
| def call(args): | |
| with torch.cuda._DeviceGuard(0): | |
| torch.cuda.set_device(0) | |
| stream0 = get_cuda_stream(0) | |
| triton_poi_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_175.run(*args, 5619712, grid=grid(5619712), stream=stream0) | |
| def benchmark_all_configs(args): | |
| with torch.cuda._DeviceGuard(0): | |
| torch.cuda.set_device(0) | |
| return triton_poi_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_175.benchmark_all_configs(*args, 5619712, grid=grid(5619712)) | |
| if __name__ == '__main__': | |
| from torch._inductor.utils import get_num_bytes | |
| from triton.testing import do_bench | |
| args = get_args() | |
| ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
| num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9 | |
| gb_per_s = num_gb / (ms / 1e3) | |
| print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
| ''') | |
# kernel path: /tmp/torchinductor_shunting/he/cheuhrmgcxe3wl2kq52bt7crzek5t4ji6b56jrybsdffzrqao7el.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.add, aten.native_batch_norm_backward
# aten._native_batch_norm_legit_functional => convert_element_type_76
# aten.add => add_365, add_370
# aten.native_batch_norm_backward => convert_element_type_727, mul_1039, sub_286, sum_130, sum_131
#
# NOTE(review): auto-generated Inductor kernel -- comments added here only; the
# kernel source string below is compiled at runtime and left untouched.
# Split (two-stage) reduction: xnumel = 728 = 56 channels x 13 chunks
# (x0 = channel, x1 = chunk index); each program reduces rnumel = 7720
# consecutive per-channel elements of the flattened (128, 56, 28, 28) inputs.
# tmp2 masks the tail, since 13 * 7720 = 100360 > 128 * 28 * 28 = 100352 valid
# elements per channel.
# _tmp10 accumulates sum(dy) where dy = in_ptr0 + in_ptr1 + in_ptr2 (fp16
# values summed, then cast to fp32); _tmp17 accumulates sum(dy * (x - mean))
# with x = in_ptr3 (fp16) and mean = in_ptr4 (fp32, broadcast per channel).
# out_ptr0/out_ptr1 are (56, 13) partial-sum buffers -- presumably reduced to
# (56,) by a follow-up kernel; verify against the caller (not visible here).
triton_red_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_176 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@reduction(
    size_hints=[1024, 8192],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp16', 4: '*fp32', 5: '*fp32', 6: '*fp32', 7: 'i32', 8: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_176(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, out_ptr0, out_ptr1, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
    xnumel = 728
    rnumel = 7720
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rbase = tl.arange(0, RBLOCK)[None, :]
    x1 = (xindex // 56)
    x0 = xindex % 56
    _tmp10 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
    x3 = xindex
    _tmp17 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
    for roffset in range(0, rnumel, RBLOCK):
        rindex = roffset + rbase
        rmask = rindex < rnumel
        r2 = rindex
        tmp0 = r2 + (7720*x1)
        tmp1 = 100352
        tmp2 = tmp0 < tmp1
        tmp3 = tl.load(in_ptr0 + ((784*x0) + (43904*(((r2 + (7720*x1)) // 784) % 128)) + ((r2 + (7720*x1)) % 784) + tl.zeros([XBLOCK, RBLOCK], tl.int32)), rmask & tmp2 & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp4 = tl.load(in_ptr1 + ((784*x0) + (43904*(((r2 + (7720*x1)) // 784) % 128)) + ((r2 + (7720*x1)) % 784) + tl.zeros([XBLOCK, RBLOCK], tl.int32)), rmask & tmp2 & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp5 = tmp3 + tmp4
        tmp6 = tl.load(in_ptr2 + ((784*x0) + (43904*(((r2 + (7720*x1)) // 784) % 128)) + ((r2 + (7720*x1)) % 784) + tl.zeros([XBLOCK, RBLOCK], tl.int32)), rmask & tmp2 & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp7 = tmp5 + tmp6
        tmp8 = tmp7.to(tl.float32)
        tmp9 = tl.where(tmp2, tmp8, 0)
        _tmp10 = tl.where(rmask & xmask, _tmp10 + tmp9, _tmp10)
        tmp11 = tl.load(in_ptr3 + ((784*x0) + (43904*(((r2 + (7720*x1)) // 784) % 128)) + ((r2 + (7720*x1)) % 784) + tl.zeros([XBLOCK, RBLOCK], tl.int32)), rmask & tmp2 & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp12 = tmp11.to(tl.float32)
        tmp13 = tl.load(in_ptr4 + (x0 + tl.zeros([XBLOCK, RBLOCK], tl.int32)), rmask & tmp2 & xmask, eviction_policy='evict_last', other=0)
        tmp14 = tmp12 - tmp13
        tmp15 = tmp8 * tmp14
        tmp16 = tl.where(tmp2, tmp15, 0)
        _tmp17 = tl.where(rmask & xmask, _tmp17 + tmp16, _tmp17)
    tmp10 = tl.sum(_tmp10, 1)[:, None]
    tl.store(out_ptr0 + x3, tmp10, xmask)
    tmp17 = tl.sum(_tmp17, 1)[:, None]
    tl.store(out_ptr1 + x3, tmp17, xmask)
def get_args():
    arg_0 = rand_strided((128, 56, 28, 28), (43904, 784, 28, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 56, 28, 28), (43904, 784, 28, 1), device='cuda:0', dtype=torch.float16)
    arg_2 = rand_strided((128, 56, 28, 28), (43904, 784, 28, 1), device='cuda:0', dtype=torch.float16)
    arg_3 = rand_strided((128, 56, 28, 28), (43904, 784, 28, 1), device='cuda:0', dtype=torch.float16)
    arg_4 = rand_strided((1, 56, 1, 1), (56, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    arg_5 = rand_strided((56, 13), (1, 56), device='cuda:0', dtype=torch.float32)
    arg_6 = rand_strided((56, 13), (1, 56), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_red_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_176.run(*args, 728, 7720, grid=grid(728), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_red_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_176.benchmark_all_configs(*args, 728, 7720, grid=grid(728))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/6w/c6wk3ggkhqc4dpm4svd2nw2x7bsbl6xzqqaoct6gtlyubuviyxj6.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.add, aten.native_batch_norm_backward
# aten._native_batch_norm_legit_functional => convert_element_type_76
# aten.add => add_365, add_370
# aten.native_batch_norm_backward => convert_element_type_727, mul_1045, mul_1046, sub_286, sub_288, sub_289
#
# NOTE(review): auto-generated Inductor kernel -- comments added here only; the
# kernel source string below is compiled at runtime and left untouched.
# Pointwise batch-norm-backward epilogue over all 5619712 = 128*56*28*28
# elements (x1 = channel index, 0..55).  Computes, in fp32,
#   dy  = in_ptr0 + in_ptr1 + in_ptr2        (three fp16 gradients summed)
#   out = (dy - (x - mean) * s_dyxmu/N * r*r - s_dy/N) * r * g
# where x = in_ptr3, mean = in_ptr4, N = 100352 (tmp11 =
# 9.964923469387754e-06 == 1/100352), and -- inferred from the standard BN
# backward formula, verify against the producing kernels -- s_dyxmu = in_ptr5
# (sum of dy*(x-mean)), r = in_ptr6 (invstd), s_dy = in_ptr7 (sum of dy),
# g = in_ptr8 (weight/gamma).  Result is stored as fp32 (arg_9 in get_args).
triton_poi_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_177 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[8388608], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp16', 4: '*fp32', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: '*fp32', 9: '*fp32', 10: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10), equal_to_1=())]})
@triton.jit
def triton_poi_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_177(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, in_ptr6, in_ptr7, in_ptr8, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 5619712
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x3 = xindex
    x1 = (xindex // 784) % 56
    tmp0 = tl.load(in_ptr0 + (x3), None).to(tl.float32)
    tmp1 = tl.load(in_ptr1 + (x3), None).to(tl.float32)
    tmp3 = tl.load(in_ptr2 + (x3), None).to(tl.float32)
    tmp6 = tl.load(in_ptr3 + (x3), None).to(tl.float32)
    tmp8 = tl.load(in_ptr4 + (x1), None)
    tmp10 = tl.load(in_ptr5 + (x1), None)
    tmp13 = tl.load(in_ptr6 + (x1), None)
    tmp18 = tl.load(in_ptr7 + (x1), None)
    tmp21 = tl.load(in_ptr8 + (x1), None)
    tmp2 = tmp0 + tmp1
    tmp4 = tmp2 + tmp3
    tmp5 = tmp4.to(tl.float32)
    tmp7 = tmp6.to(tl.float32)
    tmp9 = tmp7 - tmp8
    tmp11 = 9.964923469387754e-06
    tmp12 = tmp10 * tmp11
    tmp14 = tmp13 * tmp13
    tmp15 = tmp12 * tmp14
    tmp16 = tmp9 * tmp15
    tmp17 = tmp5 - tmp16
    tmp19 = tmp18 * tmp11
    tmp20 = tmp17 - tmp19
    tmp22 = tmp13 * tmp21
    tmp23 = tmp20 * tmp22
    tl.store(out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp23, None)
def get_args():
    arg_0 = rand_strided((128, 56, 28, 28), (43904, 784, 28, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 56, 28, 28), (43904, 784, 28, 1), device='cuda:0', dtype=torch.float16)
    arg_2 = rand_strided((128, 56, 28, 28), (43904, 784, 28, 1), device='cuda:0', dtype=torch.float16)
    arg_3 = rand_strided((128, 56, 28, 28), (43904, 784, 28, 1), device='cuda:0', dtype=torch.float16)
    arg_4 = rand_strided((1, 56, 1, 1), (56, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    arg_5 = rand_strided((56,), (1,), device='cuda:0', dtype=torch.float32)
    arg_6 = rand_strided((56,), (1,), device='cuda:0', dtype=torch.float32)
    arg_7 = rand_strided((56,), (1,), device='cuda:0', dtype=torch.float32)
    arg_8 = rand_strided((56,), (1,), device='cuda:0', dtype=torch.float32)
    arg_9 = rand_strided((128, 56, 28, 28), (43904, 784, 28, 1), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7, arg_8, arg_9,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_177.run(*args, 5619712, grid=grid(5619712), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_177.benchmark_all_configs(*args, 5619712, grid=grid(5619712))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/za/czavvoeeqv63efdv32prg5yimwonjxhjdak5qwnkzrdkukuwv7ls.py
# Original ATen: aten.convolution_backward
# aten.convolution_backward => convolution_backward_124
#
# NOTE(review): auto-generated Inductor kernel -- comments added here only; the
# kernel source string below is compiled at runtime and left untouched.
# Channel-slice + downcast feeding a convolution backward: reads the upper
# half of the channel dimension (load offset 21952 = 28*784, i.e. channels
# 28..55) of a contiguous fp32 (128, 56, 28, 28) gradient and writes it as a
# contiguous fp16 (128, 28, 28, 28) tensor (2809856 = 128*28*28*28 elements;
# x1 = batch, x0 = offset within one 28-channel slice).
# The explicit .to(tl.float32) on an fp32 load is a no-op; the store narrows
# to fp16 via out_ptr0's *fp16 signature -- presumably Triton's implicit
# pointer-dtype cast, per the generated signature above.
triton_poi_fused_convolution_backward_178 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_convolution_backward_178(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 2809856
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex % 21952
    x1 = (xindex // 21952)
    x2 = xindex
    tmp0 = tl.load(in_ptr0 + (21952 + x0 + (43904*x1)), None)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp1, None)
def get_args():
    arg_0 = rand_strided((128, 56, 28, 28), (43904, 784, 28, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((128, 28, 28, 28), (21952, 784, 28, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused_convolution_backward_178.run(*args, 2809856, grid=grid(2809856), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused_convolution_backward_178.benchmark_all_configs(*args, 2809856, grid=grid(2809856))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
| # kernel path: /tmp/torchinductor_shunting/4j/c4jmqpkqr4fnrmxwsxtkllpzeruqpmftklalesh626aid62ltfs5.py | |
| # Original ATen: aten.convolution_backward | |
| # aten.convolution_backward => convolution_backward_125 | |
| triton_poi_fused_convolution_backward_179 = async_compile.triton(''' | |
| import triton | |
| import triton.language as tl | |
| from torch._inductor.ir import ReductionHint | |
| from torch._inductor.ir import TileHint | |
| from torch._inductor.triton_heuristics import pointwise | |
| from torch._inductor.utils import instance_descriptor | |
| from torch._dynamo.testing import rand_strided | |
| from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
| import torch | |
| from torch._inductor.triton_heuristics import grid | |
| @pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]}) | |
| @triton.jit | |
| def triton_poi_fused_convolution_backward_179(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr): | |
| xnumel = 2809856 | |
| xoffset = tl.program_id(0) * XBLOCK | |
| xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
| xmask = xindex < xnumel | |
| x0 = xindex % 21952 | |
| x1 = (xindex // 21952) | |
| x2 = xindex | |
| tmp0 = tl.load(in_ptr0 + (x0 + (43904*x1)), None) | |
| tmp1 = tmp0.to(tl.float32) | |