# NOTE: this file has been truncated; it ends mid-kernel.

from ctypes import c_void_p, c_long
import torch
import math
import random
import os
import tempfile
from torch._inductor.utils import maybe_profile

from torch import empty_strided, as_strided, device
from torch._inductor.codecache import AsyncCompile
from torch._inductor.select_algorithm import extern_kernels

aten = torch.ops.aten
assert_size_stride = torch._C._dynamo.guards.assert_size_stride
async_compile = AsyncCompile()

import triton
import triton.language as tl
from torch._inductor.triton_heuristics import grid, start_graph, end_graph
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
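
# Each kernel below is emitted as a self-contained async_compile.triton('''...''')
# string. Besides the @triton.jit kernel itself, every string carries get_args(),
# call(), and benchmark_all_configs() helpers plus an `if __name__ == '__main__'`
# driver, so a single kernel string can be saved to its own .py file and
# benchmarked in isolation (it prints latency, bytes moved, and achieved bandwidth).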

# kernel path: /tmp/torchinductor_shunting/qd/cqdbopslmbarnc23s33ybyghfqf2olis2okrueqdtwzk66npkoci.py
# Original ATen: aten._to_copy, aten.sum
# aten._to_copy => convert_element_type_403
# aten.sum => sum_1
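# In eager terms this kernel roughly computes (a sketch; the variable names are
# illustrative, not from the original graph):
#   out = inp.to(torch.float32).sum(dim=0)   # inp: (128, 1000) fp16 -> out: (1000,) fp32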
triton_red_fused__to_copy_sum_0 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@reduction(
    size_hints=[1024, 128],
    reduction_hint=ReductionHint.OUTER,
    filename=__file__,
    meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32', 3: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 3), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__to_copy_sum_0(in_ptr0, out_ptr1, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
    xnumel = 1000
    rnumel = 128
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rbase = tl.arange(0, RBLOCK)[None, :]
    x0 = xindex
    _tmp1 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
    for roffset in range(0, rnumel, RBLOCK):
        rindex = roffset + rbase
        rmask = rindex < rnumel
        r1 = rindex
        tmp0 = tl.load(in_ptr0 + (x0 + (1000*r1)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        _tmp1 = tl.where(rmask & xmask, _tmp1 + tmp0, _tmp1)
    tmp1 = tl.sum(_tmp1, 1)[:, None]
    tmp2 = tmp1.to(tl.float32)
    tl.store(out_ptr1 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp2, xmask)


def get_args():
    arg_0 = rand_strided((128, 1000), (1000, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((1000,), (1,), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_red_fused__to_copy_sum_0.run(*args, 1000, 128, grid=grid(1000), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_red_fused__to_copy_sum_0.benchmark_all_configs(*args, 1000, 128, grid=grid(1000))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')


# kernel path: /tmp/torchinductor_shunting/v4/cv4lm4gdpfxwjwbfwkrkjjepglxb33ctw5byvwc2ujd3ynyqgfob.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_402
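# Roughly a plain dtype-upcast copy (a sketch): out = inp.to(torch.float32)
# for a contiguous (1000, 1536) fp16 tensor.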
triton_poi_fused__to_copy_1 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[2097152], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_1(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 1536000
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), None).to(tl.float32)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, None)


def get_args():
    arg_0 = rand_strided((1000, 1536), (1536, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((1000, 1536), (1536, 1), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_1.run(*args, 1536000, grid=grid(1536000), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_1.benchmark_all_configs(*args, 1536000, grid=grid(1536000))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')


# kernel path: /tmp/torchinductor_shunting/gw/cgw4eilm5jaevvb4jlwkfuyolkwuloinsrcuxsoiruu5pucw2qoo.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.div, aten.native_batch_norm_backward, aten.threshold_backward
# aten._native_batch_norm_legit_functional => convert_element_type_398
# aten.div => div
# aten.native_batch_norm_backward => convert_element_type_404, mul_470, mul_478, sub_58, sum_2, sum_3
# aten.threshold_backward => scalar_tensor, where
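# Per channel c over N*H*W = 128*7*7 = 6272 elements this roughly computes
# (a sketch; names are illustrative, not from the original graph):
#   g        = torch.where(mask, 0.0, upstream / 49.0)   # threshold + avg-pool backward
#   out_ptr0 = g.sum(dim=(0, 2, 3))                      # sum of gradients
#   out_ptr1 = (g * (x - mean)).sum(dim=(0, 2, 3))       # correlation with centered input
#   out_ptr2 = out_ptr1 * invstd                         # batch-norm weight gradient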
triton_red_fused__native_batch_norm_legit_functional_div_native_batch_norm_backward_threshold_backward_2 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@reduction(
    size_hints=[2048, 8192],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*i1', 1: '*fp16', 2: '*fp16', 3: '*fp32', 4: '*fp32', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: 'i32', 9: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_div_native_batch_norm_backward_threshold_backward_2(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, out_ptr0, out_ptr1, out_ptr2, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
    xnumel = 1536
    rnumel = 6272
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rbase = tl.arange(0, RBLOCK)[None, :]
    x0 = xindex
    _tmp7 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
    tmp10 = tl.load(in_ptr3 + (x0), xmask)
    _tmp13 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
    for roffset in range(0, rnumel, RBLOCK):
        rindex = roffset + rbase
        rmask = rindex < rnumel
        r1 = rindex % 49
        r2 = (rindex // 49)
        tmp0 = tl.load(in_ptr0 + (r1 + (49*x0) + (75264*r2)), rmask & xmask, eviction_policy='evict_last')
        tmp2 = tl.load(in_ptr1 + (x0 + (1536*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp8 = tl.load(in_ptr2 + (r1 + (49*x0) + (75264*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp1 = 0.0
        tmp3 = 49.0
        tmp4 = tmp2 / tmp3
        tmp5 = tl.where(tmp0, tmp1, tmp4)
        tmp6 = tmp5.to(tl.float32)
        _tmp7 = tl.where(rmask & xmask, _tmp7 + tmp6, _tmp7)
        tmp9 = tmp8.to(tl.float32)
        tmp11 = tmp9 - tmp10
        tmp12 = tmp6 * tmp11
        _tmp13 = tl.where(rmask & xmask, _tmp13 + tmp12, _tmp13)
    tmp7 = tl.sum(_tmp7, 1)[:, None]
    tl.store(out_ptr0 + x0, tmp7, xmask)
    tmp13 = tl.sum(_tmp13, 1)[:, None]
    tl.store(out_ptr1 + x0, tmp13, xmask)
    tmp14 = tl.load(in_ptr4 + (x0), xmask)
    tmp15 = tmp13 * tmp14
    tl.store(out_ptr2 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp15, xmask)


def get_args():
    arg_0 = rand_strided((128, 1536, 7, 7), (75264, 49, 7, 1), device='cuda:0', dtype=torch.bool)
    arg_1 = rand_strided((128, 1536), (1536, 1), device='cuda:0', dtype=torch.float16)
    arg_2 = rand_strided((128, 1536, 7, 7), (75264, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    arg_3 = rand_strided((1, 1536, 1, 1), (1536, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    arg_4 = rand_strided((1536,), (1,), device='cuda:0', dtype=torch.float32)
    arg_5 = rand_strided((1536,), (1,), device='cuda:0', dtype=torch.float32)
    arg_6 = rand_strided((1536,), (1,), device='cuda:0', dtype=torch.float32)
    arg_7 = rand_strided((1536,), (1,), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_red_fused__native_batch_norm_legit_functional_div_native_batch_norm_backward_threshold_backward_2.run(*args, 1536, 6272, grid=grid(1536), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_red_fused__native_batch_norm_legit_functional_div_native_batch_norm_backward_threshold_backward_2.benchmark_all_configs(*args, 1536, 6272, grid=grid(1536))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')


# kernel path: /tmp/torchinductor_shunting/kf/ckfaohmrzufzdx2fsmml5navfg2efvvxodj5sf5pujefshnggoot.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.convolution_backward, aten.div, aten.native_batch_norm_backward, aten.threshold_backward
# aten._native_batch_norm_legit_functional => convert_element_type_398
# aten.convolution_backward => convolution_backward
# aten.div => div
# aten.native_batch_norm_backward => convert_element_type_404, convert_element_type_406, mul_476, mul_477, sub_58, sub_60, sub_61
# aten.threshold_backward => scalar_tensor, where
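# Pointwise companion of the reduction above; per element it roughly applies
# the batch-norm backward formula (a sketch, with 1/N = 1/6272 =
# 0.00015943877551020407):
#   g          = torch.where(mask, 0.0, upstream / 49.0)
#   grad_input = (g - (x - mean) * invstd**2 * sum_gx / N - sum_g / N) * invstd * gamma
# and casts the result back to fp16 for the following convolution_backward.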
triton_poi_fused__native_batch_norm_legit_functional_convolution_backward_div_native_batch_norm_backward_threshold_backward_3 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[16777216], filename=__file__, meta={'signature': {0: '*i1', 1: '*fp16', 2: '*fp16', 3: '*fp32', 4: '*fp32', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: '*fp16', 9: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9), equal_to_1=())]})
@triton.jit
def triton_poi_fused__native_batch_norm_legit_functional_convolution_backward_div_native_batch_norm_backward_threshold_backward_3(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, in_ptr6, in_ptr7, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 9633792
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x3 = xindex
    x4 = (xindex // 49)
    x1 = (xindex // 49) % 1536
    tmp0 = tl.load(in_ptr0 + (x3), None)
    tmp2 = tl.load(in_ptr1 + (x4), None).to(tl.float32)
    tmp7 = tl.load(in_ptr2 + (x3), None).to(tl.float32)
    tmp9 = tl.load(in_ptr3 + (x1), None)
    tmp11 = tl.load(in_ptr4 + (x1), None)
    tmp14 = tl.load(in_ptr5 + (x1), None)
    tmp19 = tl.load(in_ptr6 + (x1), None)
    tmp22 = tl.load(in_ptr7 + (x1), None)
    tmp1 = 0.0
    tmp3 = 49.0
    tmp4 = tmp2 / tmp3
    tmp5 = tl.where(tmp0, tmp1, tmp4)
    tmp6 = tmp5.to(tl.float32)
    tmp8 = tmp7.to(tl.float32)
    tmp10 = tmp8 - tmp9
    tmp12 = 0.00015943877551020407
    tmp13 = tmp11 * tmp12
    tmp15 = tmp14 * tmp14
    tmp16 = tmp13 * tmp15
    tmp17 = tmp10 * tmp16
    tmp18 = tmp6 - tmp17
    tmp20 = tmp19 * tmp12
    tmp21 = tmp18 - tmp20
    tmp23 = tmp14 * tmp22
    tmp24 = tmp21 * tmp23
    tmp25 = tmp24.to(tl.float32)
    tl.store(out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp25, None)


def get_args():
    arg_0 = rand_strided((128, 1536, 7, 7), (75264, 49, 7, 1), device='cuda:0', dtype=torch.bool)
    arg_1 = rand_strided((128, 1536), (1536, 1), device='cuda:0', dtype=torch.float16)
    arg_2 = rand_strided((128, 1536, 7, 7), (75264, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    arg_3 = rand_strided((1, 1536, 1, 1), (1536, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    arg_4 = rand_strided((1536,), (1,), device='cuda:0', dtype=torch.float32)
    arg_5 = rand_strided((1536,), (1,), device='cuda:0', dtype=torch.float32)
    arg_6 = rand_strided((1536,), (1,), device='cuda:0', dtype=torch.float32)
    arg_7 = rand_strided((1536,), (1,), device='cuda:0', dtype=torch.float32)
    arg_8 = rand_strided((128, 1536, 7, 7), (75264, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7, arg_8,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__native_batch_norm_legit_functional_convolution_backward_div_native_batch_norm_backward_threshold_backward_3.run(*args, 9633792, grid=grid(9633792), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__native_batch_norm_legit_functional_convolution_backward_div_native_batch_norm_backward_threshold_backward_3.benchmark_all_configs(*args, 9633792, grid=grid(9633792))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')


# kernel path: /tmp/torchinductor_shunting/on/consb5x4mndq63g7yk4eommoi2zfnpkemgbkcrodivgrodwe4iek.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_407

triton_poi_fused__to_copy_4 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[524288], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_4(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 405504
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), None).to(tl.float32)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, None)


def get_args():
    arg_0 = rand_strided((1536, 264, 1, 1), (264, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((1536, 264, 1, 1), (264, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_4.run(*args, 405504, grid=grid(405504), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_4.benchmark_all_configs(*args, 405504, grid=grid(405504))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')


# kernel path: /tmp/torchinductor_shunting/aw/cawn5o3lch25tbuhghexsvhkzdksvz4ofqrpjmzgs2y6ldb5vypm.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.native_batch_norm_backward
# aten._native_batch_norm_legit_functional => convert_element_type_395
# aten.native_batch_norm_backward => convert_element_type_408, mul_479, mul_487, sub_62, sum_4, sum_5

triton_red_fused__native_batch_norm_legit_functional_native_batch_norm_backward_5 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@reduction(
    size_hints=[512, 8192],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp32', 6: '*fp32', 7: 'i32', 8: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 8), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_native_batch_norm_backward_5(in_ptr0, in_ptr1, in_ptr2, in_ptr3, out_ptr0, out_ptr1, out_ptr2, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
    xnumel = 264
    rnumel = 6272
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rbase = tl.arange(0, RBLOCK)[None, :]
    x0 = xindex
    _tmp2 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
    tmp5 = tl.load(in_ptr2 + (x0), xmask)
    _tmp8 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
    for roffset in range(0, rnumel, RBLOCK):
        rindex = roffset + rbase
        rmask = rindex < rnumel
        r1 = rindex % 49
        r2 = (rindex // 49)
        tmp0 = tl.load(in_ptr0 + (r1 + (49*x0) + (12936*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp3 = tl.load(in_ptr1 + (r1 + (49*x0) + (12936*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp1 = tmp0.to(tl.float32)
        _tmp2 = tl.where(rmask & xmask, _tmp2 + tmp1, _tmp2)
        tmp4 = tmp3.to(tl.float32)
        tmp6 = tmp4 - tmp5
        tmp7 = tmp1 * tmp6
        _tmp8 = tl.where(rmask & xmask, _tmp8 + tmp7, _tmp8)
    tmp2 = tl.sum(_tmp2, 1)[:, None]
    tl.store(out_ptr0 + x0, tmp2, xmask)
    tmp8 = tl.sum(_tmp8, 1)[:, None]
    tl.store(out_ptr1 + x0, tmp8, xmask)
    tmp9 = tl.load(in_ptr3 + (x0), xmask)
    tmp10 = tmp8 * tmp9
    tl.store(out_ptr2 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp10, xmask)


def get_args():
    arg_0 = rand_strided((128, 264, 7, 7), (12936, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 264, 7, 7), (12936, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    arg_2 = rand_strided((1, 264, 1, 1), (264, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    arg_3 = rand_strided((264,), (1,), device='cuda:0', dtype=torch.float32)
    arg_4 = rand_strided((264,), (1,), device='cuda:0', dtype=torch.float32)
    arg_5 = rand_strided((264,), (1,), device='cuda:0', dtype=torch.float32)
    arg_6 = rand_strided((264,), (1,), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_red_fused__native_batch_norm_legit_functional_native_batch_norm_backward_5.run(*args, 264, 6272, grid=grid(264), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_red_fused__native_batch_norm_legit_functional_native_batch_norm_backward_5.benchmark_all_configs(*args, 264, 6272, grid=grid(264))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')


# kernel path: /tmp/torchinductor_shunting/rk/crkzi7f3ev6knrr3kmx5lxzwin43giwhtaru4yx4tnkfj3r3ngqj.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.native_batch_norm_backward
# aten._native_batch_norm_legit_functional => convert_element_type_395
# aten.native_batch_norm_backward => convert_element_type_408, convert_element_type_410, mul_485, mul_486, sub_62, sub_64, sub_65

triton_poi_fused__native_batch_norm_legit_functional_native_batch_norm_backward_6 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[2097152], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp32', 6: '*fp32', 7: '*fp16', 8: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8), equal_to_1=())]})
@triton.jit
def triton_poi_fused__native_batch_norm_legit_functional_native_batch_norm_backward_6(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, in_ptr6, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 1655808
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x3 = xindex
    x1 = (xindex // 49) % 264
    tmp0 = tl.load(in_ptr0 + (x3), xmask).to(tl.float32)
    tmp2 = tl.load(in_ptr1 + (x3), xmask).to(tl.float32)
    tmp4 = tl.load(in_ptr2 + (x1), xmask)
    tmp6 = tl.load(in_ptr3 + (x1), xmask)
    tmp9 = tl.load(in_ptr4 + (x1), xmask)
    tmp14 = tl.load(in_ptr5 + (x1), xmask)
    tmp17 = tl.load(in_ptr6 + (x1), xmask)
    tmp1 = tmp0.to(tl.float32)
    tmp3 = tmp2.to(tl.float32)
    tmp5 = tmp3 - tmp4
    tmp7 = 0.00015943877551020407
    tmp8 = tmp6 * tmp7
    tmp10 = tmp9 * tmp9
    tmp11 = tmp8 * tmp10
    tmp12 = tmp5 * tmp11
    tmp13 = tmp1 - tmp12
    tmp15 = tmp14 * tmp7
    tmp16 = tmp13 - tmp15
    tmp18 = tmp9 * tmp17
    tmp19 = tmp16 * tmp18
    tmp20 = tmp19.to(tl.float32)
    tl.store(out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp20, xmask)


def get_args():
    arg_0 = rand_strided((128, 264, 7, 7), (12936, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 264, 7, 7), (12936, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    arg_2 = rand_strided((1, 264, 1, 1), (264, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    arg_3 = rand_strided((264,), (1,), device='cuda:0', dtype=torch.float32)
    arg_4 = rand_strided((264,), (1,), device='cuda:0', dtype=torch.float32)
    arg_5 = rand_strided((264,), (1,), device='cuda:0', dtype=torch.float32)
    arg_6 = rand_strided((264,), (1,), device='cuda:0', dtype=torch.float32)
    arg_7 = rand_strided((128, 264, 7, 7), (12936, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__native_batch_norm_legit_functional_native_batch_norm_backward_6.run(*args, 1655808, grid=grid(1655808), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__native_batch_norm_legit_functional_native_batch_norm_backward_6.benchmark_all_configs(*args, 1655808, grid=grid(1655808))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')


# kernel path: /tmp/torchinductor_shunting/sf/csf7df3ryvp5mqalyhiqr2oqixspuwgupmdvdghhbdv4hoavgvib.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_411

triton_poi_fused__to_copy_7 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[131072], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_7(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 104544
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)


def get_args():
    arg_0 = rand_strided((132, 792, 1, 1), (792, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((132, 792, 1, 1), (792, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_7.run(*args, 104544, grid=grid(104544), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_7.benchmark_all_configs(*args, 104544, grid=grid(104544))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')


# kernel path: /tmp/torchinductor_shunting/ph/cphopjqs73kagdjs5yonh2gxou75xcoyv4vg5cyr3nkxcgkuz4lp.py
# Original ATen: aten.cat
# aten.cat => cat_41
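# Copies one 792-channel operand into its slice of a concatenated output
# buffer; roughly out[:, :792] = inp for a channel-dim torch.cat (a sketch:
# the output arg is a 792-channel view whose channel stride spans the full
# 1584-channel buffer). The companion kernel below fills the other half.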
triton_poi_fused_cat_8 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[8388608], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_cat_8(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 4967424
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x2 = xindex
    x0 = xindex % 38808
    x1 = (xindex // 38808)
    tmp0 = tl.load(in_ptr0 + (x2), xmask).to(tl.float32)
    tl.store(out_ptr0 + (x0 + (77616*x1) + tl.zeros([XBLOCK], tl.int32)), tmp0, xmask)


def get_args():
    arg_0 = rand_strided((128, 792, 7, 7), (38808, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 792, 7, 7), (77616, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused_cat_8.run(*args, 4967424, grid=grid(4967424), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused_cat_8.benchmark_all_configs(*args, 4967424, grid=grid(4967424))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')


# kernel path: /tmp/torchinductor_shunting/cd/ccdy3uu6zwicm5gp4nqvsu7scphal4x56e6djga5sd2lge4kiyge.py
# Original ATen: aten.cat
# aten.cat => cat_41

triton_poi_fused_cat_9 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[8388608], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_cat_9(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 4967424
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x2 = xindex
    x0 = xindex % 38808
    x1 = (xindex // 38808)
    tmp0 = tl.load(in_ptr0 + (x2), xmask).to(tl.float32)
    tl.store(out_ptr0 + (x0 + (77616*x1) + tl.zeros([XBLOCK], tl.int32)), tmp0, xmask)


def get_args():
    arg_0 = rand_strided((128, 792, 7, 7), (38808, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 792, 7, 7), (77616, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused_cat_9.run(*args, 4967424, grid=grid(4967424), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused_cat_9.benchmark_all_configs(*args, 4967424, grid=grid(4967424))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')


# kernel path: /tmp/torchinductor_shunting/nt/cnt4hzkf7uueins7in5idjrq44bsdsgn6e67gfrc7gdtwfyotkdg.py
# Original ATen: aten.mul, aten.sigmoid, aten.sigmoid_backward, aten.silu, aten.sum
# aten.mul => mul_488
# aten.sigmoid => sigmoid_63
# aten.sigmoid_backward => convert_element_type_413, convert_element_type_414, convert_element_type_415, mul_490, mul_491, sub_66
# aten.silu => convert_element_type_385, convert_element_type_386, mul_453, sigmoid_61
# aten.sum => sum_6
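# Looks like a squeeze-excite backward head; per (n, c) it roughly computes
# (a sketch; `gate` names the pre-sigmoid SE logit and is illustrative):
#   t   = (grad * F.silu(x)).sum(dim=(2, 3), keepdim=True)      # (128, 1584, 1, 1)
#   out = t * torch.sigmoid(gate) * (1 - torch.sigmoid(gate))   # sigmoid_backward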
triton_per_fused_mul_sigmoid_sigmoid_backward_silu_sum_10 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import persistent_reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@persistent_reduction(
    size_hints=[262144, 64],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp16', 4: 'i32', 5: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4), equal_to_1=())]}
)
@triton.jit
def triton_per_fused_mul_sigmoid_sigmoid_backward_silu_sum_10(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, xnumel, rnumel, XBLOCK : tl.constexpr):
    xnumel = 202752
    rnumel = 49
    RBLOCK: tl.constexpr = 64
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rindex = tl.arange(0, RBLOCK)[None, :]
    rmask = rindex < rnumel
    r1 = rindex
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (r1 + (49*x0)), rmask, other=0).to(tl.float32)
    tmp1 = tl.load(in_ptr1 + (r1 + (49*x0)), rmask, other=0).to(tl.float32)
    tmp11 = tl.load(in_ptr2 + (x0), None).to(tl.float32)
    tmp2 = tmp1.to(tl.float32)
    tmp3 = tl.sigmoid(tmp2)
    tmp4 = tmp2 * tmp3
    tmp5 = tmp4.to(tl.float32)
    tmp6 = tmp0 * tmp5
    tmp8 = tl.where(rmask, tmp6, 0)
    tmp9 = tl.sum(tmp8, 1)[:, None]
    tmp10 = tmp9.to(tl.float32)
    tmp12 = tl.sigmoid(tmp11)
    tmp13 = tmp12.to(tl.float32)
    tmp14 = 1.0
    tmp15 = tmp14 - tmp13
    tmp16 = tmp13 * tmp15
    tmp17 = tmp10 * tmp16
    tmp18 = tmp17.to(tl.float32)
    tl.store(in_out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp18, None)


def get_args():
    arg_0 = rand_strided((128, 1584, 1, 1), (1584, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 1584, 7, 7), (77616, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    arg_2 = rand_strided((128, 1584, 7, 7), (77616, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    arg_3 = rand_strided((128, 1584, 1, 1), (1584, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1, arg_2, arg_3,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_per_fused_mul_sigmoid_sigmoid_backward_silu_sum_10.run(*args, 202752, 49, grid=grid(202752), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_per_fused_mul_sigmoid_sigmoid_backward_silu_sum_10.benchmark_all_configs(*args, 202752, 49, grid=grid(202752))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')


# kernel path: /tmp/torchinductor_shunting/wr/cwr445l7k4eo6gcjw7l27usycvmf5jtqzbzk4btgcg2wqemciphj.py
# Original ATen: aten._to_copy, aten.convolution_backward
# aten._to_copy => convert_element_type_417
# aten.convolution_backward => sum_7
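# Roughly the convolution bias gradient (a sketch):
#   out = g.to(torch.float32).sum(dim=0)   # g: (128, 1584, 1, 1) fp16 -> (1584,) fp32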
triton_per_fused__to_copy_convolution_backward_11 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import persistent_reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@persistent_reduction(
    size_hints=[2048, 128],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32', 3: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3), equal_to_1=())]}
)
@triton.jit
def triton_per_fused__to_copy_convolution_backward_11(in_ptr0, out_ptr1, xnumel, rnumel, XBLOCK : tl.constexpr):
    xnumel = 1584
    rnumel = 128
    RBLOCK: tl.constexpr = 128
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rindex = tl.arange(0, RBLOCK)[None, :]
    rmask = rindex < rnumel
    r1 = rindex
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0 + (1584*r1)), rmask & xmask, other=0).to(tl.float32)
    tmp2 = tl.where(rmask & xmask, tmp0, 0)
    tmp3 = tl.sum(tmp2, 1)[:, None]
    tmp4 = tmp3.to(tl.float32)
    tl.store(out_ptr1 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp4, xmask)


def get_args():
    arg_0 = rand_strided((128, 1584, 1, 1), (1584, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((1584,), (1,), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_per_fused__to_copy_convolution_backward_11.run(*args, 1584, 128, grid=grid(1584), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_per_fused__to_copy_convolution_backward_11.benchmark_all_configs(*args, 1584, 128, grid=grid(1584))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')


# kernel path: /tmp/torchinductor_shunting/vy/cvylenxoapd6e7xfcegwowbaxf7tmjjits4rifowpwoo5xytchjc.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_416

triton_poi_fused__to_copy_12 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[262144], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_12(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 209088
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)


def get_args():
    arg_0 = rand_strided((1584, 132, 1, 1), (132, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((1584, 132, 1, 1), (132, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_12.run(*args, 209088, grid=grid(209088), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_12.benchmark_all_configs(*args, 209088, grid=grid(209088))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')


# kernel path: /tmp/torchinductor_shunting/ku/ckublkguiu5gw5vxkulgbwk62mp5fhr6syry2wrk56p3zdpzupzp.py
# Original ATen: aten.add, aten.clone, aten.fill, aten.mul, aten.sigmoid, aten.sub
# aten.add => add_304
# aten.clone => clone_47
# aten.fill => full_like
# aten.mul => mul_492, mul_493, mul_494
# aten.sigmoid => sigmoid_64
# aten.sub => sub_67
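# SiLU backward; per element, roughly (a sketch), computed in place in
# in_out_ptr0:
#   out = grad * sigmoid(x) * (1 + x * (1 - sigmoid(x)))
# i.e. grad times d/dx [x * sigmoid(x)].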
triton_poi_fused_add_clone_fill_mul_sigmoid_sub_13 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[32768], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_add_clone_fill_mul_sigmoid_sub_13(in_out_ptr0, in_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 16896
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_out_ptr0 + (x0), xmask).to(tl.float32)
    tmp1 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
    tmp2 = tl.sigmoid(tmp1)
    tmp3 = 1.0
    tmp4 = tmp3 - tmp2
    tmp5 = tmp1 * tmp4
    tmp6 = tmp5 + tmp3
    tmp7 = tmp2 * tmp6
    tmp8 = tmp0 * tmp7
    tl.store(in_out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp8, xmask)


def get_args():
    arg_0 = rand_strided((128, 132, 1, 1), (132, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 132, 1, 1), (132, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused_add_clone_fill_mul_sigmoid_sub_13.run(*args, 16896, grid=grid(16896), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused_add_clone_fill_mul_sigmoid_sub_13.benchmark_all_configs(*args, 16896, grid=grid(16896))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')


# kernel path: /tmp/torchinductor_shunting/4b/c4bhcaxi2hruo5hil4xfrkdmmucojrb5nsitzxo2oy2a6qcdg3ti.py
# Original ATen: aten._to_copy, aten.convolution_backward
# aten._to_copy => convert_element_type_419
# aten.convolution_backward => sum_8

triton_per_fused__to_copy_convolution_backward_14 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import persistent_reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@persistent_reduction(
    size_hints=[256, 128],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32', 3: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 3), equal_to_1=())]}
)
@triton.jit
def triton_per_fused__to_copy_convolution_backward_14(in_ptr0, out_ptr1, xnumel, rnumel, XBLOCK : tl.constexpr):
    xnumel = 132
    rnumel = 128
    RBLOCK: tl.constexpr = 128
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rindex = tl.arange(0, RBLOCK)[None, :]
    rmask = rindex < rnumel
    r1 = rindex
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0 + (132*r1)), rmask & xmask, other=0).to(tl.float32)
    tmp2 = tl.where(rmask & xmask, tmp0, 0)
    tmp3 = tl.sum(tmp2, 1)[:, None]
    tmp4 = tmp3.to(tl.float32)
    tl.store(out_ptr1 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp4, xmask)


def get_args():
    arg_0 = rand_strided((128, 132, 1, 1), (132, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((132,), (1,), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_per_fused__to_copy_convolution_backward_14.run(*args, 132, 128, grid=grid(132), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_per_fused__to_copy_convolution_backward_14.benchmark_all_configs(*args, 132, 128, grid=grid(132))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')


# kernel path: /tmp/torchinductor_shunting/mo/cmoc7y64jtpxuncjb73voe54cmkijbltrpgf5p7stvr4mny7avka.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_418

triton_poi_fused__to_copy_15 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[262144], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_15(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 209088
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)


def get_args():
    arg_0 = rand_strided((132, 1584, 1, 1), (1584, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((132, 1584, 1, 1), (1584, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_15.run(*args, 209088, grid=grid(209088), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_15.benchmark_all_configs(*args, 209088, grid=grid(209088))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')


# kernel path: /tmp/torchinductor_shunting/et/cetgnuzvhwgtkuxuh7tdewyrx6qabbxsldzd2qnxluqsqq7zwwfh.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.add, aten.clone, aten.div, aten.fill, aten.mul, aten.native_batch_norm_backward, aten.sigmoid, aten.sub
# aten._native_batch_norm_legit_functional => convert_element_type_383
# aten.add => add_305, add_306
# aten.clone => clone_46
# aten.div => div_1
# aten.fill => full_like_1
# aten.mul => mul_489, mul_495, mul_496, mul_497
# aten.native_batch_norm_backward => convert_element_type_420, mul_498, mul_506, sub_69, sum_10, sum_9
# aten.sigmoid => sigmoid_63, sigmoid_65
# aten.sub => sub_68
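# Fuses the SE re-weighting and SiLU backward into the batch-norm backward
# sums; per channel it roughly computes (a sketch; names are illustrative):
#   g        = (grad * sigmoid(gate) + se_grad / 49.0) * sigmoid(x) * (1 + x * (1 - sigmoid(x)))
#   out_ptr0 = g.sum(dim=(0, 2, 3))
#   out_ptr1 = (g * (x_in - mean)).sum(dim=(0, 2, 3))
#   out_ptr2 = out_ptr1 * invstd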
triton_red_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_16 = async_compile.triton(''' | |
import triton | |
import triton.language as tl | |
from torch._inductor.ir import ReductionHint | |
from torch._inductor.ir import TileHint | |
from torch._inductor.triton_heuristics import reduction | |
from torch._inductor.utils import instance_descriptor | |
from torch._dynamo.testing import rand_strided | |
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
import torch | |
from torch._inductor.triton_heuristics import grid | |
@reduction( | |
size_hints=[2048, 8192], | |
reduction_hint=ReductionHint.INNER, | |
filename=__file__, | |
meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp16', 4: '*fp16', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: '*fp32', 9: '*fp32', 10: 'i32', 11: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11), equal_to_1=())]} | |
) | |
@triton.jit | |
def triton_red_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_16(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, in_ptr6, out_ptr0, out_ptr1, out_ptr2, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
    xnumel = 1584
    rnumel = 6272
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rbase = tl.arange(0, RBLOCK)[None, :]
    x0 = xindex
    _tmp17 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
    tmp20 = tl.load(in_ptr5 + (x0), xmask)
    _tmp23 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
    for roffset in range(0, rnumel, RBLOCK):
        rindex = roffset + rbase
        rmask = rindex < rnumel
        r1 = rindex % 49
        r2 = (rindex // 49)
        tmp0 = tl.load(in_ptr0 + (r1 + (49*x0) + (77616*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp1 = tl.load(in_ptr1 + (x0 + (1584*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp4 = tl.load(in_ptr2 + (x0 + (1584*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp8 = tl.load(in_ptr3 + (r1 + (49*x0) + (77616*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp18 = tl.load(in_ptr4 + (r1 + (49*x0) + (77616*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp2 = tl.sigmoid(tmp1)
        tmp3 = tmp0 * tmp2
        tmp5 = 49.0
        tmp6 = tmp4 / tmp5
        tmp7 = tmp3 + tmp6
        tmp9 = tl.sigmoid(tmp8)
        tmp10 = 1.0
        tmp11 = tmp10 - tmp9
        tmp12 = tmp8 * tmp11
        tmp13 = tmp12 + tmp10
        tmp14 = tmp9 * tmp13
        tmp15 = tmp7 * tmp14
        tmp16 = tmp15.to(tl.float32)
        _tmp17 = tl.where(rmask & xmask, _tmp17 + tmp16, _tmp17)
        tmp19 = tmp18.to(tl.float32)
        tmp21 = tmp19 - tmp20
        tmp22 = tmp16 * tmp21
        _tmp23 = tl.where(rmask & xmask, _tmp23 + tmp22, _tmp23)
    tmp17 = tl.sum(_tmp17, 1)[:, None]
    tl.store(out_ptr0 + x0, tmp17, xmask)
    tmp23 = tl.sum(_tmp23, 1)[:, None]
    tl.store(out_ptr1 + x0, tmp23, xmask)
    tmp24 = tl.load(in_ptr6 + (x0), xmask)
    tmp25 = tmp23 * tmp24
    tl.store(out_ptr2 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp25, xmask)

def get_args():
    arg_0 = rand_strided((128, 1584, 7, 7), (77616, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 1584, 1, 1), (1584, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    arg_2 = rand_strided((128, 1584, 1, 1), (1584, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    arg_3 = rand_strided((128, 1584, 7, 7), (77616, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    arg_4 = rand_strided((128, 1584, 7, 7), (77616, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    arg_5 = rand_strided((1, 1584, 1, 1), (1584, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    arg_6 = rand_strided((1584,), (1,), device='cuda:0', dtype=torch.float32)
    arg_7 = rand_strided((1584,), (1,), device='cuda:0', dtype=torch.float32)
    arg_8 = rand_strided((1584,), (1,), device='cuda:0', dtype=torch.float32)
    arg_9 = rand_strided((1584,), (1,), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7, arg_8, arg_9,

def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_red_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_16.run(*args, 1584, 6272, grid=grid(1584), stream=stream0)

def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_red_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_16.benchmark_all_configs(*args, 1584, 6272, grid=grid(1584))

if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
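# Note: this reduction appears to fuse the SiLU/Swish backward with the
# per-channel sums of batch-norm backward. Per channel c (= x0), over the
# 128 * 7 * 7 = 6272 reduction elements, it computes (a sketch, assuming
# in_ptr5 holds the saved mean and in_ptr6 the saved invstd):
#   g = (dout * sigmoid(se_gate) + d_pool / 49) * silu'(x)
#   silu'(x) = sigmoid(x) * (1 + x * (1 - sigmoid(x)))
#   out_ptr0[c] = sum(g)                    # grad-bias-style sum
#   out_ptr1[c] = sum(g * (x - mean[c]))    # dot with centered input
#   out_ptr2[c] = out_ptr1[c] * invstd[c]   # grad-weight-style term
# The / 49 term is the backward of a 7x7 global average pool (the SE branch).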
# kernel path: /tmp/torchinductor_shunting/vq/cvqzgmyak74w6zf55f32ognvdbwijbc4xvxna4rdk6flp32c4rx3.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.add, aten.clone, aten.div, aten.fill, aten.mul, aten.native_batch_norm_backward, aten.sigmoid, aten.sub
# aten._native_batch_norm_legit_functional => convert_element_type_383
# aten.add => add_305, add_306
# aten.clone => clone_46
# aten.div => div_1
# aten.fill => full_like_1
# aten.mul => mul_489, mul_495, mul_496, mul_497
# aten.native_batch_norm_backward => convert_element_type_420, mul_504, sub_69, sub_71, sub_72
# aten.sigmoid => sigmoid_63, sigmoid_65
# aten.sub => sub_68
triton_poi_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_17 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[16777216], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp16', 4: '*fp16', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: '*fp32', 9: '*fp32', 10: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10), equal_to_1=())]})
@triton.jit
def triton_poi_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_17(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, in_ptr6, in_ptr7, in_ptr8, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 9934848
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x3 = xindex
    x4 = (xindex // 49)
    x1 = (xindex // 49) % 1584
    tmp0 = tl.load(in_ptr0 + (x3), None).to(tl.float32)
    tmp1 = tl.load(in_ptr1 + (x4), None).to(tl.float32)
    tmp4 = tl.load(in_ptr2 + (x4), None).to(tl.float32)
    tmp8 = tl.load(in_ptr3 + (x3), None).to(tl.float32)
    tmp17 = tl.load(in_ptr4 + (x3), None).to(tl.float32)
    tmp19 = tl.load(in_ptr5 + (x1), None)
    tmp21 = tl.load(in_ptr6 + (x1), None)
    tmp24 = tl.load(in_ptr7 + (x1), None)
    tmp29 = tl.load(in_ptr8 + (x1), None)
    tmp2 = tl.sigmoid(tmp1)
    tmp3 = tmp0 * tmp2
    tmp5 = 49.0
    tmp6 = tmp4 / tmp5
    tmp7 = tmp3 + tmp6
    tmp9 = tl.sigmoid(tmp8)
    tmp10 = 1.0
    tmp11 = tmp10 - tmp9
    tmp12 = tmp8 * tmp11
    tmp13 = tmp12 + tmp10
    tmp14 = tmp9 * tmp13
    tmp15 = tmp7 * tmp14
    tmp16 = tmp15.to(tl.float32)
    tmp18 = tmp17.to(tl.float32)
    tmp20 = tmp18 - tmp19
    tmp22 = 0.00015943877551020407
    tmp23 = tmp21 * tmp22
    tmp25 = tmp24 * tmp24
    tmp26 = tmp23 * tmp25
    tmp27 = tmp20 * tmp26
    tmp28 = tmp16 - tmp27
    tmp30 = tmp29 * tmp22
    tmp31 = tmp28 - tmp30
    tl.store(out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp31, None)

def get_args():
    arg_0 = rand_strided((128, 1584, 7, 7), (77616, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 1584, 1, 1), (1584, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    arg_2 = rand_strided((128, 1584, 1, 1), (1584, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    arg_3 = rand_strided((128, 1584, 7, 7), (77616, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    arg_4 = rand_strided((128, 1584, 7, 7), (77616, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    arg_5 = rand_strided((1, 1584, 1, 1), (1584, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    arg_6 = rand_strided((1584,), (1,), device='cuda:0', dtype=torch.float32)
    arg_7 = rand_strided((1584,), (1,), device='cuda:0', dtype=torch.float32)
    arg_8 = rand_strided((1584,), (1,), device='cuda:0', dtype=torch.float32)
    arg_9 = rand_strided((128, 1584, 7, 7), (77616, 49, 7, 1), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7, arg_8, arg_9,

def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_17.run(*args, 9934848, grid=grid(9934848), stream=stream0)

def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_17.benchmark_all_configs(*args, 9934848, grid=grid(9934848))

if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
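# Note: pointwise second stage of the same fused backward. With
# c = 1/6272 = 0.00015943877551020407 (6272 = 128 * 49 elements per channel),
# it appears to apply the standard batch-norm backward identity -- a sketch,
# where sum_g and sum_gx are the two per-channel sums produced by kernel 16:
#   out[i] = g[i] - (x[i] - mean) * invstd**2 * c * sum_gx - c * sum_g
# The remaining per-channel scale by invstd * weight seems to be deferred to
# the per-slice convolution_backward kernels that consume this fp32 buffer.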
# kernel path: /tmp/torchinductor_shunting/4k/c4k5hrio7ng5ql4zfsckty7aijjdguidt3u7uu2czayedkp6afcd.py
# Original ATen: aten.convolution_backward
# aten.convolution_backward => convolution_backward_5
triton_poi_fused_convolution_backward_18 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp32', 2: '*fp32', 3: '*fp16', 4: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4), equal_to_1=())]})
@triton.jit
def triton_poi_fused_convolution_backward_18(in_ptr0, in_ptr1, in_ptr2, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 2483712
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x2 = (xindex // 19404)
    x3 = xindex % 19404
    x1 = (xindex // 49) % 396
    x4 = xindex
    tmp0 = tl.load(in_ptr0 + (58212 + x3 + (77616*x2)), xmask)
    tmp1 = tl.load(in_ptr1 + (1188 + x1), xmask)
    tmp2 = tl.load(in_ptr2 + (1188 + x1), xmask)
    tmp3 = tmp1 * tmp2
    tmp4 = tmp0 * tmp3
    tmp5 = tmp4.to(tl.float32)
    tl.store(out_ptr0 + (x4 + tl.zeros([XBLOCK], tl.int32)), tmp5, xmask)

def get_args():
    arg_0 = rand_strided((128, 1584, 7, 7), (77616, 49, 7, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((1584,), (1,), device='cuda:0', dtype=torch.float32)
    arg_2 = rand_strided((1584,), (1,), device='cuda:0', dtype=torch.float32)
    arg_3 = rand_strided((128, 396, 7, 7), (19404, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1, arg_2, arg_3,

def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused_convolution_backward_18.run(*args, 2483712, grid=grid(2483712), stream=stream0)

def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused_convolution_backward_18.benchmark_all_configs(*args, 2483712, grid=grid(2483712))

if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
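# Note: this kernel and its three siblings below (channel-base offsets
# 58212 = 1188*49, 38808 = 792*49, 19404 = 396*49, and 0) appear to slice the
# 1584-channel gradient into four 396-channel groups (1584 = 4 * 396), finish
# the batch-norm backward by scaling each channel with in_ptr1[c] * in_ptr2[c]
# (plausibly invstd * weight), and cast to fp16 as the grad_output of one
# grouped depthwise convolution_backward. The four fp16->fp32 weight casts
# interleaved below, with shapes (396,1,9,9) through (396,1,3,3), suggest a
# MixConv-style block with 9x9/7x7/5x5/3x3 depthwise kernels, though the
# model identity is an inference from shapes, not stated in this file.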
# kernel path: /tmp/torchinductor_shunting/le/clevrgmaajodoycxcxjtmbwkkhfu2zyqxjmzkxois2uwwej5n47c.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_423
triton_poi_fused__to_copy_19 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[32768], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_19(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 32076
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)

def get_args():
    arg_0 = rand_strided((396, 1, 9, 9), (81, 81, 9, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((396, 1, 9, 9), (81, 81, 9, 1), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1,

def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_19.run(*args, 32076, grid=grid(32076), stream=stream0)

def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_19.benchmark_all_configs(*args, 32076, grid=grid(32076))

if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
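# Note: a plain dtype conversion -- the fp16 depthwise weight (396, 1, 9, 9)
# is materialized in fp32 for the convolution_backward call. The double cast
# is redundant but harmless generated code: the load's .to(tl.float32)
# already upcasts, so the second .to(tl.float32) is a no-op. The same pattern
# repeats below for the (396,1,7,7), (396,1,5,5), (396,1,3,3) and
# (1584,264,1,1) weights.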
# kernel path: /tmp/torchinductor_shunting/yx/cyxchtoyxooc3k4ah4othvbsdd77dfkbeykdlyzawbwqv4zeq227.py
# Original ATen: aten.convolution_backward
# aten.convolution_backward => convolution_backward_6
triton_poi_fused_convolution_backward_20 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp32', 2: '*fp32', 3: '*fp16', 4: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4), equal_to_1=())]})
@triton.jit
def triton_poi_fused_convolution_backward_20(in_ptr0, in_ptr1, in_ptr2, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 2483712
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x2 = (xindex // 19404)
    x3 = xindex % 19404
    x1 = (xindex // 49) % 396
    x4 = xindex
    tmp0 = tl.load(in_ptr0 + (38808 + x3 + (77616*x2)), xmask)
    tmp1 = tl.load(in_ptr1 + (792 + x1), xmask)
    tmp2 = tl.load(in_ptr2 + (792 + x1), xmask)
    tmp3 = tmp1 * tmp2
    tmp4 = tmp0 * tmp3
    tmp5 = tmp4.to(tl.float32)
    tl.store(out_ptr0 + (x4 + tl.zeros([XBLOCK], tl.int32)), tmp5, xmask)

def get_args():
    arg_0 = rand_strided((128, 1584, 7, 7), (77616, 49, 7, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((1584,), (1,), device='cuda:0', dtype=torch.float32)
    arg_2 = rand_strided((1584,), (1,), device='cuda:0', dtype=torch.float32)
    arg_3 = rand_strided((128, 396, 7, 7), (19404, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1, arg_2, arg_3,

def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused_convolution_backward_20.run(*args, 2483712, grid=grid(2483712), stream=stream0)

def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused_convolution_backward_20.benchmark_all_configs(*args, 2483712, grid=grid(2483712))

if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/eb/cebicyjdxyev2ctnv4eqoljt3ynagd7ac2xafkomzznzw2akhxue.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_424
triton_poi_fused__to_copy_21 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[32768], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_21(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 19404
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)

def get_args():
    arg_0 = rand_strided((396, 1, 7, 7), (49, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((396, 1, 7, 7), (49, 49, 7, 1), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1,

def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_21.run(*args, 19404, grid=grid(19404), stream=stream0)

def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_21.benchmark_all_configs(*args, 19404, grid=grid(19404))

if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/n6/cn6zlzicefkme4u5z2u7vgzu7zs36znck62asy3a53dhtgwtnfey.py
# Original ATen: aten.convolution_backward
# aten.convolution_backward => convolution_backward_7
triton_poi_fused_convolution_backward_22 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp32', 2: '*fp32', 3: '*fp16', 4: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4), equal_to_1=())]})
@triton.jit
def triton_poi_fused_convolution_backward_22(in_ptr0, in_ptr1, in_ptr2, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 2483712
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x2 = (xindex // 19404)
    x3 = xindex % 19404
    x1 = (xindex // 49) % 396
    x4 = xindex
    tmp0 = tl.load(in_ptr0 + (19404 + x3 + (77616*x2)), xmask)
    tmp1 = tl.load(in_ptr1 + (396 + x1), xmask)
    tmp2 = tl.load(in_ptr2 + (396 + x1), xmask)
    tmp3 = tmp1 * tmp2
    tmp4 = tmp0 * tmp3
    tmp5 = tmp4.to(tl.float32)
    tl.store(out_ptr0 + (x4 + tl.zeros([XBLOCK], tl.int32)), tmp5, xmask)

def get_args():
    arg_0 = rand_strided((128, 1584, 7, 7), (77616, 49, 7, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((1584,), (1,), device='cuda:0', dtype=torch.float32)
    arg_2 = rand_strided((1584,), (1,), device='cuda:0', dtype=torch.float32)
    arg_3 = rand_strided((128, 396, 7, 7), (19404, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1, arg_2, arg_3,

def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused_convolution_backward_22.run(*args, 2483712, grid=grid(2483712), stream=stream0)

def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused_convolution_backward_22.benchmark_all_configs(*args, 2483712, grid=grid(2483712))

if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/5j/c5jiqoftvlcj3ob2vawgml2t4mfu74vouj3eu6ozelrenm5a7frc.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_425
triton_poi_fused__to_copy_23 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[16384], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_23(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 9900
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)

def get_args():
    arg_0 = rand_strided((396, 1, 5, 5), (25, 25, 5, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((396, 1, 5, 5), (25, 25, 5, 1), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1,

def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_23.run(*args, 9900, grid=grid(9900), stream=stream0)

def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_23.benchmark_all_configs(*args, 9900, grid=grid(9900))

if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/wd/cwd5qp57lsltp4fq4manegrstgskzimmaukeha4dh4fnqhtwaydf.py
# Original ATen: aten.convolution_backward
# aten.convolution_backward => convolution_backward_8
triton_poi_fused_convolution_backward_24 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp32', 2: '*fp32', 3: '*fp16', 4: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4), equal_to_1=())]})
@triton.jit
def triton_poi_fused_convolution_backward_24(in_ptr0, in_ptr1, in_ptr2, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 2483712
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x2 = (xindex // 19404)
    x3 = xindex % 19404
    x1 = (xindex // 49) % 396
    x4 = xindex
    tmp0 = tl.load(in_ptr0 + (x3 + (77616*x2)), xmask)
    tmp1 = tl.load(in_ptr1 + (x1), xmask)
    tmp2 = tl.load(in_ptr2 + (x1), xmask)
    tmp3 = tmp1 * tmp2
    tmp4 = tmp0 * tmp3
    tmp5 = tmp4.to(tl.float32)
    tl.store(out_ptr0 + (x4 + tl.zeros([XBLOCK], tl.int32)), tmp5, xmask)

def get_args():
    arg_0 = rand_strided((128, 1584, 7, 7), (77616, 49, 7, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((1584,), (1,), device='cuda:0', dtype=torch.float32)
    arg_2 = rand_strided((1584,), (1,), device='cuda:0', dtype=torch.float32)
    arg_3 = rand_strided((128, 396, 7, 7), (19404, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1, arg_2, arg_3,

def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused_convolution_backward_24.run(*args, 2483712, grid=grid(2483712), stream=stream0)

def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused_convolution_backward_24.benchmark_all_configs(*args, 2483712, grid=grid(2483712))

if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/wc/cwc7i4csdkpcwn5beyvhsemwhtbq7gxbezkowlgoa2xhw4y7krz2.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_426
triton_poi_fused__to_copy_25 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[4096], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_25(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 3564
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)

def get_args():
    arg_0 = rand_strided((396, 1, 3, 3), (9, 9, 3, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((396, 1, 3, 3), (9, 9, 3, 1), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1,

def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_25.run(*args, 3564, grid=grid(3564), stream=stream0)

def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_25.benchmark_all_configs(*args, 3564, grid=grid(3564))

if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/cp/ccpi5jja3x5efmayn24o6k4nqs6v2zc5jff75of6woeln6emtny2.py
# Original ATen: aten.cat
# aten.cat => cat_42
triton_poi_fused_cat_26 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_cat_26(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 2483712
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x2 = xindex
    x0 = xindex % 19404
    x1 = (xindex // 19404)
    tmp0 = tl.load(in_ptr0 + (x2), xmask).to(tl.float32)
    tl.store(out_ptr0 + (x0 + (77616*x1) + tl.zeros([XBLOCK], tl.int32)), tmp0, xmask)

def get_args():
    arg_0 = rand_strided((128, 396, 7, 7), (19404, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 396, 7, 7), (77616, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,

def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused_cat_26.run(*args, 2483712, grid=grid(2483712), stream=stream0)

def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused_cat_26.benchmark_all_configs(*args, 2483712, grid=grid(2483712))

if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/rg/crgc5ihkw5zltfd3tfchnahu2yx2wga4mambpixjsb7mmeynkmrh.py
# Original ATen: aten.cat
# aten.cat => cat_42
triton_poi_fused_cat_27 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_cat_27(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 2483712
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x2 = xindex
    x0 = xindex % 19404
    x1 = (xindex // 19404)
    tmp0 = tl.load(in_ptr0 + (x2), xmask).to(tl.float32)
    tl.store(out_ptr0 + (x0 + (77616*x1) + tl.zeros([XBLOCK], tl.int32)), tmp0, xmask)

def get_args():
    arg_0 = rand_strided((128, 396, 7, 7), (19404, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 396, 7, 7), (77616, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,

def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused_cat_27.run(*args, 2483712, grid=grid(2483712), stream=stream0)

def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused_cat_27.benchmark_all_configs(*args, 2483712, grid=grid(2483712))

if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
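# Note: kernels 26 and 27 are the aten.cat copies: each writes a
# (128, 396, 7, 7) slice into the (128, 1584, 7, 7) concat result, using a
# per-batch output stride of 77616 = 1584 * 49; the channel offset of each
# slice is baked into the output base pointer passed at call time. The two
# kernels are identical except for the alignment hints -- in 27 the output
# pointer drops out of divisible_by_16 (configs show (0, 2) instead of
# (0, 1, 2)), presumably because an odd slice offset breaks 16-byte alignment.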
# kernel path: /tmp/torchinductor_shunting/hq/chqi5nwugaohgw3qon7t3fytiqnvkhkemtghder4rphksh2dt7uc.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.mul, aten.native_batch_norm_backward
# aten._native_batch_norm_legit_functional => convert_element_type_375
# aten.mul => mul_509
# aten.native_batch_norm_backward => convert_element_type_427, mul_510, mul_518, sub_74, sum_11, sum_12
triton_red_fused__native_batch_norm_legit_functional_mul_native_batch_norm_backward_28 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@reduction(
    size_hints=[2048, 8192],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp32', 4: '*fp32', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: 'i32', 9: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_mul_native_batch_norm_backward_28(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, out_ptr0, out_ptr1, out_ptr2, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
    xnumel = 1584
    rnumel = 6272
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rbase = tl.arange(0, RBLOCK)[None, :]
    x0 = xindex
    _tmp4 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
    tmp7 = tl.load(in_ptr3 + (x0), xmask)
    _tmp10 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
    for roffset in range(0, rnumel, RBLOCK):
        rindex = roffset + rbase
        rmask = rindex < rnumel
        r1 = rindex % 49
        r2 = (rindex // 49)
        tmp0 = tl.load(in_ptr0 + (r1 + (49*x0) + (77616*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp1 = tl.load(in_ptr1 + (r1 + (49*x0) + (77616*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp5 = tl.load(in_ptr2 + (r1 + (49*x0) + (77616*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp2 = tmp0 * tmp1
        tmp3 = tmp2.to(tl.float32)
        _tmp4 = tl.where(rmask & xmask, _tmp4 + tmp3, _tmp4)
        tmp6 = tmp5.to(tl.float32)
        tmp8 = tmp6 - tmp7
        tmp9 = tmp3 * tmp8
        _tmp10 = tl.where(rmask & xmask, _tmp10 + tmp9, _tmp10)
    tmp4 = tl.sum(_tmp4, 1)[:, None]
    tl.store(out_ptr0 + x0, tmp4, xmask)
    tmp10 = tl.sum(_tmp10, 1)[:, None]
    tl.store(out_ptr1 + x0, tmp10, xmask)
    tmp11 = tl.load(in_ptr4 + (x0), xmask)
    tmp12 = tmp10 * tmp11
    tl.store(out_ptr2 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp12, xmask)

def get_args():
    arg_0 = rand_strided((128, 1584, 7, 7), (77616, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 1584, 7, 7), (77616, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    arg_2 = rand_strided((128, 1584, 7, 7), (77616, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    arg_3 = rand_strided((1, 1584, 1, 1), (1584, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    arg_4 = rand_strided((1584,), (1,), device='cuda:0', dtype=torch.float32)
    arg_5 = rand_strided((1584,), (1,), device='cuda:0', dtype=torch.float32)
    arg_6 = rand_strided((1584,), (1,), device='cuda:0', dtype=torch.float32)
    arg_7 = rand_strided((1584,), (1,), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7,

def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_red_fused__native_batch_norm_legit_functional_mul_native_batch_norm_backward_28.run(*args, 1584, 6272, grid=grid(1584), stream=stream0)

def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_red_fused__native_batch_norm_legit_functional_mul_native_batch_norm_backward_28.benchmark_all_configs(*args, 1584, 6272, grid=grid(1584))

if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
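# Note: same two-sum reduction shape as kernel 16, but here the upstream
# gradient is a plain elementwise product g = in_ptr0 * in_ptr1 (mul_509)
# rather than the SiLU backward. Per channel, out_ptr0/out_ptr1/out_ptr2
# again appear to be sum(g), sum(g * (x - mean)) and the latter scaled by
# invstd (in_ptr4), feeding the pointwise stage in kernel 29.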
# kernel path: /tmp/torchinductor_shunting/4i/c4iqo4zwyswlhwfnswymdycvvcfufifa735bkh3k5fjuqlgdvt3q.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.convolution_backward, aten.mul, aten.native_batch_norm_backward
# aten._native_batch_norm_legit_functional => convert_element_type_375
# aten.convolution_backward => convolution_backward_9
# aten.mul => mul_509
# aten.native_batch_norm_backward => convert_element_type_427, convert_element_type_429, mul_516, mul_517, sub_74, sub_76, sub_77
triton_poi_fused__native_batch_norm_legit_functional_convolution_backward_mul_native_batch_norm_backward_29 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[16777216], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp32', 4: '*fp32', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8), equal_to_1=())]})
@triton.jit
def triton_poi_fused__native_batch_norm_legit_functional_convolution_backward_mul_native_batch_norm_backward_29(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, in_ptr6, xnumel, XBLOCK : tl.constexpr):
    xnumel = 9934848
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x3 = xindex
    x1 = (xindex // 49) % 1584
    tmp0 = tl.load(in_out_ptr0 + (x3), None).to(tl.float32)
    tmp1 = tl.load(in_ptr0 + (x3), None).to(tl.float32)
    tmp4 = tl.load(in_ptr1 + (x3), None).to(tl.float32)
    tmp6 = tl.load(in_ptr2 + (x1), None)
    tmp8 = tl.load(in_ptr3 + (x1), None)
    tmp11 = tl.load(in_ptr4 + (x1), None)
    tmp16 = tl.load(in_ptr5 + (x1), None)
    tmp19 = tl.load(in_ptr6 + (x1), None)
    tmp2 = tmp0 * tmp1
    tmp3 = tmp2.to(tl.float32)
    tmp5 = tmp4.to(tl.float32)
    tmp7 = tmp5 - tmp6
    tmp9 = 0.00015943877551020407
    tmp10 = tmp8 * tmp9
    tmp12 = tmp11 * tmp11
    tmp13 = tmp10 * tmp12
    tmp14 = tmp7 * tmp13
    tmp15 = tmp3 - tmp14
    tmp17 = tmp16 * tmp9
    tmp18 = tmp15 - tmp17
    tmp20 = tmp11 * tmp19
    tmp21 = tmp18 * tmp20
    tmp22 = tmp21.to(tl.float32)
    tl.store(in_out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp22, None)

def get_args():
    arg_0 = rand_strided((128, 1584, 7, 7), (77616, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 1584, 7, 7), (77616, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    arg_2 = rand_strided((128, 1584, 7, 7), (77616, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    arg_3 = rand_strided((1, 1584, 1, 1), (1584, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    arg_4 = rand_strided((1584,), (1,), device='cuda:0', dtype=torch.float32)
    arg_5 = rand_strided((1584,), (1,), device='cuda:0', dtype=torch.float32)
    arg_6 = rand_strided((1584,), (1,), device='cuda:0', dtype=torch.float32)
    arg_7 = rand_strided((1584,), (1,), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7,

def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__native_batch_norm_legit_functional_convolution_backward_mul_native_batch_norm_backward_29.run(*args, 9934848, grid=grid(9934848), stream=stream0)

def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__native_batch_norm_legit_functional_convolution_backward_mul_native_batch_norm_backward_29.benchmark_all_configs(*args, 9934848, grid=grid(9934848))

if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
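# Note: pointwise stage matching kernel 28, fused into the
# convolution_backward epilogue. It normalizes with the same c = 1/6272
# constant, applies the final invstd * weight scale (tmp20), and writes fp16
# back in place into in_out_ptr0. That in-place mutation is why this
# benchmark, unlike the others, passes num_in_out_args=1: the mutated buffer
# is then counted for both its read and its write when estimating bandwidth.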
# kernel path: /tmp/torchinductor_shunting/np/cnpskkb6q6odcd2dqw7z7r4e7p4cgfh5g7s5kyt2zmqwzsyn7azj.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_430
triton_poi_fused__to_copy_30 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[524288], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_30(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 418176
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)

def get_args():
    arg_0 = rand_strided((1584, 264, 1, 1), (264, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((1584, 264, 1, 1), (264, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1,

def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_30.run(*args, 418176, grid=grid(418176), stream=stream0)

def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_30.benchmark_all_configs(*args, 418176, grid=grid(418176))

if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/ut/cuta2ldfadubtnwekrao3mdwks6gtwerw2rxvha7uaqnloaeqvfv.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.add, aten.native_batch_norm_backward
# aten._native_batch_norm_legit_functional => convert_element_type_372
# aten.add => add_308
# aten.native_batch_norm_backward => convert_element_type_431, mul_519, mul_527, sub_78, sum_13, sum_14
triton_red_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_31 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@reduction(
    size_hints=[512, 8192],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp32', 4: '*fp32', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: 'i32', 9: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 9), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_31(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, out_ptr0, out_ptr1, out_ptr2, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
    xnumel = 264
    rnumel = 6272
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rbase = tl.arange(0, RBLOCK)[None, :]
    x0 = xindex
    _tmp4 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
    tmp7 = tl.load(in_ptr3 + (x0), xmask)
    _tmp10 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
    for roffset in range(0, rnumel, RBLOCK):
        rindex = roffset + rbase
        rmask = rindex < rnumel
        r1 = rindex % 49
        r2 = (rindex // 49)
        tmp0 = tl.load(in_ptr0 + (r1 + (49*x0) + (12936*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp1 = tl.load(in_ptr1 + (r1 + (49*x0) + (12936*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp5 = tl.load(in_ptr2 + (r1 + (49*x0) + (12936*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp2 = tmp0 + tmp1
        tmp3 = tmp2.to(tl.float32)
        _tmp4 = tl.where(rmask & xmask, _tmp4 + tmp3, _tmp4)
        tmp6 = tmp5.to(tl.float32)
        tmp8 = tmp6 - tmp7
        tmp9 = tmp3 * tmp8
        _tmp10 = tl.where(rmask & xmask, _tmp10 + tmp9, _tmp10)
    tmp4 = tl.sum(_tmp4, 1)[:, None]
    tl.store(out_ptr0 + x0, tmp4, xmask)
    tmp10 = tl.sum(_tmp10, 1)[:, None]
    tl.store(out_ptr1 + x0, tmp10, xmask)
    tmp11 = tl.load(in_ptr4 + (x0), xmask)
    tmp12 = tmp10 * tmp11
    tl.store(out_ptr2 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp12, xmask)

def get_args():
    arg_0 = rand_strided((128, 264, 7, 7), (12936, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 264, 7, 7), (12936, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    arg_2 = rand_strided((128, 264, 7, 7), (12936, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    arg_3 = rand_strided((1, 264, 1, 1), (264, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    arg_4 = rand_strided((264,), (1,), device='cuda:0', dtype=torch.float32)
    arg_5 = rand_strided((264,), (1,), device='cuda:0', dtype=torch.float32)
    arg_6 = rand_strided((264,), (1,), device='cuda:0', dtype=torch.float32)
    arg_7 = rand_strided((264,), (1,), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7,

def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_red_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_31.run(*args, 264, 6272, grid=grid(264), stream=stream0)

def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_red_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_31.benchmark_all_configs(*args, 264, 6272, grid=grid(264))

if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
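# Note: batch-norm backward sums for a 264-channel layer whose incoming
# gradient is the sum of two branches (add_308, plausibly a residual/skip
# add): g = in_ptr0 + in_ptr1, then the usual per-channel sum(g) and
# sum(g * (x - mean)), with 12936 = 264 * 49 as the per-batch stride.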
# kernel path: /tmp/torchinductor_shunting/a3/ca33o5ukggucnj2hrtjksvzkzfzfe3v4qnwzz5ycytjwaauqcvop.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.add, aten.native_batch_norm_backward
# aten._native_batch_norm_legit_functional => convert_element_type_372
# aten.add => add_308
# aten.native_batch_norm_backward => convert_element_type_431, convert_element_type_433, mul_525, mul_526, sub_78, sub_80, sub_81
triton_poi_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_32 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[2097152], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp32', 4: '*fp32', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: '*fp16', 9: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9), equal_to_1=())]})
@triton.jit
def triton_poi_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_32(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, in_ptr6, in_ptr7, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 1655808
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x3 = xindex
    x1 = (xindex // 49) % 264
    tmp0 = tl.load(in_ptr0 + (x3), xmask).to(tl.float32)
    tmp1 = tl.load(in_ptr1 + (x3), xmask).to(tl.float32)
    tmp4 = tl.load(in_ptr2 + (x3), xmask).to(tl.float32)
    tmp6 = tl.load(in_ptr3 + (x1), xmask)
    tmp8 = tl.load(in_ptr4 + (x1), xmask)
    tmp11 = tl.load(in_ptr5 + (x1), xmask)
    tmp16 = tl.load(in_ptr6 + (x1), xmask)
    tmp19 = tl.load(in_ptr7 + (x1), xmask)
    tmp2 = tmp0 + tmp1
    tmp3 = tmp2.to(tl.float32)
    tmp5 = tmp4.to(tl.float32)
    tmp7 = tmp5 - tmp6
    tmp9 = 0.00015943877551020407
    tmp10 = tmp8 * tmp9
    tmp12 = tmp11 * tmp11
    tmp13 = tmp10 * tmp12
    tmp14 = tmp7 * tmp13
    tmp15 = tmp3 - tmp14
    tmp17 = tmp16 * tmp9
    tmp18 = tmp15 - tmp17
    tmp20 = tmp11 * tmp19
    tmp21 = tmp18 * tmp20
    tmp22 = tmp21.to(tl.float32)
    tl.store(out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp22, xmask)

def get_args():
    arg_0 = rand_strided((128, 264, 7, 7), (12936, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 264, 7, 7), (12936, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    arg_2 = rand_strided((128, 264, 7, 7), (12936, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    arg_3 = rand_strided((1, 264, 1, 1), (264, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    arg_4 = rand_strided((264,), (1,), device='cuda:0', dtype=torch.float32)
    arg_5 = rand_strided((264,), (1,), device='cuda:0', dtype=torch.float32)
    arg_6 = rand_strided((264,), (1,), device='cuda:0', dtype=torch.float32)
    arg_7 = rand_strided((264,), (1,), device='cuda:0', dtype=torch.float32)
    arg_8 = rand_strided((128, 264, 7, 7), (12936, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7, arg_8,

def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_32.run(*args, 1655808, grid=grid(1655808), stream=stream0)

def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_32.benchmark_all_configs(*args, 1655808, grid=grid(1655808))

if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
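# Note: pointwise counterpart of kernel 31 -- it recombines the two gradient
# branches, applies the batch-norm backward identity with the same c = 1/6272
# constant, scales by invstd * weight (tmp20), and stores the fp16 input
# gradient into a fresh buffer rather than in place.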
# kernel path: /tmp/torchinductor_shunting/hd/chdicakkg6zydfy4ll5rit2mnugrbccllk6kssfzzb7i55y4shxu.py | |
# Original ATen: aten._native_batch_norm_legit_functional, aten.add, aten.native_batch_norm_backward | |
# aten._native_batch_norm_legit_functional => convert_element_type_349 | |
# aten.add => add_308, add_313 | |
# aten.native_batch_norm_backward => convert_element_type_454, mul_559, mul_567, sub_94, sum_22, sum_23 | |
triton_red_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_33 = async_compile.triton(''' | |
import triton | |
import triton.language as tl | |
from torch._inductor.ir import ReductionHint | |
from torch._inductor.ir import TileHint | |
from torch._inductor.triton_heuristics import reduction | |
from torch._inductor.utils import instance_descriptor | |
from torch._dynamo.testing import rand_strided | |
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@reduction(
    size_hints=[512, 8192],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp16', 4: '*fp32', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: '*fp32', 9: 'i32', 10: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8, 10), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_33(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, out_ptr0, out_ptr1, out_ptr2, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
    xnumel = 264
    rnumel = 6272
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rbase = tl.arange(0, RBLOCK)[None, :]
    x0 = xindex
    _tmp6 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
    tmp9 = tl.load(in_ptr4 + (x0), xmask)
    _tmp12 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
    for roffset in range(0, rnumel, RBLOCK):
        rindex = roffset + rbase
        rmask = rindex < rnumel
        r1 = rindex % 49
        r2 = (rindex // 49)
        tmp0 = tl.load(in_ptr0 + (r1 + (49*x0) + (12936*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp1 = tl.load(in_ptr1 + (r1 + (49*x0) + (12936*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp3 = tl.load(in_ptr2 + (r1 + (49*x0) + (12936*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp7 = tl.load(in_ptr3 + (r1 + (49*x0) + (12936*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp2 = tmp0 + tmp1
        tmp4 = tmp2 + tmp3
        tmp5 = tmp4.to(tl.float32)
        _tmp6 = tl.where(rmask & xmask, _tmp6 + tmp5, _tmp6)
        tmp8 = tmp7.to(tl.float32)
        tmp10 = tmp8 - tmp9
        tmp11 = tmp5 * tmp10
        _tmp12 = tl.where(rmask & xmask, _tmp12 + tmp11, _tmp12)
    tmp6 = tl.sum(_tmp6, 1)[:, None]
    tl.store(out_ptr0 + x0, tmp6, xmask)
    tmp12 = tl.sum(_tmp12, 1)[:, None]
    tl.store(out_ptr1 + x0, tmp12, xmask)
    tmp13 = tl.load(in_ptr5 + (x0), xmask)
    tmp14 = tmp12 * tmp13
    tl.store(out_ptr2 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp14, xmask)
def get_args():
    arg_0 = rand_strided((128, 264, 7, 7), (12936, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 264, 7, 7), (12936, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    arg_2 = rand_strided((128, 264, 7, 7), (12936, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    arg_3 = rand_strided((128, 264, 7, 7), (12936, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    arg_4 = rand_strided((1, 264, 1, 1), (264, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    arg_5 = rand_strided((264,), (1,), device='cuda:0', dtype=torch.float32)
    arg_6 = rand_strided((264,), (1,), device='cuda:0', dtype=torch.float32)
    arg_7 = rand_strided((264,), (1,), device='cuda:0', dtype=torch.float32)
    arg_8 = rand_strided((264,), (1,), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7, arg_8,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_red_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_33.run(*args, 264, 6272, grid=grid(264), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_red_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_33.benchmark_all_configs(*args, 264, 6272, grid=grid(264))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
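# NOTE (editor): a minimal eager-mode sketch of what the fused reduction kernel
# above computes, assuming the NCHW fp16 tensors produced by its get_args().
# The names (grad_a/grad_b/grad_c, act, mean, invstd) are illustrative, not
# from the generated code: three residual gradient branches are summed, then
# the two per-channel batch-norm backward reductions are taken in fp32.
def _eager_sketch_33(grad_a, grad_b, grad_c, act, mean, invstd):
    dy = (grad_a + grad_b + grad_c).float()           # add_308/add_313, then cast
    xmu = act.float() - mean.view(1, -1, 1, 1)        # x - mean, broadcast per channel
    sum_dy = dy.sum(dim=(0, 2, 3))                    # out_ptr0
    sum_dy_xmu = (dy * xmu).sum(dim=(0, 2, 3))        # out_ptr1
    return sum_dy, sum_dy_xmu, sum_dy_xmu * invstd    # out_ptr2 = tmp12 * tmp13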
# kernel path: /tmp/torchinductor_shunting/bi/cbij3cjdqcet3nxl6uy6e6kdaxljedzlmbuwgif6yfmwghh7x42z.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.add, aten.native_batch_norm_backward
# aten._native_batch_norm_legit_functional => convert_element_type_349
# aten.add => add_308, add_313
# aten.native_batch_norm_backward => convert_element_type_454, mul_565, mul_566, sub_94, sub_96, sub_97
triton_poi_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_34 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[2097152], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp16', 4: '*fp32', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: '*fp32', 9: '*fp32', 10: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10), equal_to_1=())]})
@triton.jit
def triton_poi_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_34(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, in_ptr6, in_ptr7, in_ptr8, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 1655808
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x3 = xindex
    x1 = (xindex // 49) % 264
    tmp0 = tl.load(in_ptr0 + (x3), xmask).to(tl.float32)
    tmp1 = tl.load(in_ptr1 + (x3), xmask).to(tl.float32)
    tmp3 = tl.load(in_ptr2 + (x3), xmask).to(tl.float32)
    tmp6 = tl.load(in_ptr3 + (x3), xmask).to(tl.float32)
    tmp8 = tl.load(in_ptr4 + (x1), xmask)
    tmp10 = tl.load(in_ptr5 + (x1), xmask)
    tmp13 = tl.load(in_ptr6 + (x1), xmask)
    tmp18 = tl.load(in_ptr7 + (x1), xmask)
    tmp21 = tl.load(in_ptr8 + (x1), xmask)
    tmp2 = tmp0 + tmp1
    tmp4 = tmp2 + tmp3
    tmp5 = tmp4.to(tl.float32)
    tmp7 = tmp6.to(tl.float32)
    tmp9 = tmp7 - tmp8
    tmp11 = 0.00015943877551020407
    tmp12 = tmp10 * tmp11
    tmp14 = tmp13 * tmp13
    tmp15 = tmp12 * tmp14
    tmp16 = tmp9 * tmp15
    tmp17 = tmp5 - tmp16
    tmp19 = tmp18 * tmp11
    tmp20 = tmp17 - tmp19
    tmp22 = tmp13 * tmp21
    tmp23 = tmp20 * tmp22
    tl.store(out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp23, xmask)
def get_args():
    arg_0 = rand_strided((128, 264, 7, 7), (12936, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 264, 7, 7), (12936, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    arg_2 = rand_strided((128, 264, 7, 7), (12936, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    arg_3 = rand_strided((128, 264, 7, 7), (12936, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    arg_4 = rand_strided((1, 264, 1, 1), (264, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    arg_5 = rand_strided((264,), (1,), device='cuda:0', dtype=torch.float32)
    arg_6 = rand_strided((264,), (1,), device='cuda:0', dtype=torch.float32)
    arg_7 = rand_strided((264,), (1,), device='cuda:0', dtype=torch.float32)
    arg_8 = rand_strided((264,), (1,), device='cuda:0', dtype=torch.float32)
    arg_9 = rand_strided((128, 264, 7, 7), (12936, 49, 7, 1), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7, arg_8, arg_9,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_34.run(*args, 1655808, grid=grid(1655808), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_34.benchmark_all_configs(*args, 1655808, grid=grid(1655808))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
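# NOTE (editor): a hedged eager sketch of the grad_input formula the pointwise
# kernel above evaluates (the standard native_batch_norm_backward expression,
# consuming the two reduction results produced by kernel _33). The constant
# 0.00015943877551020407 is 1 / 6272 = 1 / (128 * 49), i.e. one over the
# number of elements per channel. Names below are illustrative.
def _eager_sketch_34(g0, g1, g2, act, mean, sum_dy_xmu, invstd, sum_dy, weight):
    n = 128 * 49
    dy = (g0 + g1 + g2).float()
    xmu = act.float() - mean.view(1, -1, 1, 1)
    proj = xmu * (sum_dy_xmu / n * invstd * invstd).view(1, -1, 1, 1)
    centered = dy - proj - (sum_dy / n).view(1, -1, 1, 1)
    return centered * (invstd * weight).view(1, -1, 1, 1)   # fp32 grad_input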
# kernel path: /tmp/torchinductor_shunting/bp/cbpqapqwkj3u7lbjggnc6sqbtdedbchks5ecsczowcbbvpcuekir.py
# Original ATen: aten.convolution_backward
# aten.convolution_backward => convolution_backward_19
triton_poi_fused_convolution_backward_35 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[1048576], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_convolution_backward_35(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 827904
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex % 6468
    x1 = (xindex // 6468)
    x2 = xindex
    tmp0 = tl.load(in_ptr0 + (6468 + x0 + (12936*x1)), xmask)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
    arg_0 = rand_strided((128, 264, 7, 7), (12936, 49, 7, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((128, 132, 7, 7), (6468, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused_convolution_backward_35.run(*args, 827904, grid=grid(827904), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused_convolution_backward_35.benchmark_all_configs(*args, 827904, grid=grid(827904))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
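# NOTE (editor): kernel _35 (and _36 below) is just a strided slice plus an
# fp16 cast: the load offset 6468 = 132 * 49 selects channels 132:264 of the
# (128, 264, 7, 7) gradient for one branch of a grouped convolution backward,
# while _36 starts at offset 0 for channels 0:132. A hedged eager equivalent:
def _eager_sketch_35(grad):  # grad: (128, 264, 7, 7) fp32
    return grad[:, 132:].to(torch.float16).contiguous()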
# kernel path: /tmp/torchinductor_shunting/vz/cvzmxblv4iaxi55wdmzkqdehpwn2a2c5kowaaewxyzttkqwmv7jg.py
# Original ATen: aten.convolution_backward
# aten.convolution_backward => convolution_backward_20
triton_poi_fused_convolution_backward_36 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[1048576], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_convolution_backward_36(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 827904
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex % 6468
    x1 = (xindex // 6468)
    x2 = xindex
    tmp0 = tl.load(in_ptr0 + (x0 + (12936*x1)), xmask)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
    arg_0 = rand_strided((128, 264, 7, 7), (12936, 49, 7, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((128, 132, 7, 7), (6468, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused_convolution_backward_36.run(*args, 827904, grid=grid(827904), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused_convolution_backward_36.benchmark_all_configs(*args, 827904, grid=grid(827904))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/sb/csb7dsvsxbdzlofqw5ldfragxm7igefprtjmu6wizsp5shcyxiyp.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.add, aten.native_batch_norm_backward
# aten._native_batch_norm_legit_functional => convert_element_type_326
# aten.add => add_308, add_313, add_318
# aten.native_batch_norm_backward => convert_element_type_477, mul_599, mul_607, sub_110, sum_31, sum_32
triton_red_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_37 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@reduction(
    size_hints=[512, 8192],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp16', 4: '*fp16', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: '*fp32', 9: '*fp32', 10: 'i32', 11: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 11), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_37(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, in_ptr6, out_ptr0, out_ptr1, out_ptr2, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
    xnumel = 264
    rnumel = 6272
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rbase = tl.arange(0, RBLOCK)[None, :]
    x0 = xindex
    _tmp8 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
    tmp11 = tl.load(in_ptr5 + (x0), xmask)
    _tmp14 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
    for roffset in range(0, rnumel, RBLOCK):
        rindex = roffset + rbase
        rmask = rindex < rnumel
        r1 = rindex % 49
        r2 = (rindex // 49)
        tmp0 = tl.load(in_ptr0 + (r1 + (49*x0) + (12936*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp1 = tl.load(in_ptr1 + (r1 + (49*x0) + (12936*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp3 = tl.load(in_ptr2 + (r1 + (49*x0) + (12936*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp5 = tl.load(in_ptr3 + (r1 + (49*x0) + (12936*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp9 = tl.load(in_ptr4 + (r1 + (49*x0) + (12936*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp2 = tmp0 + tmp1
        tmp4 = tmp2 + tmp3
        tmp6 = tmp4 + tmp5
        tmp7 = tmp6.to(tl.float32)
        _tmp8 = tl.where(rmask & xmask, _tmp8 + tmp7, _tmp8)
        tmp10 = tmp9.to(tl.float32)
        tmp12 = tmp10 - tmp11
        tmp13 = tmp7 * tmp12
        _tmp14 = tl.where(rmask & xmask, _tmp14 + tmp13, _tmp14)
    tmp8 = tl.sum(_tmp8, 1)[:, None]
    tl.store(out_ptr0 + x0, tmp8, xmask)
    tmp14 = tl.sum(_tmp14, 1)[:, None]
    tl.store(out_ptr1 + x0, tmp14, xmask)
    tmp15 = tl.load(in_ptr6 + (x0), xmask)
    tmp16 = tmp14 * tmp15
    tl.store(out_ptr2 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp16, xmask)
def get_args():
    arg_0 = rand_strided((128, 264, 7, 7), (12936, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 264, 7, 7), (12936, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    arg_2 = rand_strided((128, 264, 7, 7), (12936, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    arg_3 = rand_strided((128, 264, 7, 7), (12936, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    arg_4 = rand_strided((128, 264, 7, 7), (12936, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    arg_5 = rand_strided((1, 264, 1, 1), (264, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    arg_6 = rand_strided((264,), (1,), device='cuda:0', dtype=torch.float32)
    arg_7 = rand_strided((264,), (1,), device='cuda:0', dtype=torch.float32)
    arg_8 = rand_strided((264,), (1,), device='cuda:0', dtype=torch.float32)
    arg_9 = rand_strided((264,), (1,), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7, arg_8, arg_9,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_red_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_37.run(*args, 264, 6272, grid=grid(264), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_red_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_37.benchmark_all_configs(*args, 264, 6272, grid=grid(264))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
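# NOTE (editor): kernel _37 repeats the reduction pattern of _33 with one more
# residual branch (add_308/add_313/add_318: four gradient tensors summed)
# before the same per-channel sum_dy and sum(dy * (x - mean)) reductions; see
# the eager sketch after _33.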
# kernel path: /tmp/torchinductor_shunting/gx/cgx3bvwbhn4322e66zkvf7xmr6ch3vg75ylfjn7yzts64xagqr2l.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.add, aten.convolution_backward, aten.native_batch_norm_backward
# aten._native_batch_norm_legit_functional => convert_element_type_326
# aten.add => add_308, add_313, add_318
# aten.convolution_backward => convolution_backward_28
# aten.native_batch_norm_backward => convert_element_type_477, convert_element_type_479, mul_605, mul_606, sub_110, sub_112, sub_113
triton_poi_fused__native_batch_norm_legit_functional_add_convolution_backward_native_batch_norm_backward_38 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[2097152], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp16', 4: '*fp16', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: '*fp32', 9: '*fp32', 10: '*fp16', 11: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11), equal_to_1=())]})
@triton.jit
def triton_poi_fused__native_batch_norm_legit_functional_add_convolution_backward_native_batch_norm_backward_38(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, in_ptr6, in_ptr7, in_ptr8, in_ptr9, out_ptr1, xnumel, XBLOCK : tl.constexpr):
    xnumel = 1655808
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x3 = xindex
    x1 = (xindex // 49) % 264
    tmp0 = tl.load(in_ptr0 + (x3), xmask).to(tl.float32)
    tmp1 = tl.load(in_ptr1 + (x3), xmask).to(tl.float32)
    tmp3 = tl.load(in_ptr2 + (x3), xmask).to(tl.float32)
    tmp5 = tl.load(in_ptr3 + (x3), xmask).to(tl.float32)
    tmp8 = tl.load(in_ptr4 + (x3), xmask).to(tl.float32)
    tmp10 = tl.load(in_ptr5 + (x1), xmask)
    tmp12 = tl.load(in_ptr6 + (x1), xmask)
    tmp15 = tl.load(in_ptr7 + (x1), xmask)
    tmp20 = tl.load(in_ptr8 + (x1), xmask)
    tmp23 = tl.load(in_ptr9 + (x1), xmask)
    tmp2 = tmp0 + tmp1
    tmp4 = tmp2 + tmp3
    tmp6 = tmp4 + tmp5
    tmp7 = tmp6.to(tl.float32)
    tmp9 = tmp8.to(tl.float32)
    tmp11 = tmp9 - tmp10
    tmp13 = 0.00015943877551020407
    tmp14 = tmp12 * tmp13
    tmp16 = tmp15 * tmp15
    tmp17 = tmp14 * tmp16
    tmp18 = tmp11 * tmp17
    tmp19 = tmp7 - tmp18
    tmp21 = tmp20 * tmp13
    tmp22 = tmp19 - tmp21
    tmp24 = tmp15 * tmp23
    tmp25 = tmp22 * tmp24
    tmp26 = tmp25.to(tl.float32)
    tl.store(out_ptr1 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp26, xmask)
def get_args():
    arg_0 = rand_strided((128, 264, 7, 7), (12936, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 264, 7, 7), (12936, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    arg_2 = rand_strided((128, 264, 7, 7), (12936, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    arg_3 = rand_strided((128, 264, 7, 7), (12936, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    arg_4 = rand_strided((128, 264, 7, 7), (12936, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    arg_5 = rand_strided((1, 264, 1, 1), (264, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    arg_6 = rand_strided((264,), (1,), device='cuda:0', dtype=torch.float32)
    arg_7 = rand_strided((264,), (1,), device='cuda:0', dtype=torch.float32)
    arg_8 = rand_strided((264,), (1,), device='cuda:0', dtype=torch.float32)
    arg_9 = rand_strided((264,), (1,), device='cuda:0', dtype=torch.float32)
    arg_10 = rand_strided((128, 264, 7, 7), (12936, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7, arg_8, arg_9, arg_10,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__native_batch_norm_legit_functional_add_convolution_backward_native_batch_norm_backward_38.run(*args, 1655808, grid=grid(1655808), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__native_batch_norm_legit_functional_add_convolution_backward_native_batch_norm_backward_38.benchmark_all_configs(*args, 1655808, grid=grid(1655808))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/hv/chvxalrqvfx3gumbgl7gg6d4lhqqg37lbadisxr7scsc3tjuvlp6.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_480
triton_poi_fused__to_copy_39 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[262144], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_39(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 253440
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
    arg_0 = rand_strided((264, 960, 1, 1), (960, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((264, 960, 1, 1), (960, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_39.run(*args, 253440, grid=grid(253440), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_39.benchmark_all_configs(*args, 253440, grid=grid(253440))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
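# NOTE (editor): _39 (like _42, _45 and _49 below) only upcasts a convolution
# weight from fp16 to fp32 for convolution_backward; a hedged eager equivalent:
def _eager_sketch_39(w):  # e.g. the (264, 960, 1, 1) fp16 weight
    return w.to(torch.float32)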
# kernel path: /tmp/torchinductor_shunting/4c/c4cxprfywuviqaunnoosyvctdmc2wg7xuk2qc4eawthvjfi2nh6c.py
# Original ATen: aten.mul, aten.sigmoid, aten.sigmoid_backward, aten.silu, aten.sum
# aten.mul => mul_608
# aten.sigmoid => sigmoid_51
# aten.sigmoid_backward => convert_element_type_481, convert_element_type_482, convert_element_type_483, mul_610, mul_611, sub_114
# aten.silu => convert_element_type_317, convert_element_type_318, mul_378, sigmoid_49
# aten.sum => sum_33
triton_per_fused_mul_sigmoid_sigmoid_backward_silu_sum_40 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import persistent_reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@persistent_reduction(
    size_hints=[131072, 64],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp16', 4: 'i32', 5: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4), equal_to_1=())]}
)
@triton.jit
def triton_per_fused_mul_sigmoid_sigmoid_backward_silu_sum_40(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, xnumel, rnumel, XBLOCK : tl.constexpr):
    xnumel = 122880
    rnumel = 49
    RBLOCK: tl.constexpr = 64
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rindex = tl.arange(0, RBLOCK)[None, :]
    rmask = rindex < rnumel
    r1 = rindex
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (r1 + (49*x0)), rmask, other=0).to(tl.float32)
    tmp1 = tl.load(in_ptr1 + (r1 + (49*x0)), rmask, other=0).to(tl.float32)
    tmp11 = tl.load(in_ptr2 + (x0), None).to(tl.float32)
    tmp2 = tmp1.to(tl.float32)
    tmp3 = tl.sigmoid(tmp2)
    tmp4 = tmp2 * tmp3
    tmp5 = tmp4.to(tl.float32)
    tmp6 = tmp0 * tmp5
    tmp8 = tl.where(rmask, tmp6, 0)
    tmp9 = tl.sum(tmp8, 1)[:, None]
    tmp10 = tmp9.to(tl.float32)
    tmp12 = tl.sigmoid(tmp11)
    tmp13 = tmp12.to(tl.float32)
    tmp14 = 1.0
    tmp15 = tmp14 - tmp13
    tmp16 = tmp13 * tmp15
    tmp17 = tmp10 * tmp16
    tmp18 = tmp17.to(tl.float32)
    tl.store(in_out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp18, None)
def get_args():
    arg_0 = rand_strided((128, 960, 1, 1), (960, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 960, 7, 7), (47040, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    arg_2 = rand_strided((128, 960, 7, 7), (47040, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    arg_3 = rand_strided((128, 960, 1, 1), (960, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1, arg_2, arg_3,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_per_fused_mul_sigmoid_sigmoid_backward_silu_sum_40.run(*args, 122880, 49, grid=grid(122880), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_per_fused_mul_sigmoid_sigmoid_backward_silu_sum_40.benchmark_all_configs(*args, 122880, 49, grid=grid(122880))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
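# NOTE (editor): a hedged eager sketch of the squeeze-and-excitation backward
# step fused above: the spatial dot product of the incoming gradient with
# SiLU(x), times the sigmoid derivative at the SE logit s. Names are
# illustrative; shapes follow get_args() (the precision juggling of the
# generated kernel is approximated by computing in fp32 and casting back).
def _eager_sketch_40(grad_y, x, s):
    silu_x = (x.float() * torch.sigmoid(x.float())).half()      # aten.silu
    dot = (grad_y * silu_x).sum(dim=(2, 3), keepdim=True)       # aten.sum over 7x7
    sig = torch.sigmoid(s.float())
    return (dot.float() * sig * (1.0 - sig)).to(torch.float16)  # sigmoid_backward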
# kernel path: /tmp/torchinductor_shunting/c3/cc3ozjdbl56ribso2ymcx7ozxswqoswegte5cm6ndwts2vdg2nx7.py
# Original ATen: aten._to_copy, aten.convolution_backward
# aten._to_copy => convert_element_type_485
# aten.convolution_backward => sum_34
triton_per_fused__to_copy_convolution_backward_41 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import persistent_reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@persistent_reduction(
    size_hints=[1024, 128],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32', 3: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3), equal_to_1=())]}
)
@triton.jit
def triton_per_fused__to_copy_convolution_backward_41(in_ptr0, out_ptr1, xnumel, rnumel, XBLOCK : tl.constexpr):
    xnumel = 960
    rnumel = 128
    RBLOCK: tl.constexpr = 128
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rindex = tl.arange(0, RBLOCK)[None, :]
    rmask = rindex < rnumel
    r1 = rindex
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0 + (960*r1)), rmask & xmask, other=0).to(tl.float32)
    tmp2 = tl.where(rmask & xmask, tmp0, 0)
    tmp3 = tl.sum(tmp2, 1)[:, None]
    tmp4 = tmp3.to(tl.float32)
    tl.store(out_ptr1 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp4, xmask)
def get_args():
    arg_0 = rand_strided((128, 960, 1, 1), (960, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((960,), (1,), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_per_fused__to_copy_convolution_backward_41.run(*args, 960, 128, grid=grid(960), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_per_fused__to_copy_convolution_backward_41.benchmark_all_configs(*args, 960, 128, grid=grid(960))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
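# NOTE (editor): _41 (and the 80-channel _44 below) is the bias-gradient
# reduction of convolution_backward: the (128, 960, 1, 1) gate gradient is
# summed over the batch and upcast to fp32. A hedged eager equivalent:
def _eager_sketch_41(grad):  # grad: (128, 960, 1, 1) fp16
    return grad.view(128, 960).sum(dim=0).float()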
# kernel path: /tmp/torchinductor_shunting/gr/cgrr3v6svqr3e2yhrucjdveyox77wsfeqkauijcny3rj2xckclad.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_484
triton_poi_fused__to_copy_42 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[131072], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_42(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 76800
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
    arg_0 = rand_strided((960, 80, 1, 1), (80, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((960, 80, 1, 1), (80, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_42.run(*args, 76800, grid=grid(76800), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_42.benchmark_all_configs(*args, 76800, grid=grid(76800))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/ec/cecvpjtjrr7ig7pxbkmaghimjhrgacci4o2zw2nv422yxcdjktnf.py
# Original ATen: aten.add, aten.clone, aten.fill, aten.mul, aten.sigmoid, aten.sub
# aten.add => add_319
# aten.clone => clone_38
# aten.fill => full_like_9
# aten.mul => mul_612, mul_613, mul_614
# aten.sigmoid => sigmoid_73
# aten.sub => sub_115
triton_poi_fused_add_clone_fill_mul_sigmoid_sub_43 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[16384], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_add_clone_fill_mul_sigmoid_sub_43(in_out_ptr0, in_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 10240
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_out_ptr0 + (x0), None).to(tl.float32)
    tmp1 = tl.load(in_ptr0 + (x0), None).to(tl.float32)
    tmp2 = tl.sigmoid(tmp1)
    tmp3 = 1.0
    tmp4 = tmp3 - tmp2
    tmp5 = tmp1 * tmp4
    tmp6 = tmp5 + tmp3
    tmp7 = tmp2 * tmp6
    tmp8 = tmp0 * tmp7
    tl.store(in_out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp8, None)
def get_args():
    arg_0 = rand_strided((128, 80, 1, 1), (80, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 80, 1, 1), (80, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused_add_clone_fill_mul_sigmoid_sub_43.run(*args, 10240, grid=grid(10240), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused_add_clone_fill_mul_sigmoid_sub_43.benchmark_all_configs(*args, 10240, grid=grid(10240))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
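# NOTE (editor): the in-place kernel above multiplies the incoming gradient by
# the SiLU derivative, sigmoid(x) * (1 + x * (1 - sigmoid(x))). A hedged eager
# equivalent, assuming the (128, 80, 1, 1) fp16 tensors from its get_args():
def _eager_sketch_43(grad, x):
    sig = torch.sigmoid(x)
    return grad * (sig * (1.0 + x * (1.0 - sig)))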
# kernel path: /tmp/torchinductor_shunting/tf/ctfzhkol6r3oykhxrse2zzxlztwo7vdgfbyprm7vgxoac6npm7jy.py
# Original ATen: aten._to_copy, aten.convolution_backward
# aten._to_copy => convert_element_type_487
# aten.convolution_backward => sum_35
triton_per_fused__to_copy_convolution_backward_44 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import persistent_reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@persistent_reduction(
    size_hints=[128, 128],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32', 3: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3), equal_to_1=())]}
)
@triton.jit
def triton_per_fused__to_copy_convolution_backward_44(in_ptr0, out_ptr1, xnumel, rnumel, XBLOCK : tl.constexpr):
    xnumel = 80
    rnumel = 128
    RBLOCK: tl.constexpr = 128
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rindex = tl.arange(0, RBLOCK)[None, :]
    rmask = rindex < rnumel
    r1 = rindex
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0 + (80*r1)), rmask & xmask, other=0).to(tl.float32)
    tmp2 = tl.where(rmask & xmask, tmp0, 0)
    tmp3 = tl.sum(tmp2, 1)[:, None]
    tmp4 = tmp3.to(tl.float32)
    tl.store(out_ptr1 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp4, xmask)
def get_args():
    arg_0 = rand_strided((128, 80, 1, 1), (80, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((80,), (1,), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_per_fused__to_copy_convolution_backward_44.run(*args, 80, 128, grid=grid(80), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_per_fused__to_copy_convolution_backward_44.benchmark_all_configs(*args, 80, 128, grid=grid(80))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/pa/cpatc2nbbyd25gudjvjrjc54kh5oivkwx63acex4sswslv6ylzvi.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_486
triton_poi_fused__to_copy_45 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[131072], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_45(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 76800
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
    arg_0 = rand_strided((80, 960, 1, 1), (960, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((80, 960, 1, 1), (960, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_45.run(*args, 76800, grid=grid(76800), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_45.benchmark_all_configs(*args, 76800, grid=grid(76800))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/t2/ct2iy46nsnqymwwljjboc6oz6xdfqwrjdsjacocdhzt74zuufoj7.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.add, aten.clone, aten.div, aten.fill, aten.mul, aten.native_batch_norm_backward, aten.sigmoid, aten.sub
# aten._native_batch_norm_legit_functional => convert_element_type_315
# aten.add => add_320, add_321
# aten.clone => clone_37
# aten.div => div_4
# aten.fill => full_like_10
# aten.mul => mul_609, mul_615, mul_616, mul_617
# aten.native_batch_norm_backward => convert_element_type_488, mul_618, mul_626, sub_117, sum_36, sum_37
# aten.sigmoid => sigmoid_51, sigmoid_74
# aten.sub => sub_116
triton_red_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_46 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@reduction(
    size_hints=[1024, 8192],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp16', 4: '*fp16', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: '*fp32', 9: '*fp32', 10: 'i32', 11: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_46(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, in_ptr6, out_ptr0, out_ptr1, out_ptr2, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
    xnumel = 960
    rnumel = 6272
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rbase = tl.arange(0, RBLOCK)[None, :]
    x0 = xindex
    _tmp17 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
    tmp20 = tl.load(in_ptr5 + (x0), xmask)
    _tmp23 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
    for roffset in range(0, rnumel, RBLOCK):
        rindex = roffset + rbase
        rmask = rindex < rnumel
        r1 = rindex % 49
        r2 = (rindex // 49)
        tmp0 = tl.load(in_ptr0 + (r1 + (49*x0) + (47040*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp1 = tl.load(in_ptr1 + (x0 + (960*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp4 = tl.load(in_ptr2 + (x0 + (960*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp8 = tl.load(in_ptr3 + (r1 + (49*x0) + (47040*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp18 = tl.load(in_ptr4 + (r1 + (49*x0) + (47040*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp2 = tl.sigmoid(tmp1)
        tmp3 = tmp0 * tmp2
        tmp5 = 49.0
        tmp6 = tmp4 / tmp5
        tmp7 = tmp3 + tmp6
        tmp9 = tl.sigmoid(tmp8)
        tmp10 = 1.0
        tmp11 = tmp10 - tmp9
        tmp12 = tmp8 * tmp11
        tmp13 = tmp12 + tmp10
        tmp14 = tmp9 * tmp13
        tmp15 = tmp7 * tmp14
        tmp16 = tmp15.to(tl.float32)
        _tmp17 = tl.where(rmask & xmask, _tmp17 + tmp16, _tmp17)
        tmp19 = tmp18.to(tl.float32)
        tmp21 = tmp19 - tmp20
        tmp22 = tmp16 * tmp21
        _tmp23 = tl.where(rmask & xmask, _tmp23 + tmp22, _tmp23)
    tmp17 = tl.sum(_tmp17, 1)[:, None]
    tl.store(out_ptr0 + x0, tmp17, xmask)
    tmp23 = tl.sum(_tmp23, 1)[:, None]
    tl.store(out_ptr1 + x0, tmp23, xmask)
    tmp24 = tl.load(in_ptr6 + (x0), xmask)
    tmp25 = tmp23 * tmp24
    tl.store(out_ptr2 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp25, xmask)
def get_args():
    arg_0 = rand_strided((128, 960, 7, 7), (47040, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 960, 1, 1), (960, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    arg_2 = rand_strided((128, 960, 1, 1), (960, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    arg_3 = rand_strided((128, 960, 7, 7), (47040, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    arg_4 = rand_strided((128, 960, 7, 7), (47040, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    arg_5 = rand_strided((1, 960, 1, 1), (960, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    arg_6 = rand_strided((960,), (1,), device='cuda:0', dtype=torch.float32)
    arg_7 = rand_strided((960,), (1,), device='cuda:0', dtype=torch.float32)
    arg_8 = rand_strided((960,), (1,), device='cuda:0', dtype=torch.float32)
    arg_9 = rand_strided((960,), (1,), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7, arg_8, arg_9,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_red_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_46.run(*args, 960, 6272, grid=grid(960), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_red_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_46.benchmark_all_configs(*args, 960, 6272, grid=grid(960))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
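# NOTE (editor): a hedged eager sketch of the fusion above: the gradient is
# rebuilt through the SE multiply plus the global-average-pool skip (the
# division by 49 = 7*7), pushed through the SiLU derivative, then reduced with
# the same two batch-norm backward sums as in _33. Names are illustrative.
def _eager_sketch_46(grad_y, se_logit, se_grad, x, act, mean, invstd):
    g = grad_y * torch.sigmoid(se_logit) + se_grad / 49.0       # mul_609 + div_4
    sig = torch.sigmoid(x)
    dy = (g * (sig * (1.0 + x * (1.0 - sig)))).float()          # SiLU backward
    sum_dy = dy.sum(dim=(0, 2, 3))                              # out_ptr0
    sum_dy_xmu = (dy * (act.float() - mean.view(1, -1, 1, 1))).sum(dim=(0, 2, 3))
    return sum_dy, sum_dy_xmu, sum_dy_xmu * invstd              # out_ptr1, out_ptr2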
# kernel path: /tmp/torchinductor_shunting/hn/chn3tshreitokdet2wgwgbsbuqqplci2k63sdatoy3lq6bjeylvs.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.add, aten.clone, aten.div, aten.fill, aten.mul, aten.native_batch_norm_backward, aten.sigmoid, aten.sub
# aten._native_batch_norm_legit_functional => convert_element_type_315
# aten.add => add_320, add_321
# aten.clone => clone_37
# aten.div => div_4
# aten.fill => full_like_10
# aten.mul => mul_609, mul_615, mul_616, mul_617
# aten.native_batch_norm_backward => convert_element_type_488, mul_624, sub_117, sub_119, sub_120
# aten.sigmoid => sigmoid_51, sigmoid_74
# aten.sub => sub_116
triton_poi_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_47 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[8388608], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp16', 4: '*fp16', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: '*fp32', 9: '*fp32', 10: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10), equal_to_1=())]})
@triton.jit
def triton_poi_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_47(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, in_ptr6, in_ptr7, in_ptr8, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 6021120
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x3 = xindex
    x4 = (xindex // 49)
    x1 = (xindex // 49) % 960
    tmp0 = tl.load(in_ptr0 + (x3), None).to(tl.float32)
    tmp1 = tl.load(in_ptr1 + (x4), None).to(tl.float32)
    tmp4 = tl.load(in_ptr2 + (x4), None).to(tl.float32)
    tmp8 = tl.load(in_ptr3 + (x3), None).to(tl.float32)
    tmp17 = tl.load(in_ptr4 + (x3), None).to(tl.float32)
    tmp19 = tl.load(in_ptr5 + (x1), None)
    tmp21 = tl.load(in_ptr6 + (x1), None)
    tmp24 = tl.load(in_ptr7 + (x1), None)
    tmp29 = tl.load(in_ptr8 + (x1), None)
    tmp2 = tl.sigmoid(tmp1)
    tmp3 = tmp0 * tmp2
    tmp5 = 49.0
    tmp6 = tmp4 / tmp5
    tmp7 = tmp3 + tmp6
    tmp9 = tl.sigmoid(tmp8)
    tmp10 = 1.0
    tmp11 = tmp10 - tmp9
    tmp12 = tmp8 * tmp11
    tmp13 = tmp12 + tmp10
    tmp14 = tmp9 * tmp13
    tmp15 = tmp7 * tmp14
    tmp16 = tmp15.to(tl.float32)
    tmp18 = tmp17.to(tl.float32)
    tmp20 = tmp18 - tmp19
    tmp22 = 0.00015943877551020407
    tmp23 = tmp21 * tmp22
    tmp25 = tmp24 * tmp24
    tmp26 = tmp23 * tmp25
    tmp27 = tmp20 * tmp26
    tmp28 = tmp16 - tmp27
    tmp30 = tmp29 * tmp22
    tmp31 = tmp28 - tmp30
    tl.store(out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp31, None)
def get_args():
    arg_0 = rand_strided((128, 960, 7, 7), (47040, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 960, 1, 1), (960, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    arg_2 = rand_strided((128, 960, 1, 1), (960, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    arg_3 = rand_strided((128, 960, 7, 7), (47040, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    arg_4 = rand_strided((128, 960, 7, 7), (47040, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    arg_5 = rand_strided((1, 960, 1, 1), (960, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    arg_6 = rand_strided((960,), (1,), device='cuda:0', dtype=torch.float32)
    arg_7 = rand_strided((960,), (1,), device='cuda:0', dtype=torch.float32)
    arg_8 = rand_strided((960,), (1,), device='cuda:0', dtype=torch.float32)
    arg_9 = rand_strided((128, 960, 7, 7), (47040, 49, 7, 1), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7, arg_8, arg_9,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_47.run(*args, 6021120, grid=grid(6021120), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_47.benchmark_all_configs(*args, 6021120, grid=grid(6021120))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
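# NOTE (editor): _47 is the pointwise counterpart of _46: it rebuilds the same
# dy and applies the batch-norm backward projection as in _34, but stores the
# fp32 result before the final (invstd * weight) scale, which kernels _48/_50
# below fold in per channel group.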
# kernel path: /tmp/torchinductor_shunting/cm/ccm5w6opbpyzojqp4qntjyz3d5dzh7ybk3ty3hgcp6xmwewzvdux.py
# Original ATen: aten.convolution_backward
# aten.convolution_backward => convolution_backward_31
triton_poi_fused_convolution_backward_48 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[2097152], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp32', 2: '*fp32', 3: '*fp16', 4: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4), equal_to_1=())]})
@triton.jit
def triton_poi_fused_convolution_backward_48(in_ptr0, in_ptr1, in_ptr2, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 1505280
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x2 = (xindex // 11760)
    x3 = xindex % 11760
    x1 = (xindex // 49) % 240
    x4 = xindex
    tmp0 = tl.load(in_ptr0 + (35280 + x3 + (47040*x2)), None)
    tmp1 = tl.load(in_ptr1 + (720 + x1), None)
    tmp2 = tl.load(in_ptr2 + (720 + x1), None)
    tmp3 = tmp1 * tmp2
    tmp4 = tmp0 * tmp3
    tmp5 = tmp4.to(tl.float32)
    tl.store(out_ptr0 + (x4 + tl.zeros([XBLOCK], tl.int32)), tmp5, None)
def get_args():
    arg_0 = rand_strided((128, 960, 7, 7), (47040, 49, 7, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((960,), (1,), device='cuda:0', dtype=torch.float32)
    arg_2 = rand_strided((960,), (1,), device='cuda:0', dtype=torch.float32)
    arg_3 = rand_strided((128, 240, 7, 7), (11760, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1, arg_2, arg_3,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused_convolution_backward_48.run(*args, 1505280, grid=grid(1505280), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused_convolution_backward_48.benchmark_all_configs(*args, 1505280, grid=grid(1505280))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
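# NOTE (editor): _48 (and _50 below, at offset 480) fuses that deferred
# (invstd * weight) scale with a channel slice for one group of the grouped
# depthwise convolution backward: the load offset 35280 = 720 * 49 selects
# channels 720:960. A hedged eager equivalent:
def _eager_sketch_48(grad, invstd, weight):  # grad: (128, 960, 7, 7) fp32
    scale = (invstd * weight)[720:960].view(1, 240, 1, 1)
    return (grad[:, 720:960] * scale).to(torch.float16).contiguous()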
# kernel path: /tmp/torchinductor_shunting/bg/cbgklbbew275guj34lbxmgcsod4kcchr5225smzpdyhnn7zswlfw.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_491
triton_poi_fused__to_copy_49 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[32768], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_49(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 19440
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
    arg_0 = rand_strided((240, 1, 9, 9), (81, 81, 9, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((240, 1, 9, 9), (81, 81, 9, 1), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_49.run(*args, 19440, grid=grid(19440), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_49.benchmark_all_configs(*args, 19440, grid=grid(19440))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/aa/caatae3nbvudrfnsq366oeojppmx457zjpvs4yumsehtdmsg23py.py | |
# Original ATen: aten.convolution_backward | |
# aten.convolution_backward => convolution_backward_32 | |
triton_poi_fused_convolution_backward_50 = async_compile.triton(''' | |
import triton | |
import triton.language as tl | |
from torch._inductor.ir import ReductionHint | |
from torch._inductor.ir import TileHint | |
from torch._inductor.triton_heuristics import pointwise | |
from torch._inductor.utils import instance_descriptor | |
from torch._dynamo.testing import rand_strided | |
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
import torch | |
from torch._inductor.triton_heuristics import grid | |
@pointwise(size_hints=[2097152], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp32', 2: '*fp32', 3: '*fp16', 4: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4), equal_to_1=())]}) | |
@triton.jit | |
def triton_poi_fused_convolution_backward_50(in_ptr0, in_ptr1, in_ptr2, out_ptr0, xnumel, XBLOCK : tl.constexpr): | |
xnumel = 1505280 | |
xoffset = tl.program_id(0) * XBLOCK | |
xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
xmask = xindex < xnumel | |
x2 = (xindex // 11760) | |
x3 = xindex % 11760 | |
x1 = (xindex // 49) % 240 | |
x4 = xindex | |
tmp0 = tl.load(in_ptr0 + (23520 + x3 + (47040*x2)), None) | |
tmp1 = tl.load(in_ptr1 + (480 + x1), None) | |
tmp2 = tl.load(in_ptr2 + (480 + x1), None) | |
tmp3 = tmp1 * tmp2 | |
tmp4 = tmp0 * tmp3 | |
tmp5 = tmp4.to(tl.float32) | |
tl.store(out_ptr0 + (x4 + tl.zeros([XBLOCK], tl.int32)), tmp5, None) | |
def get_args(): | |
arg_0 = rand_strided((128, 960, 7, 7), (47040, 49, 7, 1), device='cuda:0', dtype=torch.float32) | |
arg_1 = rand_strided((960,), (1,), device='cuda:0', dtype=torch.float32) | |
arg_2 = rand_strided((960,), (1,), device='cuda:0', dtype=torch.float32) | |
arg_3 = rand_strided((128, 240, 7, 7), (11760, 49, 7, 1), device='cuda:0', dtype=torch.float16) | |
return arg_0, arg_1, arg_2, arg_3, | |
def call(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
stream0 = get_cuda_stream(0) | |
triton_poi_fused_convolution_backward_50.run(*args, 1505280, grid=grid(1505280), stream=stream0) | |
def benchmark_all_configs(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
return triton_poi_fused_convolution_backward_50.benchmark_all_configs(*args, 1505280, grid=grid(1505280)) | |
if __name__ == '__main__': | |
from torch._inductor.utils import get_num_bytes | |
from triton.testing import do_bench | |
args = get_args() | |
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9 | |
gb_per_s = num_gb / (ms / 1e3) | |
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
''') | |
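# NOTE (annotation): a minimal eager-mode sketch of the kernel-48/50/52/54
# pattern; ref_grad_group is a hypothetical helper, not part of the generated
# module. Kernel 50 corresponds to group=2: its load offset 23520 = 480 * 49
# selects channels 480:720 of the 960-channel gradient.
def ref_grad_group(grad_fp32, scale_a, scale_b, group):
    # grad_fp32: (128, 960, 7, 7) fp32; scale_a, scale_b: (960,) fp32
    lo, hi = 240 * group, 240 * (group + 1)
    s = (scale_a[lo:hi] * scale_b[lo:hi]).view(1, -1, 1, 1)  # per-channel scale
    return (grad_fp32[:, lo:hi] * s).to(torch.float16)       # cast like tmp5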
# kernel path: /tmp/torchinductor_shunting/iy/ciyzljcwrx3unj6zlla5nxhjqwjz6zppzmuz7wma6tcow2pjfjva.py | |
# Original ATen: aten._to_copy | |
# aten._to_copy => convert_element_type_492 | |
triton_poi_fused__to_copy_51 = async_compile.triton(''' | |
import triton | |
import triton.language as tl | |
from torch._inductor.ir import ReductionHint | |
from torch._inductor.ir import TileHint | |
from torch._inductor.triton_heuristics import pointwise | |
from torch._inductor.utils import instance_descriptor | |
from torch._dynamo.testing import rand_strided | |
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
import torch | |
from torch._inductor.triton_heuristics import grid | |
@pointwise(size_hints=[16384], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]}) | |
@triton.jit | |
def triton_poi_fused__to_copy_51(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr): | |
xnumel = 11760 | |
xoffset = tl.program_id(0) * XBLOCK | |
xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
xmask = xindex < xnumel | |
x0 = xindex | |
tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32) | |
tmp1 = tmp0.to(tl.float32) | |
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask) | |
def get_args(): | |
arg_0 = rand_strided((240, 1, 7, 7), (49, 49, 7, 1), device='cuda:0', dtype=torch.float16) | |
arg_1 = rand_strided((240, 1, 7, 7), (49, 49, 7, 1), device='cuda:0', dtype=torch.float32) | |
return arg_0, arg_1, | |
def call(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
stream0 = get_cuda_stream(0) | |
triton_poi_fused__to_copy_51.run(*args, 11760, grid=grid(11760), stream=stream0) | |
def benchmark_all_configs(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
return triton_poi_fused__to_copy_51.benchmark_all_configs(*args, 11760, grid=grid(11760)) | |
if __name__ == '__main__': | |
from torch._inductor.utils import get_num_bytes | |
from triton.testing import do_bench | |
args = get_args() | |
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9 | |
gb_per_s = num_gb / (ms / 1e3) | |
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
''') | |
# kernel path: /tmp/torchinductor_shunting/6i/c6izu5jaho2aqwpjlvtonq5ar6inxincvvpnnrzzj3npyab2r3ga.py | |
# Original ATen: aten.convolution_backward | |
# aten.convolution_backward => convolution_backward_33 | |
triton_poi_fused_convolution_backward_52 = async_compile.triton(''' | |
import triton | |
import triton.language as tl | |
from torch._inductor.ir import ReductionHint | |
from torch._inductor.ir import TileHint | |
from torch._inductor.triton_heuristics import pointwise | |
from torch._inductor.utils import instance_descriptor | |
from torch._dynamo.testing import rand_strided | |
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
import torch | |
from torch._inductor.triton_heuristics import grid | |
@pointwise(size_hints=[2097152], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp32', 2: '*fp32', 3: '*fp16', 4: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4), equal_to_1=())]}) | |
@triton.jit | |
def triton_poi_fused_convolution_backward_52(in_ptr0, in_ptr1, in_ptr2, out_ptr0, xnumel, XBLOCK : tl.constexpr): | |
xnumel = 1505280 | |
xoffset = tl.program_id(0) * XBLOCK | |
xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
xmask = xindex < xnumel | |
x2 = (xindex // 11760) | |
x3 = xindex % 11760 | |
x1 = (xindex // 49) % 240 | |
x4 = xindex | |
tmp0 = tl.load(in_ptr0 + (11760 + x3 + (47040*x2)), None) | |
tmp1 = tl.load(in_ptr1 + (240 + x1), None) | |
tmp2 = tl.load(in_ptr2 + (240 + x1), None) | |
tmp3 = tmp1 * tmp2 | |
tmp4 = tmp0 * tmp3 | |
tmp5 = tmp4.to(tl.float32) | |
tl.store(out_ptr0 + (x4 + tl.zeros([XBLOCK], tl.int32)), tmp5, None) | |
def get_args(): | |
arg_0 = rand_strided((128, 960, 7, 7), (47040, 49, 7, 1), device='cuda:0', dtype=torch.float32) | |
arg_1 = rand_strided((960,), (1,), device='cuda:0', dtype=torch.float32) | |
arg_2 = rand_strided((960,), (1,), device='cuda:0', dtype=torch.float32) | |
arg_3 = rand_strided((128, 240, 7, 7), (11760, 49, 7, 1), device='cuda:0', dtype=torch.float16) | |
return arg_0, arg_1, arg_2, arg_3, | |
def call(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
stream0 = get_cuda_stream(0) | |
triton_poi_fused_convolution_backward_52.run(*args, 1505280, grid=grid(1505280), stream=stream0) | |
def benchmark_all_configs(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
return triton_poi_fused_convolution_backward_52.benchmark_all_configs(*args, 1505280, grid=grid(1505280)) | |
if __name__ == '__main__': | |
from torch._inductor.utils import get_num_bytes | |
from triton.testing import do_bench | |
args = get_args() | |
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9 | |
gb_per_s = num_gb / (ms / 1e3) | |
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
''') | |
# kernel path: /tmp/torchinductor_shunting/cq/ccq24kfa5wdpfsiaanwmmkcev4g2dsxwzynkl2fv553qth6ixrfz.py | |
# Original ATen: aten._to_copy | |
# aten._to_copy => convert_element_type_493 | |
triton_poi_fused__to_copy_53 = async_compile.triton(''' | |
import triton | |
import triton.language as tl | |
from torch._inductor.ir import ReductionHint | |
from torch._inductor.ir import TileHint | |
from torch._inductor.triton_heuristics import pointwise | |
from torch._inductor.utils import instance_descriptor | |
from torch._dynamo.testing import rand_strided | |
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
import torch | |
from torch._inductor.triton_heuristics import grid | |
@pointwise(size_hints=[8192], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]}) | |
@triton.jit | |
def triton_poi_fused__to_copy_53(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr): | |
xnumel = 6000 | |
xoffset = tl.program_id(0) * XBLOCK | |
xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
xmask = xindex < xnumel | |
x0 = xindex | |
tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32) | |
tmp1 = tmp0.to(tl.float32) | |
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask) | |
def get_args(): | |
arg_0 = rand_strided((240, 1, 5, 5), (25, 25, 5, 1), device='cuda:0', dtype=torch.float16) | |
arg_1 = rand_strided((240, 1, 5, 5), (25, 25, 5, 1), device='cuda:0', dtype=torch.float32) | |
return arg_0, arg_1, | |
def call(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
stream0 = get_cuda_stream(0) | |
triton_poi_fused__to_copy_53.run(*args, 6000, grid=grid(6000), stream=stream0) | |
def benchmark_all_configs(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
return triton_poi_fused__to_copy_53.benchmark_all_configs(*args, 6000, grid=grid(6000)) | |
if __name__ == '__main__': | |
from torch._inductor.utils import get_num_bytes | |
from triton.testing import do_bench | |
args = get_args() | |
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9 | |
gb_per_s = num_gb / (ms / 1e3) | |
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
''') | |
# kernel path: /tmp/torchinductor_shunting/iu/ciu2jjei5ptmwjfdzudopltuvgs5jt6yeve7jcdvodi4ozwv2j6u.py | |
# Original ATen: aten.convolution_backward | |
# aten.convolution_backward => convolution_backward_34 | |
triton_poi_fused_convolution_backward_54 = async_compile.triton(''' | |
import triton | |
import triton.language as tl | |
from torch._inductor.ir import ReductionHint | |
from torch._inductor.ir import TileHint | |
from torch._inductor.triton_heuristics import pointwise | |
from torch._inductor.utils import instance_descriptor | |
from torch._dynamo.testing import rand_strided | |
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
import torch | |
from torch._inductor.triton_heuristics import grid | |
@pointwise(size_hints=[2097152], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp32', 2: '*fp32', 3: '*fp16', 4: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4), equal_to_1=())]}) | |
@triton.jit | |
def triton_poi_fused_convolution_backward_54(in_ptr0, in_ptr1, in_ptr2, out_ptr0, xnumel, XBLOCK : tl.constexpr): | |
xnumel = 1505280 | |
xoffset = tl.program_id(0) * XBLOCK | |
xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
xmask = xindex < xnumel | |
x2 = (xindex // 11760) | |
x3 = xindex % 11760 | |
x1 = (xindex // 49) % 240 | |
x4 = xindex | |
tmp0 = tl.load(in_ptr0 + (x3 + (47040*x2)), None) | |
tmp1 = tl.load(in_ptr1 + (x1), None) | |
tmp2 = tl.load(in_ptr2 + (x1), None) | |
tmp3 = tmp1 * tmp2 | |
tmp4 = tmp0 * tmp3 | |
tmp5 = tmp4.to(tl.float32) | |
tl.store(out_ptr0 + (x4 + tl.zeros([XBLOCK], tl.int32)), tmp5, None) | |
def get_args(): | |
arg_0 = rand_strided((128, 960, 7, 7), (47040, 49, 7, 1), device='cuda:0', dtype=torch.float32) | |
arg_1 = rand_strided((960,), (1,), device='cuda:0', dtype=torch.float32) | |
arg_2 = rand_strided((960,), (1,), device='cuda:0', dtype=torch.float32) | |
arg_3 = rand_strided((128, 240, 7, 7), (11760, 49, 7, 1), device='cuda:0', dtype=torch.float16) | |
return arg_0, arg_1, arg_2, arg_3, | |
def call(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
stream0 = get_cuda_stream(0) | |
triton_poi_fused_convolution_backward_54.run(*args, 1505280, grid=grid(1505280), stream=stream0) | |
def benchmark_all_configs(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
return triton_poi_fused_convolution_backward_54.benchmark_all_configs(*args, 1505280, grid=grid(1505280)) | |
if __name__ == '__main__': | |
from torch._inductor.utils import get_num_bytes | |
from triton.testing import do_bench | |
args = get_args() | |
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9 | |
gb_per_s = num_gb / (ms / 1e3) | |
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
''') | |
# kernel path: /tmp/torchinductor_shunting/vw/cvw5bsibqyft5fe6qyposdjgeixpccxa5ewo6nsyrtdsomfikzwt.py | |
# Original ATen: aten._to_copy | |
# aten._to_copy => convert_element_type_494 | |
triton_poi_fused__to_copy_55 = async_compile.triton(''' | |
import triton | |
import triton.language as tl | |
from torch._inductor.ir import ReductionHint | |
from torch._inductor.ir import TileHint | |
from torch._inductor.triton_heuristics import pointwise | |
from torch._inductor.utils import instance_descriptor | |
from torch._dynamo.testing import rand_strided | |
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
import torch | |
from torch._inductor.triton_heuristics import grid | |
@pointwise(size_hints=[4096], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]}) | |
@triton.jit | |
def triton_poi_fused__to_copy_55(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr): | |
xnumel = 2160 | |
xoffset = tl.program_id(0) * XBLOCK | |
xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
xmask = xindex < xnumel | |
x0 = xindex | |
tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32) | |
tmp1 = tmp0.to(tl.float32) | |
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask) | |
def get_args(): | |
arg_0 = rand_strided((240, 1, 3, 3), (9, 9, 3, 1), device='cuda:0', dtype=torch.float16) | |
arg_1 = rand_strided((240, 1, 3, 3), (9, 9, 3, 1), device='cuda:0', dtype=torch.float32) | |
return arg_0, arg_1, | |
def call(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
stream0 = get_cuda_stream(0) | |
triton_poi_fused__to_copy_55.run(*args, 2160, grid=grid(2160), stream=stream0) | |
def benchmark_all_configs(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
return triton_poi_fused__to_copy_55.benchmark_all_configs(*args, 2160, grid=grid(2160)) | |
if __name__ == '__main__': | |
from torch._inductor.utils import get_num_bytes | |
from triton.testing import do_bench | |
args = get_args() | |
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9 | |
gb_per_s = num_gb / (ms / 1e3) | |
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
''') | |
# kernel path: /tmp/torchinductor_shunting/zd/czdohgrrggyfbylj276pqwjy7zlv5b2cu55bzfn6nk26nmdmxmon.py | |
# Original ATen: aten.cat | |
# aten.cat => cat_47 | |
triton_poi_fused_cat_56 = async_compile.triton(''' | |
import triton | |
import triton.language as tl | |
from torch._inductor.ir import ReductionHint | |
from torch._inductor.ir import TileHint | |
from torch._inductor.triton_heuristics import pointwise | |
from torch._inductor.utils import instance_descriptor | |
from torch._dynamo.testing import rand_strided | |
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
import torch | |
from torch._inductor.triton_heuristics import grid | |
@pointwise(size_hints=[8388608], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]}) | |
@triton.jit | |
def triton_poi_fused_cat_56(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr): | |
xnumel = 6021120 | |
xoffset = tl.program_id(0) * XBLOCK | |
xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
xmask = xindex < xnumel | |
x2 = xindex | |
x0 = xindex % 47040 | |
x1 = (xindex // 47040) | |
tmp0 = tl.load(in_ptr0 + (x2), None).to(tl.float32) | |
tl.store(out_ptr0 + (x0 + (188160*x1) + tl.zeros([XBLOCK], tl.int32)), tmp0, None) | |
def get_args(): | |
arg_0 = rand_strided((128, 240, 14, 14), (47040, 196, 14, 1), device='cuda:0', dtype=torch.float16) | |
arg_1 = rand_strided((128, 240, 14, 14), (188160, 196, 14, 1), device='cuda:0', dtype=torch.float16) | |
return arg_0, arg_1, | |
def call(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
stream0 = get_cuda_stream(0) | |
triton_poi_fused_cat_56.run(*args, 6021120, grid=grid(6021120), stream=stream0) | |
def benchmark_all_configs(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
return triton_poi_fused_cat_56.benchmark_all_configs(*args, 6021120, grid=grid(6021120)) | |
if __name__ == '__main__': | |
from torch._inductor.utils import get_num_bytes | |
from triton.testing import do_bench | |
args = get_args() | |
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9 | |
gb_per_s = num_gb / (ms / 1e3) | |
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
''') | |
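# NOTE (annotation): triton_poi_fused_cat_56 is one slice of a channel concat.
# It copies a (128, 240, 14, 14) fp16 tensor into channels 0:240 of a
# (128, 960, 14, 14) buffer (per-image output stride 188160 = 960 * 196);
# sibling kernels fill the remaining 240-channel slices. Eager equivalent, with
# `out` the preallocated cat buffer: out[:, 0:240] = src.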
# kernel path: /tmp/torchinductor_shunting/6g/c6guclv6h7xk7kazbyjy6st57pkopg5lzsmit37ifpr7d46xtfho.py | |
# Original ATen: aten._native_batch_norm_legit_functional, aten.mul, aten.native_batch_norm_backward | |
# aten._native_batch_norm_legit_functional => convert_element_type_307 | |
# aten.mul => mul_629 | |
# aten.native_batch_norm_backward => convert_element_type_495, mul_630, mul_638, sub_122, sum_38, sum_39 | |
triton_red_fused__native_batch_norm_legit_functional_mul_native_batch_norm_backward_57 = async_compile.triton(''' | |
import triton | |
import triton.language as tl | |
from torch._inductor.ir import ReductionHint | |
from torch._inductor.ir import TileHint | |
from torch._inductor.triton_heuristics import reduction | |
from torch._inductor.utils import instance_descriptor | |
from torch._dynamo.testing import rand_strided | |
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
import torch | |
from torch._inductor.triton_heuristics import grid | |
@reduction( | |
size_hints=[1024, 32768], | |
reduction_hint=ReductionHint.INNER, | |
filename=__file__, | |
meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp32', 4: '*fp32', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: 'i32', 9: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9), equal_to_1=())]} | |
) | |
@triton.jit | |
def triton_red_fused__native_batch_norm_legit_functional_mul_native_batch_norm_backward_57(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, out_ptr0, out_ptr1, out_ptr2, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr): | |
xnumel = 960 | |
rnumel = 25088 | |
xoffset = tl.program_id(0) * XBLOCK | |
xindex = xoffset + tl.arange(0, XBLOCK)[:, None] | |
xmask = xindex < xnumel | |
rbase = tl.arange(0, RBLOCK)[None, :] | |
x0 = xindex | |
_tmp4 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0 | |
tmp7 = tl.load(in_ptr3 + (x0), xmask) | |
_tmp10 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0 | |
for roffset in range(0, rnumel, RBLOCK): | |
rindex = roffset + rbase | |
rmask = rindex < rnumel | |
r1 = rindex % 196 | |
r2 = (rindex // 196) | |
tmp0 = tl.load(in_ptr0 + (r1 + (196*x0) + (188160*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32) | |
tmp1 = tl.load(in_ptr1 + (r1 + (196*x0) + (188160*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32) | |
tmp5 = tl.load(in_ptr2 + (r1 + (196*x0) + (188160*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32) | |
tmp2 = tmp0 * tmp1 | |
tmp3 = tmp2.to(tl.float32) | |
_tmp4 = tl.where(rmask & xmask, _tmp4 + tmp3, _tmp4) | |
tmp6 = tmp5.to(tl.float32) | |
tmp8 = tmp6 - tmp7 | |
tmp9 = tmp3 * tmp8 | |
_tmp10 = tl.where(rmask & xmask, _tmp10 + tmp9, _tmp10) | |
tmp4 = tl.sum(_tmp4, 1)[:, None] | |
tl.store(out_ptr0 + x0, tmp4, xmask) | |
tmp10 = tl.sum(_tmp10, 1)[:, None] | |
tl.store(out_ptr1 + x0, tmp10, xmask) | |
tmp11 = tl.load(in_ptr4 + (x0), xmask) | |
tmp12 = tmp10 * tmp11 | |
tl.store(out_ptr2 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp12, xmask) | |
def get_args(): | |
arg_0 = rand_strided((128, 960, 14, 14), (188160, 196, 14, 1), device='cuda:0', dtype=torch.float16) | |
arg_1 = rand_strided((128, 960, 14, 14), (188160, 196, 14, 1), device='cuda:0', dtype=torch.float16) | |
arg_2 = rand_strided((128, 960, 14, 14), (188160, 196, 14, 1), device='cuda:0', dtype=torch.float16) | |
arg_3 = rand_strided((1, 960, 1, 1), (960, 1, 1, 1), device='cuda:0', dtype=torch.float32) | |
arg_4 = rand_strided((960,), (1,), device='cuda:0', dtype=torch.float32) | |
arg_5 = rand_strided((960,), (1,), device='cuda:0', dtype=torch.float32) | |
arg_6 = rand_strided((960,), (1,), device='cuda:0', dtype=torch.float32) | |
arg_7 = rand_strided((960,), (1,), device='cuda:0', dtype=torch.float32) | |
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7, | |
def call(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
stream0 = get_cuda_stream(0) | |
triton_red_fused__native_batch_norm_legit_functional_mul_native_batch_norm_backward_57.run(*args, 960, 25088, grid=grid(960), stream=stream0) | |
def benchmark_all_configs(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
return triton_red_fused__native_batch_norm_legit_functional_mul_native_batch_norm_backward_57.benchmark_all_configs(*args, 960, 25088, grid=grid(960)) | |
if __name__ == '__main__': | |
from torch._inductor.utils import get_num_bytes | |
from triton.testing import do_bench | |
args = get_args() | |
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9 | |
gb_per_s = num_gb / (ms / 1e3) | |
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
''') | |
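# NOTE (annotation): this reduction emits, for each of the 960 channels, the two
# batch-norm backward statistics over M = 128 * 196 = 25088 elements:
#   out_ptr0[c] = sum(dy)                  with dy = in_ptr0 * in_ptr1 (the fused aten.mul)
#   out_ptr1[c] = sum(dy * (x - mean[c]))
#   out_ptr2[c] = out_ptr1[c] * in_ptr4[c] (in_ptr4 is presumably inv_std, which
#                                           would make out_ptr2 the weight grad)
# Kernel 58 below consumes these sums to form the input gradient.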
# kernel path: /tmp/torchinductor_shunting/kq/ckqzmpbl3qlmroyipqv5z46lwldyab7axrjybp23plftx2joawdv.py | |
# Original ATen: aten._native_batch_norm_legit_functional, aten.convolution_backward, aten.mul, aten.native_batch_norm_backward | |
# aten._native_batch_norm_legit_functional => convert_element_type_307 | |
# aten.convolution_backward => convolution_backward_35 | |
# aten.mul => mul_629 | |
# aten.native_batch_norm_backward => convert_element_type_495, convert_element_type_497, mul_636, mul_637, sub_122, sub_124, sub_125 | |
triton_poi_fused__native_batch_norm_legit_functional_convolution_backward_mul_native_batch_norm_backward_58 = async_compile.triton(''' | |
import triton | |
import triton.language as tl | |
from torch._inductor.ir import ReductionHint | |
from torch._inductor.ir import TileHint | |
from torch._inductor.triton_heuristics import pointwise | |
from torch._inductor.utils import instance_descriptor | |
from torch._dynamo.testing import rand_strided | |
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
import torch | |
from torch._inductor.triton_heuristics import grid | |
@pointwise(size_hints=[33554432], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp32', 4: '*fp32', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8), equal_to_1=())]}) | |
@triton.jit | |
def triton_poi_fused__native_batch_norm_legit_functional_convolution_backward_mul_native_batch_norm_backward_58(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, in_ptr6, xnumel, XBLOCK : tl.constexpr): | |
xnumel = 24084480 | |
xoffset = tl.program_id(0) * XBLOCK | |
xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
xmask = xindex < xnumel | |
x3 = xindex | |
x1 = (xindex // 196) % 960 | |
tmp0 = tl.load(in_out_ptr0 + (x3), None).to(tl.float32) | |
tmp1 = tl.load(in_ptr0 + (x3), None).to(tl.float32) | |
tmp4 = tl.load(in_ptr1 + (x3), None).to(tl.float32) | |
tmp6 = tl.load(in_ptr2 + (x1), None) | |
tmp8 = tl.load(in_ptr3 + (x1), None) | |
tmp11 = tl.load(in_ptr4 + (x1), None) | |
tmp16 = tl.load(in_ptr5 + (x1), None) | |
tmp19 = tl.load(in_ptr6 + (x1), None) | |
tmp2 = tmp0 * tmp1 | |
tmp3 = tmp2.to(tl.float32) | |
tmp5 = tmp4.to(tl.float32) | |
tmp7 = tmp5 - tmp6 | |
tmp9 = 3.985969387755102e-05 | |
tmp10 = tmp8 * tmp9 | |
tmp12 = tmp11 * tmp11 | |
tmp13 = tmp10 * tmp12 | |
tmp14 = tmp7 * tmp13 | |
tmp15 = tmp3 - tmp14 | |
tmp17 = tmp16 * tmp9 | |
tmp18 = tmp15 - tmp17 | |
tmp20 = tmp11 * tmp19 | |
tmp21 = tmp18 * tmp20 | |
tmp22 = tmp21.to(tl.float32) | |
tl.store(in_out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp22, None) | |
def get_args(): | |
arg_0 = rand_strided((128, 960, 14, 14), (188160, 196, 14, 1), device='cuda:0', dtype=torch.float16) | |
arg_1 = rand_strided((128, 960, 14, 14), (188160, 196, 14, 1), device='cuda:0', dtype=torch.float16) | |
arg_2 = rand_strided((128, 960, 14, 14), (188160, 196, 14, 1), device='cuda:0', dtype=torch.float16) | |
arg_3 = rand_strided((1, 960, 1, 1), (960, 1, 1, 1), device='cuda:0', dtype=torch.float32) | |
arg_4 = rand_strided((960,), (1,), device='cuda:0', dtype=torch.float32) | |
arg_5 = rand_strided((960,), (1,), device='cuda:0', dtype=torch.float32) | |
arg_6 = rand_strided((960,), (1,), device='cuda:0', dtype=torch.float32) | |
arg_7 = rand_strided((960,), (1,), device='cuda:0', dtype=torch.float32) | |
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7, | |
def call(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
stream0 = get_cuda_stream(0) | |
triton_poi_fused__native_batch_norm_legit_functional_convolution_backward_mul_native_batch_norm_backward_58.run(*args, 24084480, grid=grid(24084480), stream=stream0) | |
def benchmark_all_configs(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
return triton_poi_fused__native_batch_norm_legit_functional_convolution_backward_mul_native_batch_norm_backward_58.benchmark_all_configs(*args, 24084480, grid=grid(24084480)) | |
if __name__ == '__main__': | |
from torch._inductor.utils import get_num_bytes | |
from triton.testing import do_bench | |
args = get_args() | |
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9 | |
gb_per_s = num_gb / (ms / 1e3) | |
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
''') | |
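# NOTE (annotation): kernel 58 applies the standard batch-norm input-gradient
# formula
#   dx = (dy - sum_dy / M - (x - mean) * inv_std**2 * (sum_dy_xmu / M)) * inv_std * gamma
# with M = N * H * W = 128 * 14 * 14 = 25088; the literal 3.985969387755102e-05
# is exactly 1 / 25088. Here dy is rebuilt as in_out_ptr0 * in_ptr0 (the fused
# aten.mul), in_ptr3/in_ptr5 carry the sums from kernel 57, in_ptr4 and in_ptr6
# are presumably inv_std and the bn weight, and the fp16 result overwrites
# in_out_ptr0 in place.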
# kernel path: /tmp/torchinductor_shunting/v5/cv5qx4oyij3hkwbqtmi4skgsidu4qp4eyvxkday5hyy3p7hf74af.py | |
# Original ATen: aten._to_copy | |
# aten._to_copy => convert_element_type_498 | |
triton_poi_fused__to_copy_59 = async_compile.triton(''' | |
import triton | |
import triton.language as tl | |
from torch._inductor.ir import ReductionHint | |
from torch._inductor.ir import TileHint | |
from torch._inductor.triton_heuristics import pointwise | |
from torch._inductor.utils import instance_descriptor | |
from torch._dynamo.testing import rand_strided | |
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
import torch | |
from torch._inductor.triton_heuristics import grid | |
@pointwise(size_hints=[262144], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]}) | |
@triton.jit | |
def triton_poi_fused__to_copy_59(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr): | |
xnumel = 153600 | |
xoffset = tl.program_id(0) * XBLOCK | |
xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
xmask = xindex < xnumel | |
x0 = xindex | |
tmp0 = tl.load(in_ptr0 + (x0), None).to(tl.float32) | |
tmp1 = tmp0.to(tl.float32) | |
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, None) | |
def get_args(): | |
arg_0 = rand_strided((960, 160, 1, 1), (160, 1, 1, 1), device='cuda:0', dtype=torch.float16) | |
arg_1 = rand_strided((960, 160, 1, 1), (160, 1, 1, 1), device='cuda:0', dtype=torch.float32) | |
return arg_0, arg_1, | |
def call(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
stream0 = get_cuda_stream(0) | |
triton_poi_fused__to_copy_59.run(*args, 153600, grid=grid(153600), stream=stream0) | |
def benchmark_all_configs(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
return triton_poi_fused__to_copy_59.benchmark_all_configs(*args, 153600, grid=grid(153600)) | |
if __name__ == '__main__': | |
from torch._inductor.utils import get_num_bytes | |
from triton.testing import do_bench | |
args = get_args() | |
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9 | |
gb_per_s = num_gb / (ms / 1e3) | |
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
''') | |
# kernel path: /tmp/torchinductor_shunting/vo/cvowglnuesyuof2jgfmeotsjew6qfwzd2dbspy34jb56dgnewazc.py | |
# Original ATen: aten._native_batch_norm_legit_functional, aten.native_batch_norm_backward | |
# aten._native_batch_norm_legit_functional => convert_element_type_304 | |
# aten.native_batch_norm_backward => convert_element_type_499, mul_639, sub_126, sum_40, sum_41 | |
triton_red_fused__native_batch_norm_legit_functional_native_batch_norm_backward_60 = async_compile.triton(''' | |
import triton | |
import triton.language as tl | |
from torch._inductor.ir import ReductionHint | |
from torch._inductor.ir import TileHint | |
from torch._inductor.triton_heuristics import reduction | |
from torch._inductor.utils import instance_descriptor | |
from torch._dynamo.testing import rand_strided | |
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
import torch | |
from torch._inductor.triton_heuristics import grid | |
@reduction( | |
size_hints=[1024, 8192], | |
reduction_hint=ReductionHint.INNER, | |
filename=__file__, | |
meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: 'i32', 6: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6), equal_to_1=())]} | |
) | |
@triton.jit | |
def triton_red_fused__native_batch_norm_legit_functional_native_batch_norm_backward_60(in_ptr0, in_ptr1, in_ptr2, out_ptr0, out_ptr1, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr): | |
xnumel = 640 | |
rnumel = 6272 | |
xoffset = tl.program_id(0) * XBLOCK | |
xindex = xoffset + tl.arange(0, XBLOCK)[:, None] | |
xmask = xindex < xnumel | |
rbase = tl.arange(0, RBLOCK)[None, :] | |
x0 = xindex % 160 | |
x1 = (xindex // 160) | |
_tmp2 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0 | |
x3 = xindex | |
tmp5 = tl.load(in_ptr2 + (x0), xmask) | |
_tmp8 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0 | |
for roffset in range(0, rnumel, RBLOCK): | |
rindex = roffset + rbase | |
rmask = rindex < rnumel | |
r2 = rindex | |
tmp0 = tl.load(in_ptr0 + ((196*x0) + (31360*(r2 // 196)) + (1003520*x1) + (r2 % 196)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32) | |
tmp3 = tl.load(in_ptr1 + ((196*x0) + (31360*(r2 // 196)) + (1003520*x1) + (r2 % 196)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32) | |
tmp1 = tmp0.to(tl.float32) | |
_tmp2 = tl.where(rmask & xmask, _tmp2 + tmp1, _tmp2) | |
tmp4 = tmp3.to(tl.float32) | |
tmp6 = tmp4 - tmp5 | |
tmp7 = tmp1 * tmp6 | |
_tmp8 = tl.where(rmask & xmask, _tmp8 + tmp7, _tmp8) | |
tmp2 = tl.sum(_tmp2, 1)[:, None] | |
tl.store(out_ptr0 + x3, tmp2, xmask) | |
tmp8 = tl.sum(_tmp8, 1)[:, None] | |
tl.store(out_ptr1 + x3, tmp8, xmask) | |
def get_args(): | |
arg_0 = rand_strided((128, 160, 14, 14), (31360, 196, 14, 1), device='cuda:0', dtype=torch.float16) | |
arg_1 = rand_strided((128, 160, 14, 14), (31360, 196, 14, 1), device='cuda:0', dtype=torch.float16) | |
arg_2 = rand_strided((1, 160, 1, 1), (160, 1, 1, 1), device='cuda:0', dtype=torch.float32) | |
arg_3 = rand_strided((160, 4), (1, 160), device='cuda:0', dtype=torch.float32) | |
arg_4 = rand_strided((160, 4), (1, 160), device='cuda:0', dtype=torch.float32) | |
return arg_0, arg_1, arg_2, arg_3, arg_4, | |
def call(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
stream0 = get_cuda_stream(0) | |
triton_red_fused__native_batch_norm_legit_functional_native_batch_norm_backward_60.run(*args, 640, 6272, grid=grid(640), stream=stream0) | |
def benchmark_all_configs(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
return triton_red_fused__native_batch_norm_legit_functional_native_batch_norm_backward_60.benchmark_all_configs(*args, 640, 6272, grid=grid(640)) | |
if __name__ == '__main__': | |
from torch._inductor.utils import get_num_bytes | |
from triton.testing import do_bench | |
args = get_args() | |
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9 | |
gb_per_s = num_gb / (ms / 1e3) | |
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
''') | |
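# NOTE (annotation): kernel 60 is stage one of a split reduction: xnumel = 640
# is 160 channels times 4 batch slices of 32 images each (1003520 = 32 * 31360
# elements per slice), so it writes (160, 4) partial sums of dy and of
# dy * (x - mean). Kernels 61/62 below fold the 4 partials into the final
# per-channel sums.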
# kernel path: /tmp/torchinductor_shunting/v7/cv7kgtxh5cmxf5wfqyzhdvjjtikzk2pxpebuocdoaffoo64kkmaj.py | |
# Original ATen: aten.native_batch_norm_backward | |
# aten.native_batch_norm_backward => convert_element_type_499, sum_40 | |
triton_per_fused_native_batch_norm_backward_61 = async_compile.triton(''' | |
import triton | |
import triton.language as tl | |
from torch._inductor.ir import ReductionHint | |
from torch._inductor.ir import TileHint | |
from torch._inductor.triton_heuristics import persistent_reduction | |
from torch._inductor.utils import instance_descriptor | |
from torch._dynamo.testing import rand_strided | |
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
import torch | |
from torch._inductor.triton_heuristics import grid | |
@persistent_reduction( | |
size_hints=[256, 4], | |
reduction_hint=ReductionHint.INNER, | |
filename=__file__, | |
meta={'signature': {0: '*fp32', 1: '*fp32', 2: 'i32', 3: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]} | |
) | |
@triton.jit | |
def triton_per_fused_native_batch_norm_backward_61(in_ptr0, out_ptr0, xnumel, rnumel, XBLOCK : tl.constexpr): | |
xnumel = 160 | |
rnumel = 4 | |
RBLOCK: tl.constexpr = 4 | |
xoffset = tl.program_id(0) * XBLOCK | |
xindex = xoffset + tl.arange(0, XBLOCK)[:, None] | |
xmask = xindex < xnumel | |
rindex = tl.arange(0, RBLOCK)[None, :] | |
rmask = rindex < rnumel | |
r1 = rindex | |
x0 = xindex | |
tmp0 = tl.load(in_ptr0 + (x0 + (160*r1)), rmask & xmask, other=0) | |
tmp2 = tl.where(rmask & xmask, tmp0, 0) | |
tmp3 = tl.sum(tmp2, 1)[:, None] | |
tl.store(out_ptr0 + x0, tmp3, xmask) | |
def get_args(): | |
arg_0 = rand_strided((160, 4), (1, 160), device='cuda:0', dtype=torch.float32) | |
arg_1 = rand_strided((160,), (1,), device='cuda:0', dtype=torch.float32) | |
return arg_0, arg_1, | |
def call(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
stream0 = get_cuda_stream(0) | |
triton_per_fused_native_batch_norm_backward_61.run(*args, 160, 4, grid=grid(160), stream=stream0) | |
def benchmark_all_configs(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
return triton_per_fused_native_batch_norm_backward_61.benchmark_all_configs(*args, 160, 4, grid=grid(160)) | |
if __name__ == '__main__': | |
from torch._inductor.utils import get_num_bytes | |
from triton.testing import do_bench | |
args = get_args() | |
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9 | |
gb_per_s = num_gb / (ms / 1e3) | |
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
''') | |
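# NOTE (annotation): kernels 61 and 62 are stage two of the split reduction:
# each folds the (160, 4) fp32 partials from kernel 60 into a (160,) per-channel
# total; kernel 62 additionally scales its total by in_ptr1 (presumably
# inv_std) to emit the weight gradient alongside the raw sum.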
# kernel path: /tmp/torchinductor_shunting/ie/cie5izdtlxlumenajauzzkl6hk7z6q5jgsvyedwv7n3ohhf6bq2s.py | |
# Original ATen: aten._native_batch_norm_legit_functional, aten.native_batch_norm_backward | |
# aten._native_batch_norm_legit_functional => convert_element_type_304 | |
# aten.native_batch_norm_backward => convert_element_type_499, mul_639, mul_647, sub_126, sum_41 | |
triton_per_fused__native_batch_norm_legit_functional_native_batch_norm_backward_62 = async_compile.triton(''' | |
import triton | |
import triton.language as tl | |
from torch._inductor.ir import ReductionHint | |
from torch._inductor.ir import TileHint | |
from torch._inductor.triton_heuristics import persistent_reduction | |
from torch._inductor.utils import instance_descriptor | |
from torch._dynamo.testing import rand_strided | |
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
import torch | |
from torch._inductor.triton_heuristics import grid | |
@persistent_reduction( | |
size_hints=[256, 4], | |
reduction_hint=ReductionHint.INNER, | |
filename=__file__, | |
meta={'signature': {0: '*fp32', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: 'i32', 5: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4), equal_to_1=())]} | |
) | |
@triton.jit | |
def triton_per_fused__native_batch_norm_legit_functional_native_batch_norm_backward_62(in_ptr0, in_ptr1, out_ptr0, out_ptr1, xnumel, rnumel, XBLOCK : tl.constexpr): | |
xnumel = 160 | |
rnumel = 4 | |
RBLOCK: tl.constexpr = 4 | |
xoffset = tl.program_id(0) * XBLOCK | |
xindex = xoffset + tl.arange(0, XBLOCK)[:, None] | |
xmask = xindex < xnumel | |
rindex = tl.arange(0, RBLOCK)[None, :] | |
rmask = rindex < rnumel | |
r1 = rindex | |
x0 = xindex | |
tmp0 = tl.load(in_ptr0 + (x0 + (160*r1)), rmask & xmask, other=0) | |
tmp4 = tl.load(in_ptr1 + (x0), xmask) | |
tmp2 = tl.where(rmask & xmask, tmp0, 0) | |
tmp3 = tl.sum(tmp2, 1)[:, None] | |
tmp5 = tmp3 * tmp4 | |
tl.store(out_ptr1 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp5, xmask) | |
tl.store(out_ptr0 + x0, tmp3, xmask) | |
def get_args(): | |
arg_0 = rand_strided((160, 4), (1, 160), device='cuda:0', dtype=torch.float32) | |
arg_1 = rand_strided((160,), (1,), device='cuda:0', dtype=torch.float32) | |
arg_2 = rand_strided((160,), (1,), device='cuda:0', dtype=torch.float32) | |
arg_3 = rand_strided((160,), (1,), device='cuda:0', dtype=torch.float32) | |
return arg_0, arg_1, arg_2, arg_3, | |
def call(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
stream0 = get_cuda_stream(0) | |
triton_per_fused__native_batch_norm_legit_functional_native_batch_norm_backward_62.run(*args, 160, 4, grid=grid(160), stream=stream0) | |
def benchmark_all_configs(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
return triton_per_fused__native_batch_norm_legit_functional_native_batch_norm_backward_62.benchmark_all_configs(*args, 160, 4, grid=grid(160)) | |
if __name__ == '__main__': | |
from torch._inductor.utils import get_num_bytes | |
from triton.testing import do_bench | |
args = get_args() | |
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9 | |
gb_per_s = num_gb / (ms / 1e3) | |
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
''') | |
# kernel path: /tmp/torchinductor_shunting/lu/clucluq3crlm7gshoftvwdbylf2hruep5zy2kdudpczi5ai6bwrm.py | |
# Original ATen: aten._native_batch_norm_legit_functional, aten.native_batch_norm_backward | |
# aten._native_batch_norm_legit_functional => convert_element_type_304 | |
# aten.native_batch_norm_backward => convert_element_type_499, convert_element_type_501, mul_645, mul_646, sub_126, sub_128, sub_129 | |
triton_poi_fused__native_batch_norm_legit_functional_native_batch_norm_backward_63 = async_compile.triton(''' | |
import triton | |
import triton.language as tl | |
from torch._inductor.ir import ReductionHint | |
from torch._inductor.ir import TileHint | |
from torch._inductor.triton_heuristics import pointwise | |
from torch._inductor.utils import instance_descriptor | |
from torch._dynamo.testing import rand_strided | |
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
import torch | |
from torch._inductor.triton_heuristics import grid | |
@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp32', 6: '*fp32', 7: '*fp16', 8: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8), equal_to_1=())]}) | |
@triton.jit | |
def triton_poi_fused__native_batch_norm_legit_functional_native_batch_norm_backward_63(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, in_ptr6, out_ptr0, xnumel, XBLOCK : tl.constexpr): | |
xnumel = 4014080 | |
xoffset = tl.program_id(0) * XBLOCK | |
xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
xmask = xindex < xnumel | |
x3 = xindex | |
x1 = (xindex // 196) % 160 | |
tmp0 = tl.load(in_ptr0 + (x3), None).to(tl.float32) | |
tmp2 = tl.load(in_ptr1 + (x3), None).to(tl.float32) | |
tmp4 = tl.load(in_ptr2 + (x1), None) | |
tmp6 = tl.load(in_ptr3 + (x1), None) | |
tmp9 = tl.load(in_ptr4 + (x1), None) | |
tmp14 = tl.load(in_ptr5 + (x1), None) | |
tmp17 = tl.load(in_ptr6 + (x1), None) | |
tmp1 = tmp0.to(tl.float32) | |
tmp3 = tmp2.to(tl.float32) | |
tmp5 = tmp3 - tmp4 | |
tmp7 = 3.985969387755102e-05 | |
tmp8 = tmp6 * tmp7 | |
tmp10 = tmp9 * tmp9 | |
tmp11 = tmp8 * tmp10 | |
tmp12 = tmp5 * tmp11 | |
tmp13 = tmp1 - tmp12 | |
tmp15 = tmp14 * tmp7 | |
tmp16 = tmp13 - tmp15 | |
tmp18 = tmp9 * tmp17 | |
tmp19 = tmp16 * tmp18 | |
tmp20 = tmp19.to(tl.float32) | |
tl.store(out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp20, None) | |
def get_args(): | |
arg_0 = rand_strided((128, 160, 14, 14), (31360, 196, 14, 1), device='cuda:0', dtype=torch.float16) | |
arg_1 = rand_strided((128, 160, 14, 14), (31360, 196, 14, 1), device='cuda:0', dtype=torch.float16) | |
arg_2 = rand_strided((1, 160, 1, 1), (160, 1, 1, 1), device='cuda:0', dtype=torch.float32) | |
arg_3 = rand_strided((160,), (1,), device='cuda:0', dtype=torch.float32) | |
arg_4 = rand_strided((160,), (1,), device='cuda:0', dtype=torch.float32) | |
arg_5 = rand_strided((160,), (1,), device='cuda:0', dtype=torch.float32) | |
arg_6 = rand_strided((160,), (1,), device='cuda:0', dtype=torch.float32) | |
arg_7 = rand_strided((128, 160, 14, 14), (31360, 196, 14, 1), device='cuda:0', dtype=torch.float16) | |
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7, | |
def call(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
stream0 = get_cuda_stream(0) | |
triton_poi_fused__native_batch_norm_legit_functional_native_batch_norm_backward_63.run(*args, 4014080, grid=grid(4014080), stream=stream0) | |
def benchmark_all_configs(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
return triton_poi_fused__native_batch_norm_legit_functional_native_batch_norm_backward_63.benchmark_all_configs(*args, 4014080, grid=grid(4014080)) | |
if __name__ == '__main__': | |
from torch._inductor.utils import get_num_bytes | |
from triton.testing import do_bench | |
args = get_args() | |
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9 | |
gb_per_s = num_gb / (ms / 1e3) | |
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
''') | |
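# NOTE (annotation): kernel 63 is the same batch-norm input-gradient formula as
# kernel 58 (including the identical 1 / 25088 factor), here applied to the
# 160-channel tensor using the sums produced by kernels 60-62, and writing to a
# fresh fp16 output rather than in place.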
# kernel path: /tmp/torchinductor_shunting/u6/cu6t7jgofejrpr6jiujpu5wwc5mlffc5k4hwyiqtb7tiyjxa6ec4.py | |
# Original ATen: aten._to_copy | |
# aten._to_copy => convert_element_type_502 | |
triton_poi_fused__to_copy_64 = async_compile.triton(''' | |
import triton | |
import triton.language as tl | |
from torch._inductor.ir import ReductionHint | |
from torch._inductor.ir import TileHint | |
from torch._inductor.triton_heuristics import pointwise | |
from torch._inductor.utils import instance_descriptor | |
from torch._dynamo.testing import rand_strided | |
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
import torch | |
from torch._inductor.triton_heuristics import grid | |
@pointwise(size_hints=[32768], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]}) | |
@triton.jit | |
def triton_poi_fused__to_copy_64(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr): | |
xnumel = 19200 | |
xoffset = tl.program_id(0) * XBLOCK | |
xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
xmask = xindex < xnumel | |
x0 = xindex | |
tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32) | |
tmp1 = tmp0.to(tl.float32) | |
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask) | |
def get_args(): | |
arg_0 = rand_strided((80, 240, 1, 1), (240, 1, 1, 1), device='cuda:0', dtype=torch.float16) | |
arg_1 = rand_strided((80, 240, 1, 1), (240, 1, 1, 1), device='cuda:0', dtype=torch.float32) | |
return arg_0, arg_1, | |
def call(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
stream0 = get_cuda_stream(0) | |
triton_poi_fused__to_copy_64.run(*args, 19200, grid=grid(19200), stream=stream0) | |
def benchmark_all_configs(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
return triton_poi_fused__to_copy_64.benchmark_all_configs(*args, 19200, grid=grid(19200)) | |
if __name__ == '__main__': | |
from torch._inductor.utils import get_num_bytes | |
from triton.testing import do_bench | |
args = get_args() | |
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9 | |
gb_per_s = num_gb / (ms / 1e3) | |
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
''') | |
# kernel path: /tmp/torchinductor_shunting/rj/crjqpfvxgdlaoka6zisl4dxwjeaxw453cmtsf23px34uebd6u4oo.py | |
# Original ATen: aten.cat | |
# aten.cat => cat_48 | |
triton_poi_fused_cat_65 = async_compile.triton(''' | |
import triton | |
import triton.language as tl | |
from torch._inductor.ir import ReductionHint | |
from torch._inductor.ir import TileHint | |
from torch._inductor.triton_heuristics import pointwise | |
from torch._inductor.utils import instance_descriptor | |
from torch._dynamo.testing import rand_strided | |
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
import torch | |
from torch._inductor.triton_heuristics import grid | |
@pointwise(size_hints=[8388608], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]}) | |
@triton.jit | |
def triton_poi_fused_cat_65(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr): | |
xnumel = 6021120 | |
xoffset = tl.program_id(0) * XBLOCK | |
xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
xmask = xindex < xnumel | |
x2 = xindex | |
x0 = xindex % 47040 | |
x1 = (xindex // 47040) | |
tmp0 = tl.load(in_ptr0 + (x2), None).to(tl.float32) | |
tl.store(out_ptr0 + (x0 + (94080*x1) + tl.zeros([XBLOCK], tl.int32)), tmp0, None) | |
def get_args(): | |
arg_0 = rand_strided((128, 240, 14, 14), (47040, 196, 14, 1), device='cuda:0', dtype=torch.float16) | |
arg_1 = rand_strided((128, 240, 14, 14), (94080, 196, 14, 1), device='cuda:0', dtype=torch.float16) | |
return arg_0, arg_1, | |
def call(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
stream0 = get_cuda_stream(0) | |
triton_poi_fused_cat_65.run(*args, 6021120, grid=grid(6021120), stream=stream0) | |
def benchmark_all_configs(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
return triton_poi_fused_cat_65.benchmark_all_configs(*args, 6021120, grid=grid(6021120)) | |
if __name__ == '__main__': | |
from torch._inductor.utils import get_num_bytes | |
from triton.testing import do_bench | |
args = get_args() | |
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9 | |
gb_per_s = num_gb / (ms / 1e3) | |
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
''') | |
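# NOTE (annotation): same scatter-into-cat pattern as kernel 56, but the
# destination here is a (128, 480, 14, 14) buffer (per-image stride
# 94080 = 480 * 196), i.e. a 2 * 240-channel concat rather than the 4 * 240 one
# above.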
# kernel path: /tmp/torchinductor_shunting/rc/crc555gyfmwt6m3oebfawzf77swacndeysz6litdvgmtz7aw3cnb.py | |
# Original ATen: aten.mul, aten.sigmoid, aten.sigmoid_backward, aten.silu, aten.sum | |
# aten.mul => mul_648 | |
# aten.sigmoid => sigmoid_47 | |
# aten.sigmoid_backward => convert_element_type_504, convert_element_type_505, convert_element_type_506, mul_650, mul_651, sub_130 | |
# aten.silu => convert_element_type_294, convert_element_type_295, mul_353, sigmoid_45 | |
# aten.sum => sum_42 | |
triton_per_fused_mul_sigmoid_sigmoid_backward_silu_sum_66 = async_compile.triton(''' | |
import triton | |
import triton.language as tl | |
from torch._inductor.ir import ReductionHint | |
from torch._inductor.ir import TileHint | |
from torch._inductor.triton_heuristics import persistent_reduction | |
from torch._inductor.utils import instance_descriptor | |
from torch._dynamo.testing import rand_strided | |
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
import torch | |
from torch._inductor.triton_heuristics import grid | |
@persistent_reduction( | |
size_hints=[65536, 256], | |
reduction_hint=ReductionHint.INNER, | |
filename=__file__, | |
meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp16', 4: 'i32', 5: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4), equal_to_1=())]} | |
) | |
@triton.jit | |
def triton_per_fused_mul_sigmoid_sigmoid_backward_silu_sum_66(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, xnumel, rnumel, XBLOCK : tl.constexpr): | |
xnumel = 61440 | |
rnumel = 196 | |
RBLOCK: tl.constexpr = 256 | |
xoffset = tl.program_id(0) * XBLOCK | |
xindex = xoffset + tl.arange(0, XBLOCK)[:, None] | |
xmask = xindex < xnumel | |
rindex = tl.arange(0, RBLOCK)[None, :] | |
rmask = rindex < rnumel | |
r1 = rindex | |
x0 = xindex | |
tmp0 = tl.load(in_ptr0 + (r1 + (196*x0)), rmask, other=0).to(tl.float32) | |
tmp1 = tl.load(in_ptr1 + (r1 + (196*x0)), rmask, other=0).to(tl.float32) | |
tmp11 = tl.load(in_ptr2 + (x0), None).to(tl.float32) | |
tmp2 = tmp1.to(tl.float32) | |
tmp3 = tl.sigmoid(tmp2) | |
tmp4 = tmp2 * tmp3 | |
tmp5 = tmp4.to(tl.float32) | |
tmp6 = tmp0 * tmp5 | |
tmp8 = tl.where(rmask, tmp6, 0) | |
tmp9 = tl.sum(tmp8, 1)[:, None] | |
tmp10 = tmp9.to(tl.float32) | |
tmp12 = tl.sigmoid(tmp11) | |
tmp13 = tmp12.to(tl.float32) | |
tmp14 = 1.0 | |
tmp15 = tmp14 - tmp13 | |
tmp16 = tmp13 * tmp15 | |
tmp17 = tmp10 * tmp16 | |
tmp18 = tmp17.to(tl.float32) | |
tl.store(in_out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp18, None) | |
def get_args(): | |
arg_0 = rand_strided((128, 480, 1, 1), (480, 1, 1, 1), device='cuda:0', dtype=torch.float16) | |
arg_1 = rand_strided((128, 480, 14, 14), (94080, 196, 14, 1), device='cuda:0', dtype=torch.float16) | |
arg_2 = rand_strided((128, 480, 14, 14), (94080, 196, 14, 1), device='cuda:0', dtype=torch.float16) | |
arg_3 = rand_strided((128, 480, 1, 1), (480, 1, 1, 1), device='cuda:0', dtype=torch.float16) | |
return arg_0, arg_1, arg_2, arg_3, | |
def call(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
stream0 = get_cuda_stream(0) | |
triton_per_fused_mul_sigmoid_sigmoid_backward_silu_sum_66.run(*args, 61440, 196, grid=grid(61440), stream=stream0) | |
def benchmark_all_configs(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
return triton_per_fused_mul_sigmoid_sigmoid_backward_silu_sum_66.benchmark_all_configs(*args, 61440, 196, grid=grid(61440)) | |
if __name__ == '__main__': | |
from torch._inductor.utils import get_num_bytes | |
from triton.testing import do_bench | |
args = get_args() | |
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9 | |
gb_per_s = num_gb / (ms / 1e3) | |
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
''') | |
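# NOTE (annotation): kernel 66 is the squeeze-and-excitation gate backward: per
# (image, channel) it sums grad_y * silu(x) over the 196 spatial positions, then
# multiplies by sigmoid'(s) = sigmoid(s) * (1 - sigmoid(s)) of the SE logit.
# A minimal eager sketch, with hypothetical names (not part of the generated module):
def ref_se_gate_grad(grad_y, x_pre, se_logit):
    # grad_y, x_pre: (N, C, 14, 14) fp16; se_logit: (N, C, 1, 1) fp16
    x32 = x_pre.float()
    silu = x32 * torch.sigmoid(x32)                            # recomputed forward SiLU
    g = (grad_y.float() * silu).sum(dim=(2, 3), keepdim=True)  # aten.sum over H, W
    s = torch.sigmoid(se_logit.float())
    return (g * s * (1.0 - s)).half()                          # sigmoid backward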
# kernel path: /tmp/torchinductor_shunting/n4/cn4fg2jpjbb2zgpwsyrzkkvrz4ql4zzxfplbwfmfeem7estzylry.py | |
# Original ATen: aten._to_copy, aten.convolution_backward | |
# aten._to_copy => convert_element_type_508 | |
# aten.convolution_backward => sum_43 | |
triton_per_fused__to_copy_convolution_backward_67 = async_compile.triton(''' | |
import triton | |
import triton.language as tl | |
from torch._inductor.ir import ReductionHint | |
from torch._inductor.ir import TileHint | |
from torch._inductor.triton_heuristics import persistent_reduction | |
from torch._inductor.utils import instance_descriptor | |
from torch._dynamo.testing import rand_strided | |
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
import torch | |
from torch._inductor.triton_heuristics import grid | |
@persistent_reduction( | |
size_hints=[512, 128], | |
reduction_hint=ReductionHint.INNER, | |
filename=__file__, | |
meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32', 3: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3), equal_to_1=())]} | |
) | |
@triton.jit | |
def triton_per_fused__to_copy_convolution_backward_67(in_ptr0, out_ptr1, xnumel, rnumel, XBLOCK : tl.constexpr): | |
xnumel = 480 | |
rnumel = 128 | |
RBLOCK: tl.constexpr = 128 | |
xoffset = tl.program_id(0) * XBLOCK | |
xindex = xoffset + tl.arange(0, XBLOCK)[:, None] | |
xmask = xindex < xnumel | |
rindex = tl.arange(0, RBLOCK)[None, :] | |
rmask = rindex < rnumel | |
r1 = rindex | |
x0 = xindex | |
tmp0 = tl.load(in_ptr0 + (x0 + (480*r1)), rmask & xmask, other=0).to(tl.float32) | |
tmp2 = tl.where(rmask & xmask, tmp0, 0) | |
tmp3 = tl.sum(tmp2, 1)[:, None] | |
tmp4 = tmp3.to(tl.float32) | |
tl.store(out_ptr1 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp4, xmask) | |
def get_args(): | |
arg_0 = rand_strided((128, 480, 1, 1), (480, 1, 1, 1), device='cuda:0', dtype=torch.float16) | |
arg_1 = rand_strided((480,), (1,), device='cuda:0', dtype=torch.float32) | |
return arg_0, arg_1, | |
def call(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
stream0 = get_cuda_stream(0) | |
triton_per_fused__to_copy_convolution_backward_67.run(*args, 480, 128, grid=grid(480), stream=stream0) | |
def benchmark_all_configs(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
return triton_per_fused__to_copy_convolution_backward_67.benchmark_all_configs(*args, 480, 128, grid=grid(480)) | |
if __name__ == '__main__': | |
from torch._inductor.utils import get_num_bytes | |
from triton.testing import do_bench | |
args = get_args() | |
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9 | |
gb_per_s = num_gb / (ms / 1e3) | |
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
''') | |
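# NOTE (annotation): kernel 67 folds the (128, 480, 1, 1) gate gradient over the
# batch dimension into a (480,) fp32 vector; sum_43 here is presumably the bias
# gradient of the SE 1x1 convolution. Eager equivalent:
# grad.float().sum(dim=0).reshape(480).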
# kernel path: /tmp/torchinductor_shunting/gx/cgx4he25cg6tmu62uuz3hbvmsgstxlvguwwpdavmmk5v7cb53khs.py | |
# Original ATen: aten._to_copy | |
# aten._to_copy => convert_element_type_507 | |
triton_poi_fused__to_copy_68 = async_compile.triton(''' | |
import triton | |
import triton.language as tl | |
from torch._inductor.ir import ReductionHint | |
from torch._inductor.ir import TileHint | |
from torch._inductor.triton_heuristics import pointwise | |
from torch._inductor.utils import instance_descriptor | |
from torch._dynamo.testing import rand_strided | |
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
import torch | |
from torch._inductor.triton_heuristics import grid | |
@pointwise(size_hints=[65536], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]}) | |
@triton.jit | |
def triton_poi_fused__to_copy_68(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr): | |
xnumel = 38400 | |
xoffset = tl.program_id(0) * XBLOCK | |
xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
xmask = xindex < xnumel | |
x0 = xindex | |
tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32) | |
tmp1 = tmp0.to(tl.float32) | |
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask) | |
def get_args(): | |
arg_0 = rand_strided((480, 80, 1, 1), (80, 1, 1, 1), device='cuda:0', dtype=torch.float16) | |
arg_1 = rand_strided((480, 80, 1, 1), (80, 1, 1, 1), device='cuda:0', dtype=torch.float32) | |
return arg_0, arg_1, | |
def call(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
stream0 = get_cuda_stream(0) | |
triton_poi_fused__to_copy_68.run(*args, 38400, grid=grid(38400), stream=stream0) | |
def benchmark_all_configs(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
return triton_poi_fused__to_copy_68.benchmark_all_configs(*args, 38400, grid=grid(38400)) | |
if __name__ == '__main__': | |
from torch._inductor.utils import get_num_bytes | |
from triton.testing import do_bench | |
args = get_args() | |
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9 | |
gb_per_s = num_gb / (ms / 1e3) | |
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
''') | |
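# NOTE (editor): kernel 68 is a pure dtype cast (aten._to_copy): it reads the fp16 | |
# copy of a (480, 80, 1, 1) conv weight and writes it back as fp32, elementwise. | |
# Kernels 69, 73, 75, 77, 79 and 83 below repeat the same pattern for other weight | |
# tensors; only xnumel and the shapes differ. The eager equivalent is simply | |
# weight.to(torch.float32). | |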
# kernel path: /tmp/torchinductor_shunting/57/c57nn4lxfd7xfnwbsod5xu57xcqkrswf7vdbyz76bditl462irc4.py | |
# Original ATen: aten._to_copy | |
# aten._to_copy => convert_element_type_509 | |
triton_poi_fused__to_copy_69 = async_compile.triton(''' | |
import triton | |
import triton.language as tl | |
from torch._inductor.ir import ReductionHint | |
from torch._inductor.ir import TileHint | |
from torch._inductor.triton_heuristics import pointwise | |
from torch._inductor.utils import instance_descriptor | |
from torch._dynamo.testing import rand_strided | |
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
import torch | |
from torch._inductor.triton_heuristics import grid | |
@pointwise(size_hints=[65536], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]}) | |
@triton.jit | |
def triton_poi_fused__to_copy_69(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr): | |
xnumel = 38400 | |
xoffset = tl.program_id(0) * XBLOCK | |
xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
xmask = xindex < xnumel | |
x0 = xindex | |
tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32) | |
tmp1 = tmp0.to(tl.float32) | |
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask) | |
def get_args(): | |
arg_0 = rand_strided((80, 480, 1, 1), (480, 1, 1, 1), device='cuda:0', dtype=torch.float16) | |
arg_1 = rand_strided((80, 480, 1, 1), (480, 1, 1, 1), device='cuda:0', dtype=torch.float32) | |
return arg_0, arg_1, | |
def call(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
stream0 = get_cuda_stream(0) | |
triton_poi_fused__to_copy_69.run(*args, 38400, grid=grid(38400), stream=stream0) | |
def benchmark_all_configs(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
return triton_poi_fused__to_copy_69.benchmark_all_configs(*args, 38400, grid=grid(38400)) | |
if __name__ == '__main__': | |
from torch._inductor.utils import get_num_bytes | |
from triton.testing import do_bench | |
args = get_args() | |
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9 | |
gb_per_s = num_gb / (ms / 1e3) | |
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
''') | |
# kernel path: /tmp/torchinductor_shunting/5q/c5qqp6x25kopkdsx6gj3r2vuos77mpwiiyja2ob3sfsgwvxevysq.py | |
# Original ATen: aten._native_batch_norm_legit_functional, aten.add, aten.clone, aten.div, aten.fill, aten.mul, aten.native_batch_norm_backward, aten.sigmoid, aten.sub | |
# aten._native_batch_norm_legit_functional => convert_element_type_292 | |
# aten.add => add_324, add_325 | |
# aten.clone => clone_34 | |
# aten.div => div_5 | |
# aten.fill => full_like_13 | |
# aten.mul => mul_649, mul_655, mul_656, mul_657 | |
# aten.native_batch_norm_backward => convert_element_type_511, mul_658, mul_666, sub_133, sum_45, sum_46 | |
# aten.sigmoid => sigmoid_47, sigmoid_77 | |
# aten.sub => sub_132 | |
triton_red_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_70 = async_compile.triton(''' | |
import triton | |
import triton.language as tl | |
from torch._inductor.ir import ReductionHint | |
from torch._inductor.ir import TileHint | |
from torch._inductor.triton_heuristics import reduction | |
from torch._inductor.utils import instance_descriptor | |
from torch._dynamo.testing import rand_strided | |
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
import torch | |
from torch._inductor.triton_heuristics import grid | |
@reduction( | |
size_hints=[512, 32768], | |
reduction_hint=ReductionHint.INNER, | |
filename=__file__, | |
meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp16', 4: '*fp16', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: '*fp32', 9: '*fp32', 10: 'i32', 11: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11), equal_to_1=())]} | |
) | |
@triton.jit | |
def triton_red_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_70(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, in_ptr6, out_ptr0, out_ptr1, out_ptr2, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr): | |
xnumel = 480 | |
rnumel = 25088 | |
xoffset = tl.program_id(0) * XBLOCK | |
xindex = xoffset + tl.arange(0, XBLOCK)[:, None] | |
xmask = xindex < xnumel | |
rbase = tl.arange(0, RBLOCK)[None, :] | |
x0 = xindex | |
_tmp17 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0 | |
tmp20 = tl.load(in_ptr5 + (x0), xmask) | |
_tmp23 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0 | |
for roffset in range(0, rnumel, RBLOCK): | |
rindex = roffset + rbase | |
rmask = rindex < rnumel | |
r1 = rindex % 196 | |
r2 = (rindex // 196) | |
tmp0 = tl.load(in_ptr0 + (r1 + (196*x0) + (94080*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32) | |
tmp1 = tl.load(in_ptr1 + (x0 + (480*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32) | |
tmp4 = tl.load(in_ptr2 + (x0 + (480*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32) | |
tmp8 = tl.load(in_ptr3 + (r1 + (196*x0) + (94080*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32) | |
tmp18 = tl.load(in_ptr4 + (r1 + (196*x0) + (94080*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32) | |
tmp2 = tl.sigmoid(tmp1) | |
tmp3 = tmp0 * tmp2 | |
tmp5 = 196.0 | |
tmp6 = tmp4 / tmp5 | |
tmp7 = tmp3 + tmp6 | |
tmp9 = tl.sigmoid(tmp8) | |
tmp10 = 1.0 | |
tmp11 = tmp10 - tmp9 | |
tmp12 = tmp8 * tmp11 | |
tmp13 = tmp12 + tmp10 | |
tmp14 = tmp9 * tmp13 | |
tmp15 = tmp7 * tmp14 | |
tmp16 = tmp15.to(tl.float32) | |
_tmp17 = tl.where(rmask & xmask, _tmp17 + tmp16, _tmp17) | |
tmp19 = tmp18.to(tl.float32) | |
tmp21 = tmp19 - tmp20 | |
tmp22 = tmp16 * tmp21 | |
_tmp23 = tl.where(rmask & xmask, _tmp23 + tmp22, _tmp23) | |
tmp17 = tl.sum(_tmp17, 1)[:, None] | |
tl.store(out_ptr0 + x0, tmp17, xmask) | |
tmp23 = tl.sum(_tmp23, 1)[:, None] | |
tl.store(out_ptr1 + x0, tmp23, xmask) | |
tmp24 = tl.load(in_ptr6 + (x0), xmask) | |
tmp25 = tmp23 * tmp24 | |
tl.store(out_ptr2 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp25, xmask) | |
def get_args(): | |
arg_0 = rand_strided((128, 480, 14, 14), (94080, 196, 14, 1), device='cuda:0', dtype=torch.float16) | |
arg_1 = rand_strided((128, 480, 1, 1), (480, 1, 1, 1), device='cuda:0', dtype=torch.float16) | |
arg_2 = rand_strided((128, 480, 1, 1), (480, 1, 1, 1), device='cuda:0', dtype=torch.float16) | |
arg_3 = rand_strided((128, 480, 14, 14), (94080, 196, 14, 1), device='cuda:0', dtype=torch.float16) | |
arg_4 = rand_strided((128, 480, 14, 14), (94080, 196, 14, 1), device='cuda:0', dtype=torch.float16) | |
arg_5 = rand_strided((1, 480, 1, 1), (480, 1, 1, 1), device='cuda:0', dtype=torch.float32) | |
arg_6 = rand_strided((480,), (1,), device='cuda:0', dtype=torch.float32) | |
arg_7 = rand_strided((480,), (1,), device='cuda:0', dtype=torch.float32) | |
arg_8 = rand_strided((480,), (1,), device='cuda:0', dtype=torch.float32) | |
arg_9 = rand_strided((480,), (1,), device='cuda:0', dtype=torch.float32) | |
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7, arg_8, arg_9, | |
def call(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
stream0 = get_cuda_stream(0) | |
triton_red_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_70.run(*args, 480, 25088, grid=grid(480), stream=stream0) | |
def benchmark_all_configs(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
return triton_red_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_70.benchmark_all_configs(*args, 480, 25088, grid=grid(480)) | |
if __name__ == '__main__': | |
from torch._inductor.utils import get_num_bytes | |
from triton.testing import do_bench | |
args = get_args() | |
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9 | |
gb_per_s = num_gb / (ms / 1e3) | |
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
''') | |
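# NOTE (editor): kernel 70 fuses three backward steps into one reduction pass over the | |
# 128*14*14 = 25088 elements per channel: | |
#   (1) squeeze-excite backward: g = dy * sigmoid(gate) + d_pool / 196, where 196 = | |
#       14*14 undoes the spatial mean-pool broadcast (tmp0..tmp7); | |
#   (2) SiLU/Swish backward, via d/dx[x*sigmoid(x)] = sigmoid(x)*(1 + x*(1 - sigmoid(x))) | |
#       -- tmp9..tmp14 compute exactly this factor; | |
#   (3) the two per-channel sums batch-norm backward needs, sum(g) and sum(g*(x - mean)), | |
#       the latter also pre-scaled by in_ptr6 (presumably invstd) into out_ptr2 for the | |
#       weight gradient. This reading of the in_ptr roles is inferred from the ATen op | |
#       list above, not stated in the generated code itself. | |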
# kernel path: /tmp/torchinductor_shunting/qc/cqcavzge4fgj7thkzvjlxnonwb5xlzaboopggaaauvq4wfdtklsv.py | |
# Original ATen: aten._native_batch_norm_legit_functional, aten.add, aten.clone, aten.div, aten.fill, aten.mul, aten.native_batch_norm_backward, aten.sigmoid, aten.sub | |
# aten._native_batch_norm_legit_functional => convert_element_type_292 | |
# aten.add => add_324, add_325 | |
# aten.clone => clone_34 | |
# aten.div => div_5 | |
# aten.fill => full_like_13 | |
# aten.mul => mul_649, mul_655, mul_656, mul_657 | |
# aten.native_batch_norm_backward => convert_element_type_511, mul_664, sub_133, sub_135, sub_136 | |
# aten.sigmoid => sigmoid_47, sigmoid_77 | |
# aten.sub => sub_132 | |
triton_poi_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_71 = async_compile.triton(''' | |
import triton | |
import triton.language as tl | |
from torch._inductor.ir import ReductionHint | |
from torch._inductor.ir import TileHint | |
from torch._inductor.triton_heuristics import pointwise | |
from torch._inductor.utils import instance_descriptor | |
from torch._dynamo.testing import rand_strided | |
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
import torch | |
from torch._inductor.triton_heuristics import grid | |
@pointwise(size_hints=[16777216], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp16', 4: '*fp16', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: '*fp32', 9: '*fp32', 10: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10), equal_to_1=())]}) | |
@triton.jit | |
def triton_poi_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_71(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, in_ptr6, in_ptr7, in_ptr8, out_ptr0, xnumel, XBLOCK : tl.constexpr): | |
xnumel = 12042240 | |
xoffset = tl.program_id(0) * XBLOCK | |
xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
xmask = xindex < xnumel | |
x3 = xindex | |
x4 = (xindex // 196) | |
x1 = (xindex // 196) % 480 | |
tmp0 = tl.load(in_ptr0 + (x3), None).to(tl.float32) | |
tmp1 = tl.load(in_ptr1 + (x4), None).to(tl.float32) | |
tmp4 = tl.load(in_ptr2 + (x4), None).to(tl.float32) | |
tmp8 = tl.load(in_ptr3 + (x3), None).to(tl.float32) | |
tmp17 = tl.load(in_ptr4 + (x3), None).to(tl.float32) | |
tmp19 = tl.load(in_ptr5 + (x1), None) | |
tmp21 = tl.load(in_ptr6 + (x1), None) | |
tmp24 = tl.load(in_ptr7 + (x1), None) | |
tmp29 = tl.load(in_ptr8 + (x1), None) | |
tmp2 = tl.sigmoid(tmp1) | |
tmp3 = tmp0 * tmp2 | |
tmp5 = 196.0 | |
tmp6 = tmp4 / tmp5 | |
tmp7 = tmp3 + tmp6 | |
tmp9 = tl.sigmoid(tmp8) | |
tmp10 = 1.0 | |
tmp11 = tmp10 - tmp9 | |
tmp12 = tmp8 * tmp11 | |
tmp13 = tmp12 + tmp10 | |
tmp14 = tmp9 * tmp13 | |
tmp15 = tmp7 * tmp14 | |
tmp16 = tmp15.to(tl.float32) | |
tmp18 = tmp17.to(tl.float32) | |
tmp20 = tmp18 - tmp19 | |
tmp22 = 3.985969387755102e-05 | |
tmp23 = tmp21 * tmp22 | |
tmp25 = tmp24 * tmp24 | |
tmp26 = tmp23 * tmp25 | |
tmp27 = tmp20 * tmp26 | |
tmp28 = tmp16 - tmp27 | |
tmp30 = tmp29 * tmp22 | |
tmp31 = tmp28 - tmp30 | |
tl.store(out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp31, None) | |
def get_args(): | |
arg_0 = rand_strided((128, 480, 14, 14), (94080, 196, 14, 1), device='cuda:0', dtype=torch.float16) | |
arg_1 = rand_strided((128, 480, 1, 1), (480, 1, 1, 1), device='cuda:0', dtype=torch.float16) | |
arg_2 = rand_strided((128, 480, 1, 1), (480, 1, 1, 1), device='cuda:0', dtype=torch.float16) | |
arg_3 = rand_strided((128, 480, 14, 14), (94080, 196, 14, 1), device='cuda:0', dtype=torch.float16) | |
arg_4 = rand_strided((128, 480, 14, 14), (94080, 196, 14, 1), device='cuda:0', dtype=torch.float16) | |
arg_5 = rand_strided((1, 480, 1, 1), (480, 1, 1, 1), device='cuda:0', dtype=torch.float32) | |
arg_6 = rand_strided((480,), (1,), device='cuda:0', dtype=torch.float32) | |
arg_7 = rand_strided((480,), (1,), device='cuda:0', dtype=torch.float32) | |
arg_8 = rand_strided((480,), (1,), device='cuda:0', dtype=torch.float32) | |
arg_9 = rand_strided((128, 480, 14, 14), (94080, 196, 14, 1), device='cuda:0', dtype=torch.float32) | |
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7, arg_8, arg_9, | |
def call(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
stream0 = get_cuda_stream(0) | |
triton_poi_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_71.run(*args, 12042240, grid=grid(12042240), stream=stream0) | |
def benchmark_all_configs(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
return triton_poi_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_71.benchmark_all_configs(*args, 12042240, grid=grid(12042240)) | |
if __name__ == '__main__': | |
from torch._inductor.utils import get_num_bytes | |
from triton.testing import do_bench | |
args = get_args() | |
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9 | |
gb_per_s = num_gb / (ms / 1e3) | |
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
''') | |
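# NOTE (editor): kernel 71 is the elementwise half of the batch-norm backward whose | |
# per-channel sums kernel 70 produced. The constant 3.985969387755102e-05 is | |
# 1/25088 = 1/(128*14*14), the per-channel 1/N. Written out, the kernel computes | |
#   g - (x - mean) * invstd^2 * sum(g*(x - mean))/N - sum(g)/N | |
# which is the standard input-gradient expression of native_batch_norm_backward | |
# before the final gamma*invstd scale; that scale appears to be deferred to the | |
# per-slice kernels that consume this fp32 output (72/74/76/78 below). | |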
# kernel path: /tmp/torchinductor_shunting/ya/cya3odioikqnsayl73izdekuxqhs5rqfkr2pft6i3rty3tyluanc.py | |
# Original ATen: aten.convolution_backward | |
# aten.convolution_backward => convolution_backward_40 | |
triton_poi_fused_convolution_backward_72 = async_compile.triton(''' | |
import triton | |
import triton.language as tl | |
from torch._inductor.ir import ReductionHint | |
from torch._inductor.ir import TileHint | |
from torch._inductor.triton_heuristics import pointwise | |
from torch._inductor.utils import instance_descriptor | |
from torch._dynamo.testing import rand_strided | |
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
import torch | |
from torch._inductor.triton_heuristics import grid | |
@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp32', 2: '*fp32', 3: '*fp16', 4: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4), equal_to_1=())]}) | |
@triton.jit | |
def triton_poi_fused_convolution_backward_72(in_ptr0, in_ptr1, in_ptr2, out_ptr0, xnumel, XBLOCK : tl.constexpr): | |
xnumel = 3010560 | |
xoffset = tl.program_id(0) * XBLOCK | |
xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
xmask = xindex < xnumel | |
x2 = (xindex // 23520) | |
x3 = xindex % 23520 | |
x1 = (xindex // 196) % 120 | |
x4 = xindex | |
tmp0 = tl.load(in_ptr0 + (70560 + x3 + (94080*x2)), None) | |
tmp1 = tl.load(in_ptr1 + (360 + x1), None) | |
tmp2 = tl.load(in_ptr2 + (360 + x1), None) | |
tmp3 = tmp1 * tmp2 | |
tmp4 = tmp0 * tmp3 | |
tmp5 = tmp4.to(tl.float32) | |
tl.store(out_ptr0 + (x4 + tl.zeros([XBLOCK], tl.int32)), tmp5, None) | |
def get_args(): | |
arg_0 = rand_strided((128, 480, 14, 14), (94080, 196, 14, 1), device='cuda:0', dtype=torch.float32) | |
arg_1 = rand_strided((480,), (1,), device='cuda:0', dtype=torch.float32) | |
arg_2 = rand_strided((480,), (1,), device='cuda:0', dtype=torch.float32) | |
arg_3 = rand_strided((128, 120, 14, 14), (23520, 196, 14, 1), device='cuda:0', dtype=torch.float16) | |
return arg_0, arg_1, arg_2, arg_3, | |
def call(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
stream0 = get_cuda_stream(0) | |
triton_poi_fused_convolution_backward_72.run(*args, 3010560, grid=grid(3010560), stream=stream0) | |
def benchmark_all_configs(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
return triton_poi_fused_convolution_backward_72.benchmark_all_configs(*args, 3010560, grid=grid(3010560)) | |
if __name__ == '__main__': | |
from torch._inductor.utils import get_num_bytes | |
from triton.testing import do_bench | |
args = get_args() | |
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9 | |
gb_per_s = num_gb / (ms / 1e3) | |
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
''') | |
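# NOTE (editor): kernel 72 slices channels 360..479 out of the 480-channel gradient | |
# (load offset 70560 = 360*196) and applies a per-channel scale tmp1*tmp2 (read at | |
# offset 360 + x1 -- plausibly the gamma*invstd factor deferred from kernel 71) | |
# before casting back to fp16. Kernels 74, 76 and 78 below are the same kernel at | |
# channel offsets 240, 120 and 0, and the _to_copy kernels 73/75/77/79 cast the | |
# matching (120, 1, k, k) depthwise weights for k = 9, 7, 5, 3. Taken together this | |
# looks like the backward of a mixed-kernel-size depthwise block (MixConv-style) | |
# splitting 480 channels into four groups of 120 -- an inference from the shapes, | |
# not something the generated code states. | |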
# kernel path: /tmp/torchinductor_shunting/fe/cfetbcwbedjsjshgd4bjuqpljqnapb5dcgp46ajfwwvj2dk7ne2i.py | |
# Original ATen: aten._to_copy | |
# aten._to_copy => convert_element_type_514 | |
triton_poi_fused__to_copy_73 = async_compile.triton(''' | |
import triton | |
import triton.language as tl | |
from torch._inductor.ir import ReductionHint | |
from torch._inductor.ir import TileHint | |
from torch._inductor.triton_heuristics import pointwise | |
from torch._inductor.utils import instance_descriptor | |
from torch._dynamo.testing import rand_strided | |
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
import torch | |
from torch._inductor.triton_heuristics import grid | |
@pointwise(size_hints=[16384], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1), equal_to_1=())]}) | |
@triton.jit | |
def triton_poi_fused__to_copy_73(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr): | |
xnumel = 9720 | |
xoffset = tl.program_id(0) * XBLOCK | |
xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
xmask = xindex < xnumel | |
x0 = xindex | |
tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32) | |
tmp1 = tmp0.to(tl.float32) | |
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask) | |
def get_args(): | |
arg_0 = rand_strided((120, 1, 9, 9), (81, 81, 9, 1), device='cuda:0', dtype=torch.float16) | |
arg_1 = rand_strided((120, 1, 9, 9), (81, 81, 9, 1), device='cuda:0', dtype=torch.float32) | |
return arg_0, arg_1, | |
def call(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
stream0 = get_cuda_stream(0) | |
triton_poi_fused__to_copy_73.run(*args, 9720, grid=grid(9720), stream=stream0) | |
def benchmark_all_configs(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
return triton_poi_fused__to_copy_73.benchmark_all_configs(*args, 9720, grid=grid(9720)) | |
if __name__ == '__main__': | |
from torch._inductor.utils import get_num_bytes | |
from triton.testing import do_bench | |
args = get_args() | |
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9 | |
gb_per_s = num_gb / (ms / 1e3) | |
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
''') | |
# kernel path: /tmp/torchinductor_shunting/iq/ciqofepsagke43sfmdrbzwfpkfw4zvr3els5r3zwk7gigvnpcypw.py | |
# Original ATen: aten.convolution_backward | |
# aten.convolution_backward => convolution_backward_41 | |
triton_poi_fused_convolution_backward_74 = async_compile.triton(''' | |
import triton | |
import triton.language as tl | |
from torch._inductor.ir import ReductionHint | |
from torch._inductor.ir import TileHint | |
from torch._inductor.triton_heuristics import pointwise | |
from torch._inductor.utils import instance_descriptor | |
from torch._dynamo.testing import rand_strided | |
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
import torch | |
from torch._inductor.triton_heuristics import grid | |
@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp32', 2: '*fp32', 3: '*fp16', 4: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4), equal_to_1=())]}) | |
@triton.jit | |
def triton_poi_fused_convolution_backward_74(in_ptr0, in_ptr1, in_ptr2, out_ptr0, xnumel, XBLOCK : tl.constexpr): | |
xnumel = 3010560 | |
xoffset = tl.program_id(0) * XBLOCK | |
xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
xmask = xindex < xnumel | |
x2 = (xindex // 23520) | |
x3 = xindex % 23520 | |
x1 = (xindex // 196) % 120 | |
x4 = xindex | |
tmp0 = tl.load(in_ptr0 + (47040 + x3 + (94080*x2)), None) | |
tmp1 = tl.load(in_ptr1 + (240 + x1), None) | |
tmp2 = tl.load(in_ptr2 + (240 + x1), None) | |
tmp3 = tmp1 * tmp2 | |
tmp4 = tmp0 * tmp3 | |
tmp5 = tmp4.to(tl.float32) | |
tl.store(out_ptr0 + (x4 + tl.zeros([XBLOCK], tl.int32)), tmp5, None) | |
def get_args(): | |
arg_0 = rand_strided((128, 480, 14, 14), (94080, 196, 14, 1), device='cuda:0', dtype=torch.float32) | |
arg_1 = rand_strided((480,), (1,), device='cuda:0', dtype=torch.float32) | |
arg_2 = rand_strided((480,), (1,), device='cuda:0', dtype=torch.float32) | |
arg_3 = rand_strided((128, 120, 14, 14), (23520, 196, 14, 1), device='cuda:0', dtype=torch.float16) | |
return arg_0, arg_1, arg_2, arg_3, | |
def call(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
stream0 = get_cuda_stream(0) | |
triton_poi_fused_convolution_backward_74.run(*args, 3010560, grid=grid(3010560), stream=stream0) | |
def benchmark_all_configs(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
return triton_poi_fused_convolution_backward_74.benchmark_all_configs(*args, 3010560, grid=grid(3010560)) | |
if __name__ == '__main__': | |
from torch._inductor.utils import get_num_bytes | |
from triton.testing import do_bench | |
args = get_args() | |
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9 | |
gb_per_s = num_gb / (ms / 1e3) | |
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
''') | |
# kernel path: /tmp/torchinductor_shunting/dy/cdybcagplfulpsthbc6wmswmxelkrixxqqrrt72daichdjv2sefi.py | |
# Original ATen: aten._to_copy | |
# aten._to_copy => convert_element_type_515 | |
triton_poi_fused__to_copy_75 = async_compile.triton(''' | |
import triton | |
import triton.language as tl | |
from torch._inductor.ir import ReductionHint | |
from torch._inductor.ir import TileHint | |
from torch._inductor.triton_heuristics import pointwise | |
from torch._inductor.utils import instance_descriptor | |
from torch._dynamo.testing import rand_strided | |
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
import torch | |
from torch._inductor.triton_heuristics import grid | |
@pointwise(size_hints=[8192], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1), equal_to_1=())]}) | |
@triton.jit | |
def triton_poi_fused__to_copy_75(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr): | |
xnumel = 5880 | |
xoffset = tl.program_id(0) * XBLOCK | |
xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
xmask = xindex < xnumel | |
x0 = xindex | |
tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32) | |
tmp1 = tmp0.to(tl.float32) | |
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask) | |
def get_args(): | |
arg_0 = rand_strided((120, 1, 7, 7), (49, 49, 7, 1), device='cuda:0', dtype=torch.float16) | |
arg_1 = rand_strided((120, 1, 7, 7), (49, 49, 7, 1), device='cuda:0', dtype=torch.float32) | |
return arg_0, arg_1, | |
def call(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
stream0 = get_cuda_stream(0) | |
triton_poi_fused__to_copy_75.run(*args, 5880, grid=grid(5880), stream=stream0) | |
def benchmark_all_configs(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
return triton_poi_fused__to_copy_75.benchmark_all_configs(*args, 5880, grid=grid(5880)) | |
if __name__ == '__main__': | |
from torch._inductor.utils import get_num_bytes | |
from triton.testing import do_bench | |
args = get_args() | |
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9 | |
gb_per_s = num_gb / (ms / 1e3) | |
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
''') | |
# kernel path: /tmp/torchinductor_shunting/uy/cuyidmj6qefewok45jcr4rfmsmcj6hczovevzzaxuc52uf6hh3nc.py | |
# Original ATen: aten.convolution_backward | |
# aten.convolution_backward => convolution_backward_42 | |
triton_poi_fused_convolution_backward_76 = async_compile.triton(''' | |
import triton | |
import triton.language as tl | |
from torch._inductor.ir import ReductionHint | |
from torch._inductor.ir import TileHint | |
from torch._inductor.triton_heuristics import pointwise | |
from torch._inductor.utils import instance_descriptor | |
from torch._dynamo.testing import rand_strided | |
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
import torch | |
from torch._inductor.triton_heuristics import grid | |
@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp32', 2: '*fp32', 3: '*fp16', 4: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4), equal_to_1=())]}) | |
@triton.jit | |
def triton_poi_fused_convolution_backward_76(in_ptr0, in_ptr1, in_ptr2, out_ptr0, xnumel, XBLOCK : tl.constexpr): | |
xnumel = 3010560 | |
xoffset = tl.program_id(0) * XBLOCK | |
xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
xmask = xindex < xnumel | |
x2 = (xindex // 23520) | |
x3 = xindex % 23520 | |
x1 = (xindex // 196) % 120 | |
x4 = xindex | |
tmp0 = tl.load(in_ptr0 + (23520 + x3 + (94080*x2)), None) | |
tmp1 = tl.load(in_ptr1 + (120 + x1), None) | |
tmp2 = tl.load(in_ptr2 + (120 + x1), None) | |
tmp3 = tmp1 * tmp2 | |
tmp4 = tmp0 * tmp3 | |
tmp5 = tmp4.to(tl.float32) | |
tl.store(out_ptr0 + (x4 + tl.zeros([XBLOCK], tl.int32)), tmp5, None) | |
def get_args(): | |
arg_0 = rand_strided((128, 480, 14, 14), (94080, 196, 14, 1), device='cuda:0', dtype=torch.float32) | |
arg_1 = rand_strided((480,), (1,), device='cuda:0', dtype=torch.float32) | |
arg_2 = rand_strided((480,), (1,), device='cuda:0', dtype=torch.float32) | |
arg_3 = rand_strided((128, 120, 14, 14), (23520, 196, 14, 1), device='cuda:0', dtype=torch.float16) | |
return arg_0, arg_1, arg_2, arg_3, | |
def call(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
stream0 = get_cuda_stream(0) | |
triton_poi_fused_convolution_backward_76.run(*args, 3010560, grid=grid(3010560), stream=stream0) | |
def benchmark_all_configs(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
return triton_poi_fused_convolution_backward_76.benchmark_all_configs(*args, 3010560, grid=grid(3010560)) | |
if __name__ == '__main__': | |
from torch._inductor.utils import get_num_bytes | |
from triton.testing import do_bench | |
args = get_args() | |
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9 | |
gb_per_s = num_gb / (ms / 1e3) | |
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
''') | |
# kernel path: /tmp/torchinductor_shunting/ux/cuxkdni67km5gfbdihbo5i44iivskoy6hdt7gubbh5qgb5mkwsob.py | |
# Original ATen: aten._to_copy | |
# aten._to_copy => convert_element_type_516 | |
triton_poi_fused__to_copy_77 = async_compile.triton(''' | |
import triton | |
import triton.language as tl | |
from torch._inductor.ir import ReductionHint | |
from torch._inductor.ir import TileHint | |
from torch._inductor.triton_heuristics import pointwise | |
from torch._inductor.utils import instance_descriptor | |
from torch._dynamo.testing import rand_strided | |
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
import torch | |
from torch._inductor.triton_heuristics import grid | |
@pointwise(size_hints=[4096], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1), equal_to_1=())]}) | |
@triton.jit | |
def triton_poi_fused__to_copy_77(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr): | |
xnumel = 3000 | |
xoffset = tl.program_id(0) * XBLOCK | |
xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
xmask = xindex < xnumel | |
x0 = xindex | |
tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32) | |
tmp1 = tmp0.to(tl.float32) | |
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask) | |
def get_args(): | |
arg_0 = rand_strided((120, 1, 5, 5), (25, 25, 5, 1), device='cuda:0', dtype=torch.float16) | |
arg_1 = rand_strided((120, 1, 5, 5), (25, 25, 5, 1), device='cuda:0', dtype=torch.float32) | |
return arg_0, arg_1, | |
def call(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
stream0 = get_cuda_stream(0) | |
triton_poi_fused__to_copy_77.run(*args, 3000, grid=grid(3000), stream=stream0) | |
def benchmark_all_configs(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
return triton_poi_fused__to_copy_77.benchmark_all_configs(*args, 3000, grid=grid(3000)) | |
if __name__ == '__main__': | |
from torch._inductor.utils import get_num_bytes | |
from triton.testing import do_bench | |
args = get_args() | |
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9 | |
gb_per_s = num_gb / (ms / 1e3) | |
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
''') | |
# kernel path: /tmp/torchinductor_shunting/4r/c4rld6qtfki5aedwz7g3eqwmnt7w4mmzwymiotpz4adjvg43hqwk.py | |
# Original ATen: aten.convolution_backward | |
# aten.convolution_backward => convolution_backward_43 | |
triton_poi_fused_convolution_backward_78 = async_compile.triton(''' | |
import triton | |
import triton.language as tl | |
from torch._inductor.ir import ReductionHint | |
from torch._inductor.ir import TileHint | |
from torch._inductor.triton_heuristics import pointwise | |
from torch._inductor.utils import instance_descriptor | |
from torch._dynamo.testing import rand_strided | |
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
import torch | |
from torch._inductor.triton_heuristics import grid | |
@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp32', 2: '*fp32', 3: '*fp16', 4: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4), equal_to_1=())]}) | |
@triton.jit | |
def triton_poi_fused_convolution_backward_78(in_ptr0, in_ptr1, in_ptr2, out_ptr0, xnumel, XBLOCK : tl.constexpr): | |
xnumel = 3010560 | |
xoffset = tl.program_id(0) * XBLOCK | |
xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
xmask = xindex < xnumel | |
x2 = (xindex // 23520) | |
x3 = xindex % 23520 | |
x1 = (xindex // 196) % 120 | |
x4 = xindex | |
tmp0 = tl.load(in_ptr0 + (x3 + (94080*x2)), None) | |
tmp1 = tl.load(in_ptr1 + (x1), None) | |
tmp2 = tl.load(in_ptr2 + (x1), None) | |
tmp3 = tmp1 * tmp2 | |
tmp4 = tmp0 * tmp3 | |
tmp5 = tmp4.to(tl.float32) | |
tl.store(out_ptr0 + (x4 + tl.zeros([XBLOCK], tl.int32)), tmp5, None) | |
def get_args(): | |
arg_0 = rand_strided((128, 480, 14, 14), (94080, 196, 14, 1), device='cuda:0', dtype=torch.float32) | |
arg_1 = rand_strided((480,), (1,), device='cuda:0', dtype=torch.float32) | |
arg_2 = rand_strided((480,), (1,), device='cuda:0', dtype=torch.float32) | |
arg_3 = rand_strided((128, 120, 14, 14), (23520, 196, 14, 1), device='cuda:0', dtype=torch.float16) | |
return arg_0, arg_1, arg_2, arg_3, | |
def call(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
stream0 = get_cuda_stream(0) | |
triton_poi_fused_convolution_backward_78.run(*args, 3010560, grid=grid(3010560), stream=stream0) | |
def benchmark_all_configs(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
return triton_poi_fused_convolution_backward_78.benchmark_all_configs(*args, 3010560, grid=grid(3010560)) | |
if __name__ == '__main__': | |
from torch._inductor.utils import get_num_bytes | |
from triton.testing import do_bench | |
args = get_args() | |
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9 | |
gb_per_s = num_gb / (ms / 1e3) | |
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
''') | |
# kernel path: /tmp/torchinductor_shunting/ri/crixsdzqwpepdejipqi3ixt4vlyrmrskq5hgkzn7jumpuurao5h5.py | |
# Original ATen: aten._to_copy | |
# aten._to_copy => convert_element_type_517 | |
triton_poi_fused__to_copy_79 = async_compile.triton(''' | |
import triton | |
import triton.language as tl | |
from torch._inductor.ir import ReductionHint | |
from torch._inductor.ir import TileHint | |
from torch._inductor.triton_heuristics import pointwise | |
from torch._inductor.utils import instance_descriptor | |
from torch._dynamo.testing import rand_strided | |
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
import torch | |
from torch._inductor.triton_heuristics import grid | |
@pointwise(size_hints=[2048], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1), equal_to_1=())]}) | |
@triton.jit | |
def triton_poi_fused__to_copy_79(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr): | |
xnumel = 1080 | |
xoffset = tl.program_id(0) * XBLOCK | |
xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
xmask = xindex < xnumel | |
x0 = xindex | |
tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32) | |
tmp1 = tmp0.to(tl.float32) | |
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask) | |
def get_args(): | |
arg_0 = rand_strided((120, 1, 3, 3), (9, 9, 3, 1), device='cuda:0', dtype=torch.float16) | |
arg_1 = rand_strided((120, 1, 3, 3), (9, 9, 3, 1), device='cuda:0', dtype=torch.float32) | |
return arg_0, arg_1, | |
def call(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
stream0 = get_cuda_stream(0) | |
triton_poi_fused__to_copy_79.run(*args, 1080, grid=grid(1080), stream=stream0) | |
def benchmark_all_configs(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
return triton_poi_fused__to_copy_79.benchmark_all_configs(*args, 1080, grid=grid(1080)) | |
if __name__ == '__main__': | |
from torch._inductor.utils import get_num_bytes | |
from triton.testing import do_bench | |
args = get_args() | |
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9 | |
gb_per_s = num_gb / (ms / 1e3) | |
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
''') | |
# kernel path: /tmp/torchinductor_shunting/ty/ctycfmevxfpn542kh5agvjqilg3o5r77udr2h3wyqu23mqhlqdxf.py | |
# Original ATen: aten.cat | |
# aten.cat => cat_49 | |
triton_poi_fused_cat_80 = async_compile.triton(''' | |
import triton | |
import triton.language as tl | |
from torch._inductor.ir import ReductionHint | |
from torch._inductor.ir import TileHint | |
from torch._inductor.triton_heuristics import pointwise | |
from torch._inductor.utils import instance_descriptor | |
from torch._dynamo.testing import rand_strided | |
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
import torch | |
from torch._inductor.triton_heuristics import grid | |
@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]}) | |
@triton.jit | |
def triton_poi_fused_cat_80(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr): | |
xnumel = 3010560 | |
xoffset = tl.program_id(0) * XBLOCK | |
xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
xmask = xindex < xnumel | |
x2 = xindex | |
x0 = xindex % 23520 | |
x1 = (xindex // 23520) | |
tmp0 = tl.load(in_ptr0 + (x2), None).to(tl.float32) | |
tl.store(out_ptr0 + (x0 + (94080*x1) + tl.zeros([XBLOCK], tl.int32)), tmp0, None) | |
def get_args(): | |
arg_0 = rand_strided((128, 120, 14, 14), (23520, 196, 14, 1), device='cuda:0', dtype=torch.float16) | |
arg_1 = rand_strided((128, 120, 14, 14), (94080, 196, 14, 1), device='cuda:0', dtype=torch.float16) | |
return arg_0, arg_1, | |
def call(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
stream0 = get_cuda_stream(0) | |
triton_poi_fused_cat_80.run(*args, 3010560, grid=grid(3010560), stream=stream0) | |
def benchmark_all_configs(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
return triton_poi_fused_cat_80.benchmark_all_configs(*args, 3010560, grid=grid(3010560)) | |
if __name__ == '__main__': | |
from torch._inductor.utils import get_num_bytes | |
from triton.testing import do_bench | |
args = get_args() | |
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9 | |
gb_per_s = num_gb / (ms / 1e3) | |
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
''') | |
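# NOTE (editor): kernel 80 copies a contiguous (128, 120, 14, 14) tensor into a view | |
# whose batch stride is 94080 = 480*196, i.e. one 120-channel window of a shared | |
# (128, 480, 14, 14) buffer -- this is how the aten.cat above is materialized, one | |
# such copy kernel per concatenated input. An eager sketch (the slot offset c0 is | |
# illustrative): | |
#     buf480[:, c0:c0 + 120].copy_(part120) | |
# Kernel 84 further down is the same pattern for an 80-into-160-channel concat. | |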
# kernel path: /tmp/torchinductor_shunting/7i/c7iiiqxahzgxonlzcskqwzfumjrx6cupqeinuz5pkzk66wqnnnyc.py | |
# Original ATen: aten._native_batch_norm_legit_functional, aten.mul, aten.native_batch_norm_backward | |
# aten._native_batch_norm_legit_functional => convert_element_type_284 | |
# aten.mul => mul_669 | |
# aten.native_batch_norm_backward => convert_element_type_518, mul_670, mul_678, sub_138, sum_47, sum_48 | |
triton_red_fused__native_batch_norm_legit_functional_mul_native_batch_norm_backward_81 = async_compile.triton(''' | |
import triton | |
import triton.language as tl | |
from torch._inductor.ir import ReductionHint | |
from torch._inductor.ir import TileHint | |
from torch._inductor.triton_heuristics import reduction | |
from torch._inductor.utils import instance_descriptor | |
from torch._dynamo.testing import rand_strided | |
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
import torch | |
from torch._inductor.triton_heuristics import grid | |
@reduction( | |
size_hints=[512, 32768], | |
reduction_hint=ReductionHint.INNER, | |
filename=__file__, | |
meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp32', 4: '*fp32', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: 'i32', 9: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9), equal_to_1=())]} | |
) | |
@triton.jit | |
def triton_red_fused__native_batch_norm_legit_functional_mul_native_batch_norm_backward_81(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, out_ptr0, out_ptr1, out_ptr2, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr): | |
xnumel = 480 | |
rnumel = 25088 | |
xoffset = tl.program_id(0) * XBLOCK | |
xindex = xoffset + tl.arange(0, XBLOCK)[:, None] | |
xmask = xindex < xnumel | |
rbase = tl.arange(0, RBLOCK)[None, :] | |
x0 = xindex | |
_tmp4 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0 | |
tmp7 = tl.load(in_ptr3 + (x0), xmask) | |
_tmp10 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0 | |
for roffset in range(0, rnumel, RBLOCK): | |
rindex = roffset + rbase | |
rmask = rindex < rnumel | |
r1 = rindex % 196 | |
r2 = (rindex // 196) | |
tmp0 = tl.load(in_ptr0 + (r1 + (196*x0) + (94080*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32) | |
tmp1 = tl.load(in_ptr1 + (r1 + (196*x0) + (94080*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32) | |
tmp5 = tl.load(in_ptr2 + (r1 + (196*x0) + (94080*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32) | |
tmp2 = tmp0 * tmp1 | |
tmp3 = tmp2.to(tl.float32) | |
_tmp4 = tl.where(rmask & xmask, _tmp4 + tmp3, _tmp4) | |
tmp6 = tmp5.to(tl.float32) | |
tmp8 = tmp6 - tmp7 | |
tmp9 = tmp3 * tmp8 | |
_tmp10 = tl.where(rmask & xmask, _tmp10 + tmp9, _tmp10) | |
tmp4 = tl.sum(_tmp4, 1)[:, None] | |
tl.store(out_ptr0 + x0, tmp4, xmask) | |
tmp10 = tl.sum(_tmp10, 1)[:, None] | |
tl.store(out_ptr1 + x0, tmp10, xmask) | |
tmp11 = tl.load(in_ptr4 + (x0), xmask) | |
tmp12 = tmp10 * tmp11 | |
tl.store(out_ptr2 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp12, xmask) | |
def get_args(): | |
arg_0 = rand_strided((128, 480, 14, 14), (94080, 196, 14, 1), device='cuda:0', dtype=torch.float16) | |
arg_1 = rand_strided((128, 480, 14, 14), (94080, 196, 14, 1), device='cuda:0', dtype=torch.float16) | |
arg_2 = rand_strided((128, 480, 14, 14), (94080, 196, 14, 1), device='cuda:0', dtype=torch.float16) | |
arg_3 = rand_strided((1, 480, 1, 1), (480, 1, 1, 1), device='cuda:0', dtype=torch.float32) | |
arg_4 = rand_strided((480,), (1,), device='cuda:0', dtype=torch.float32) | |
arg_5 = rand_strided((480,), (1,), device='cuda:0', dtype=torch.float32) | |
arg_6 = rand_strided((480,), (1,), device='cuda:0', dtype=torch.float32) | |
arg_7 = rand_strided((480,), (1,), device='cuda:0', dtype=torch.float32) | |
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7, | |
def call(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
stream0 = get_cuda_stream(0) | |
triton_red_fused__native_batch_norm_legit_functional_mul_native_batch_norm_backward_81.run(*args, 480, 25088, grid=grid(480), stream=stream0) | |
def benchmark_all_configs(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
return triton_red_fused__native_batch_norm_legit_functional_mul_native_batch_norm_backward_81.benchmark_all_configs(*args, 480, 25088, grid=grid(480)) | |
if __name__ == '__main__': | |
from torch._inductor.utils import get_num_bytes | |
from triton.testing import do_bench | |
args = get_args() | |
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9 | |
gb_per_s = num_gb / (ms / 1e3) | |
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
''') | |
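# NOTE (editor): kernel 81 computes both reduction terms of native_batch_norm_backward | |
# in a single pass: _tmp4 accumulates sum(g) and _tmp10 accumulates sum(g*(x - mean)), | |
# with g = in_ptr0 * in_ptr1 (the aten.mul in the op list). out_ptr2 = sum(g*(x - mean)) | |
# * in_ptr4 is presumably the weight gradient pre-scaled by invstd; fusing the two sums | |
# avoids a second read of the 128x480x14x14 activations. | |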
# kernel path: /tmp/torchinductor_shunting/v4/cv4k7goz3fg6grlbcjwhdwg6ck3vmlxtj3l76l5phqjwc5cao5b3.py | |
# Original ATen: aten._native_batch_norm_legit_functional, aten.mul, aten.native_batch_norm_backward | |
# aten._native_batch_norm_legit_functional => convert_element_type_284 | |
# aten.mul => mul_669 | |
# aten.native_batch_norm_backward => convert_element_type_518, convert_element_type_520, mul_676, mul_677, sub_138, sub_140, sub_141 | |
triton_poi_fused__native_batch_norm_legit_functional_mul_native_batch_norm_backward_82 = async_compile.triton(''' | |
import triton | |
import triton.language as tl | |
from torch._inductor.ir import ReductionHint | |
from torch._inductor.ir import TileHint | |
from torch._inductor.triton_heuristics import pointwise | |
from torch._inductor.utils import instance_descriptor | |
from torch._dynamo.testing import rand_strided | |
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
import torch | |
from torch._inductor.triton_heuristics import grid | |
@pointwise(size_hints=[16777216], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp32', 4: '*fp32', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8), equal_to_1=())]}) | |
@triton.jit | |
def triton_poi_fused__native_batch_norm_legit_functional_mul_native_batch_norm_backward_82(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, in_ptr6, xnumel, XBLOCK : tl.constexpr): | |
xnumel = 12042240 | |
xoffset = tl.program_id(0) * XBLOCK | |
xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
xmask = xindex < xnumel | |
x3 = xindex | |
x1 = (xindex // 196) % 480 | |
tmp0 = tl.load(in_out_ptr0 + (x3), None).to(tl.float32) | |
tmp1 = tl.load(in_ptr0 + (x3), None).to(tl.float32) | |
tmp4 = tl.load(in_ptr1 + (x3), None).to(tl.float32) | |
tmp6 = tl.load(in_ptr2 + (x1), None) | |
tmp8 = tl.load(in_ptr3 + (x1), None) | |
tmp11 = tl.load(in_ptr4 + (x1), None) | |
tmp16 = tl.load(in_ptr5 + (x1), None) | |
tmp19 = tl.load(in_ptr6 + (x1), None) | |
tmp2 = tmp0 * tmp1 | |
tmp3 = tmp2.to(tl.float32) | |
tmp5 = tmp4.to(tl.float32) | |
tmp7 = tmp5 - tmp6 | |
tmp9 = 3.985969387755102e-05 | |
tmp10 = tmp8 * tmp9 | |
tmp12 = tmp11 * tmp11 | |
tmp13 = tmp10 * tmp12 | |
tmp14 = tmp7 * tmp13 | |
tmp15 = tmp3 - tmp14 | |
tmp17 = tmp16 * tmp9 | |
tmp18 = tmp15 - tmp17 | |
tmp20 = tmp11 * tmp19 | |
tmp21 = tmp18 * tmp20 | |
tmp22 = tmp21.to(tl.float32) | |
tl.store(in_out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp22, None) | |
def get_args(): | |
arg_0 = rand_strided((128, 480, 14, 14), (94080, 196, 14, 1), device='cuda:0', dtype=torch.float16) | |
arg_1 = rand_strided((128, 480, 14, 14), (94080, 196, 14, 1), device='cuda:0', dtype=torch.float16) | |
arg_2 = rand_strided((128, 480, 14, 14), (94080, 196, 14, 1), device='cuda:0', dtype=torch.float16) | |
arg_3 = rand_strided((1, 480, 1, 1), (480, 1, 1, 1), device='cuda:0', dtype=torch.float32) | |
arg_4 = rand_strided((480,), (1,), device='cuda:0', dtype=torch.float32) | |
arg_5 = rand_strided((480,), (1,), device='cuda:0', dtype=torch.float32) | |
arg_6 = rand_strided((480,), (1,), device='cuda:0', dtype=torch.float32) | |
arg_7 = rand_strided((480,), (1,), device='cuda:0', dtype=torch.float32) | |
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7, | |
def call(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
stream0 = get_cuda_stream(0) | |
triton_poi_fused__native_batch_norm_legit_functional_mul_native_batch_norm_backward_82.run(*args, 12042240, grid=grid(12042240), stream=stream0) | |
def benchmark_all_configs(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
return triton_poi_fused__native_batch_norm_legit_functional_mul_native_batch_norm_backward_82.benchmark_all_configs(*args, 12042240, grid=grid(12042240)) | |
if __name__ == '__main__': | |
from torch._inductor.utils import get_num_bytes | |
from triton.testing import do_bench | |
args = get_args() | |
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9 | |
gb_per_s = num_gb / (ms / 1e3) | |
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
''') | |
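# NOTE (editor): kernel 82 is the elementwise apply step paired with kernel 81, and | |
# unlike kernel 71 it runs in place: in_out_ptr0 is both the incoming gradient and the | |
# destination (see 'mutated_arg_names' in the meta above), so no second | |
# 128x480x14x14 fp16 buffer (~24 MB) is allocated. Correspondingly, the benchmark | |
# calls get_num_bytes(..., num_in_out_args=1) so that buffer counts as both read and | |
# written in the reported GB/s. | |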
# kernel path: /tmp/torchinductor_shunting/om/comunyk6xj4yo26dm6rnf6zkadn3bzlwf2dwxosduxrcmjkwwa6u.py | |
# Original ATen: aten._to_copy | |
# aten._to_copy => convert_element_type_521 | |
triton_poi_fused__to_copy_83 = async_compile.triton(''' | |
import triton | |
import triton.language as tl | |
from torch._inductor.ir import ReductionHint | |
from torch._inductor.ir import TileHint | |
from torch._inductor.triton_heuristics import pointwise | |
from torch._inductor.utils import instance_descriptor | |
from torch._dynamo.testing import rand_strided | |
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
import torch | |
from torch._inductor.triton_heuristics import grid | |
@pointwise(size_hints=[32768], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]}) | |
@triton.jit | |
def triton_poi_fused__to_copy_83(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr): | |
xnumel = 19200 | |
xoffset = tl.program_id(0) * XBLOCK | |
xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
xmask = xindex < xnumel | |
x0 = xindex | |
tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32) | |
tmp1 = tmp0.to(tl.float32) | |
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask) | |
def get_args(): | |
arg_0 = rand_strided((240, 80, 1, 1), (80, 1, 1, 1), device='cuda:0', dtype=torch.float16) | |
arg_1 = rand_strided((240, 80, 1, 1), (80, 1, 1, 1), device='cuda:0', dtype=torch.float32) | |
return arg_0, arg_1, | |
def call(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
stream0 = get_cuda_stream(0) | |
triton_poi_fused__to_copy_83.run(*args, 19200, grid=grid(19200), stream=stream0) | |
def benchmark_all_configs(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
return triton_poi_fused__to_copy_83.benchmark_all_configs(*args, 19200, grid=grid(19200)) | |
if __name__ == '__main__': | |
from torch._inductor.utils import get_num_bytes | |
from triton.testing import do_bench | |
args = get_args() | |
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9 | |
gb_per_s = num_gb / (ms / 1e3) | |
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
''') | |
# kernel path: /tmp/torchinductor_shunting/2y/c2yommjpqgw2nsvcapep2mob6welg6j7akfq7uc4xaaiuzwdalt7.py | |
# Original ATen: aten.cat | |
# aten.cat => cat_50 | |
triton_poi_fused_cat_84 = async_compile.triton(''' | |
import triton | |
import triton.language as tl | |
from torch._inductor.ir import ReductionHint | |
from torch._inductor.ir import TileHint | |
from torch._inductor.triton_heuristics import pointwise | |
from torch._inductor.utils import instance_descriptor | |
from torch._dynamo.testing import rand_strided | |
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
import torch | |
from torch._inductor.triton_heuristics import grid | |
@pointwise(size_hints=[2097152], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]}) | |
@triton.jit | |
def triton_poi_fused_cat_84(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr): | |
xnumel = 2007040 | |
xoffset = tl.program_id(0) * XBLOCK | |
xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
xmask = xindex < xnumel | |
x2 = xindex | |
x0 = xindex % 15680 | |
x1 = (xindex // 15680) | |
tmp0 = tl.load(in_ptr0 + (x2), None).to(tl.float32) | |
tl.store(out_ptr0 + (x0 + (31360*x1) + tl.zeros([XBLOCK], tl.int32)), tmp0, None) | |
def get_args(): | |
arg_0 = rand_strided((128, 80, 14, 14), (15680, 196, 14, 1), device='cuda:0', dtype=torch.float16) | |
arg_1 = rand_strided((128, 80, 14, 14), (31360, 196, 14, 1), device='cuda:0', dtype=torch.float16) | |
return arg_0, arg_1, | |
def call(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
stream0 = get_cuda_stream(0) | |
triton_poi_fused_cat_84.run(*args, 2007040, grid=grid(2007040), stream=stream0) | |
def benchmark_all_configs(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
return triton_poi_fused_cat_84.benchmark_all_configs(*args, 2007040, grid=grid(2007040)) | |
if __name__ == '__main__': | |
from torch._inductor.utils import get_num_bytes | |
from triton.testing import do_bench | |
args = get_args() | |
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9 | |
gb_per_s = num_gb / (ms / 1e3) | |
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
''') | |
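# NOTE (editor): as anticipated above, kernel 84 is the cat pattern again: an | |
# 80-channel piece written into its window of a (128, 160, 14, 14) buffer | |
# (batch stride 31360 = 160*196). | |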
# kernel path: /tmp/torchinductor_shunting/zy/czyhduwjwdj7bjl64zkqqaz3rymv5xt54uovrwuvmbs3ylr4n3xb.py | |
# Original ATen: aten._native_batch_norm_legit_functional, aten.add, aten.native_batch_norm_backward | |
# aten._native_batch_norm_legit_functional => convert_element_type_280 | |
# aten.add => add_327 | |
# aten.native_batch_norm_backward => convert_element_type_523, mul_679, sub_142, sum_49, sum_50 | |
triton_red_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_85 = async_compile.triton(''' | |
import triton | |
import triton.language as tl | |
from torch._inductor.ir import ReductionHint | |
from torch._inductor.ir import TileHint | |
from torch._inductor.triton_heuristics import reduction | |
from torch._inductor.utils import instance_descriptor | |
from torch._dynamo.testing import rand_strided | |
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
import torch | |
from torch._inductor.triton_heuristics import grid | |
@reduction( | |
size_hints=[1024, 8192], | |
reduction_hint=ReductionHint.INNER, | |
filename=__file__, | |
meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp32', 4: '*fp32', 5: '*fp32', 6: 'i32', 7: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7), equal_to_1=())]} | |
) | |
@triton.jit | |
def triton_red_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_85(in_ptr0, in_ptr1, in_ptr2, in_ptr3, out_ptr0, out_ptr1, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr): | |
xnumel = 640 | |
rnumel = 6272 | |
xoffset = tl.program_id(0) * XBLOCK | |
xindex = xoffset + tl.arange(0, XBLOCK)[:, None] | |
xmask = xindex < xnumel | |
rbase = tl.arange(0, RBLOCK)[None, :] | |
x0 = xindex % 160 | |
x1 = (xindex // 160) | |
_tmp4 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0 | |
x3 = xindex | |
tmp7 = tl.load(in_ptr3 + (x0), xmask) | |
_tmp10 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0 | |
for roffset in range(0, rnumel, RBLOCK): | |
rindex = roffset + rbase | |
rmask = rindex < rnumel | |
r2 = rindex | |
tmp0 = tl.load(in_ptr0 + ((196*x0) + (31360*(r2 // 196)) + (1003520*x1) + (r2 % 196)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32) | |
tmp1 = tl.load(in_ptr1 + ((196*x0) + (31360*(r2 // 196)) + (1003520*x1) + (r2 % 196)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32) | |
tmp5 = tl.load(in_ptr2 + ((196*x0) + (31360*(r2 // 196)) + (1003520*x1) + (r2 % 196)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32) | |
tmp2 = tmp0 + tmp1 | |
tmp3 = tmp2.to(tl.float32) | |
_tmp4 = tl.where(rmask & xmask, _tmp4 + tmp3, _tmp4) | |
tmp6 = tmp5.to(tl.float32) | |
tmp8 = tmp6 - tmp7 | |
tmp9 = tmp3 * tmp8 | |
_tmp10 = tl.where(rmask & xmask, _tmp10 + tmp9, _tmp10) | |
tmp4 = tl.sum(_tmp4, 1)[:, None] | |
tl.store(out_ptr0 + x3, tmp4, xmask) | |
tmp10 = tl.sum(_tmp10, 1)[:, None] | |
tl.store(out_ptr1 + x3, tmp10, xmask) | |
def get_args(): | |
arg_0 = rand_strided((128, 160, 14, 14), (31360, 196, 14, 1), device='cuda:0', dtype=torch.float16) | |
arg_1 = rand_strided((128, 160, 14, 14), (31360, 196, 14, 1), device='cuda:0', dtype=torch.float16) | |
arg_2 = rand_strided((128, 160, 14, 14), (31360, 196, 14, 1), device='cuda:0', dtype=torch.float16) | |
arg_3 = rand_strided((1, 160, 1, 1), (160, 1, 1, 1), device='cuda:0', dtype=torch.float32) | |
arg_4 = rand_strided((160, 4), (1, 160), device='cuda:0', dtype=torch.float32) | |
arg_5 = rand_strided((160, 4), (1, 160), device='cuda:0', dtype=torch.float32) | |
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, | |
def call(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
stream0 = get_cuda_stream(0) | |
triton_red_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_85.run(*args, 640, 6272, grid=grid(640), stream=stream0) | |
def benchmark_all_configs(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
return triton_red_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_85.benchmark_all_configs(*args, 640, 6272, grid=grid(640)) | |
if __name__ == '__main__': | |
from torch._inductor.utils import get_num_bytes | |
from triton.testing import do_bench | |
args = get_args() | |
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9 | |
gb_per_s = num_gb / (ms / 1e3) | |
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
''') | |
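# NOTE (editor): kernel 85 is a *split* reduction: xnumel = 640 = 160 channels x 4 | |
# row blocks, each reducing rnumel = 6272 = 25088/4 elements, with partial sums | |
# written to (160, 4) buffers (note the transposed (1, 160) strides in get_args). | |
# A small follow-up kernel presumably folds the 4 partials per channel; splitting | |
# trades one long reduction for more parallelism across SMs. Here g = in_ptr0 + | |
# in_ptr1 (the aten.add in the op list): two incoming gradient branches are summed | |
# on the fly rather than materialized first. | |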
# kernel path: /tmp/torchinductor_shunting/5n/c5ngqp5cp6ieu5sc7n3ylgpltjkzcel7tc4dwx3ctbhj3ayamet2.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.add, aten.native_batch_norm_backward
# aten._native_batch_norm_legit_functional => convert_element_type_280
# aten.add => add_327
# aten.native_batch_norm_backward => convert_element_type_523, convert_element_type_525, mul_685, mul_686, sub_142, sub_144, sub_145
triton_poi_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_86 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp32', 4: '*fp32', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: '*fp16', 9: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9), equal_to_1=())]})
@triton.jit
def triton_poi_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_86(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, in_ptr6, in_ptr7, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 4014080
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x3 = xindex
    x1 = (xindex // 196) % 160
    tmp0 = tl.load(in_ptr0 + (x3), None).to(tl.float32)
    tmp1 = tl.load(in_ptr1 + (x3), None).to(tl.float32)
    tmp4 = tl.load(in_ptr2 + (x3), None).to(tl.float32)
    tmp6 = tl.load(in_ptr3 + (x1), None)
    tmp8 = tl.load(in_ptr4 + (x1), None)
    tmp11 = tl.load(in_ptr5 + (x1), None)
    tmp16 = tl.load(in_ptr6 + (x1), None)
    tmp19 = tl.load(in_ptr7 + (x1), None)
    tmp2 = tmp0 + tmp1
    tmp3 = tmp2.to(tl.float32)
    tmp5 = tmp4.to(tl.float32)
    tmp7 = tmp5 - tmp6
    tmp9 = 3.985969387755102e-05
    tmp10 = tmp8 * tmp9
    tmp12 = tmp11 * tmp11
    tmp13 = tmp10 * tmp12
    tmp14 = tmp7 * tmp13
    tmp15 = tmp3 - tmp14
    tmp17 = tmp16 * tmp9
    tmp18 = tmp15 - tmp17
    tmp20 = tmp11 * tmp19
    tmp21 = tmp18 * tmp20
    tmp22 = tmp21.to(tl.float32)
    tl.store(out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp22, None)
def get_args():
    arg_0 = rand_strided((128, 160, 14, 14), (31360, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 160, 14, 14), (31360, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_2 = rand_strided((128, 160, 14, 14), (31360, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_3 = rand_strided((1, 160, 1, 1), (160, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    arg_4 = rand_strided((160,), (1,), device='cuda:0', dtype=torch.float32)
    arg_5 = rand_strided((160,), (1,), device='cuda:0', dtype=torch.float32)
    arg_6 = rand_strided((160,), (1,), device='cuda:0', dtype=torch.float32)
    arg_7 = rand_strided((160,), (1,), device='cuda:0', dtype=torch.float32)
    arg_8 = rand_strided((128, 160, 14, 14), (31360, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7, arg_8,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_86.run(*args, 4014080, grid=grid(4014080), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_86.benchmark_all_configs(*args, 4014080, grid=grid(4014080))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
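# Illustrative note (ours, not inductor output): the pointwise kernel above applies
# the standard batch-norm grad_input formula. The constant 3.985969387755102e-05
# is 1/25088 = 1/(N*H*W) for N=128, H=W=14; in_ptr4..in_ptr7 appear to hold
# sum(g*(x-mean)), the saved invstd, sum(g), and the BN weight. A hedged
# eager-mode sketch of the same formula (helper name is our assumption):
import torch
def _sketch_bn_backward_grad_input(g, x, mean, sum_gx, sum_g, invstd, weight):
    # g: summed upstream gradient, (N, C, H, W); per-channel vectors broadcast
    M = x.numel() // x.size(1)            # elements reduced per channel = N*H*W
    c = lambda t: t.reshape(1, -1, 1, 1)  # broadcast helper
    proj = (x.float() - c(mean)) * c(invstd) ** 2 * c(sum_gx) / M
    return ((g.float() - proj - c(sum_g) / M) * c(invstd) * c(weight)).half()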
# kernel path: /tmp/torchinductor_shunting/mi/cmi2g34rcnn4o7hkiq27yemv23mvhmf7ilu2wl6n5igdib4lbn3v.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.add, aten.native_batch_norm_backward
# aten._native_batch_norm_legit_functional => convert_element_type_256
# aten.add => add_327, add_332
# aten.native_batch_norm_backward => convert_element_type_547, mul_719, sub_158, sum_58, sum_59
triton_red_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_87 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@reduction(
    size_hints=[1024, 8192],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp16', 4: '*fp32', 5: '*fp32', 6: '*fp32', 7: 'i32', 8: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_87(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, out_ptr0, out_ptr1, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
    xnumel = 640
    rnumel = 6272
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rbase = tl.arange(0, RBLOCK)[None, :]
    x0 = xindex % 160
    x1 = (xindex // 160)
    _tmp6 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
    x3 = xindex
    tmp9 = tl.load(in_ptr4 + (x0), xmask)
    _tmp12 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
    for roffset in range(0, rnumel, RBLOCK):
        rindex = roffset + rbase
        rmask = rindex < rnumel
        r2 = rindex
        tmp0 = tl.load(in_ptr0 + ((196*x0) + (31360*(r2 // 196)) + (1003520*x1) + (r2 % 196)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp1 = tl.load(in_ptr1 + ((196*x0) + (31360*(r2 // 196)) + (1003520*x1) + (r2 % 196)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp3 = tl.load(in_ptr2 + ((196*x0) + (31360*(r2 // 196)) + (1003520*x1) + (r2 % 196)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp7 = tl.load(in_ptr3 + ((196*x0) + (31360*(r2 // 196)) + (1003520*x1) + (r2 % 196)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp2 = tmp0 + tmp1
        tmp4 = tmp2 + tmp3
        tmp5 = tmp4.to(tl.float32)
        _tmp6 = tl.where(rmask & xmask, _tmp6 + tmp5, _tmp6)
        tmp8 = tmp7.to(tl.float32)
        tmp10 = tmp8 - tmp9
        tmp11 = tmp5 * tmp10
        _tmp12 = tl.where(rmask & xmask, _tmp12 + tmp11, _tmp12)
    tmp6 = tl.sum(_tmp6, 1)[:, None]
    tl.store(out_ptr0 + x3, tmp6, xmask)
    tmp12 = tl.sum(_tmp12, 1)[:, None]
    tl.store(out_ptr1 + x3, tmp12, xmask)
def get_args():
    arg_0 = rand_strided((128, 160, 14, 14), (31360, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 160, 14, 14), (31360, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_2 = rand_strided((128, 160, 14, 14), (31360, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_3 = rand_strided((128, 160, 14, 14), (31360, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_4 = rand_strided((1, 160, 1, 1), (160, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    arg_5 = rand_strided((160, 4), (1, 160), device='cuda:0', dtype=torch.float32)
    arg_6 = rand_strided((160, 4), (1, 160), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_red_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_87.run(*args, 640, 6272, grid=grid(640), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_red_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_87.benchmark_all_configs(*args, 640, 6272, grid=grid(640))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/5m/c5msj6p2vdag4fs3dze5gd2kastkwiprxfqobb4ipbu3zwk6y4tb.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.add, aten.native_batch_norm_backward
# aten._native_batch_norm_legit_functional => convert_element_type_256
# aten.add => add_327, add_332
# aten.native_batch_norm_backward => convert_element_type_547, mul_725, mul_726, sub_158, sub_160, sub_161
triton_poi_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_88 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp16', 4: '*fp32', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: '*fp32', 9: '*fp32', 10: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10), equal_to_1=())]})
@triton.jit
def triton_poi_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_88(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, in_ptr6, in_ptr7, in_ptr8, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 4014080
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x3 = xindex
    x1 = (xindex // 196) % 160
    tmp0 = tl.load(in_ptr0 + (x3), None).to(tl.float32)
    tmp1 = tl.load(in_ptr1 + (x3), None).to(tl.float32)
    tmp3 = tl.load(in_ptr2 + (x3), None).to(tl.float32)
    tmp6 = tl.load(in_ptr3 + (x3), None).to(tl.float32)
    tmp8 = tl.load(in_ptr4 + (x1), None)
    tmp10 = tl.load(in_ptr5 + (x1), None)
    tmp13 = tl.load(in_ptr6 + (x1), None)
    tmp18 = tl.load(in_ptr7 + (x1), None)
    tmp21 = tl.load(in_ptr8 + (x1), None)
    tmp2 = tmp0 + tmp1
    tmp4 = tmp2 + tmp3
    tmp5 = tmp4.to(tl.float32)
    tmp7 = tmp6.to(tl.float32)
    tmp9 = tmp7 - tmp8
    tmp11 = 3.985969387755102e-05
    tmp12 = tmp10 * tmp11
    tmp14 = tmp13 * tmp13
    tmp15 = tmp12 * tmp14
    tmp16 = tmp9 * tmp15
    tmp17 = tmp5 - tmp16
    tmp19 = tmp18 * tmp11
    tmp20 = tmp17 - tmp19
    tmp22 = tmp13 * tmp21
    tmp23 = tmp20 * tmp22
    tl.store(out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp23, None)
def get_args():
    arg_0 = rand_strided((128, 160, 14, 14), (31360, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 160, 14, 14), (31360, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_2 = rand_strided((128, 160, 14, 14), (31360, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_3 = rand_strided((128, 160, 14, 14), (31360, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_4 = rand_strided((1, 160, 1, 1), (160, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    arg_5 = rand_strided((160,), (1,), device='cuda:0', dtype=torch.float32)
    arg_6 = rand_strided((160,), (1,), device='cuda:0', dtype=torch.float32)
    arg_7 = rand_strided((160,), (1,), device='cuda:0', dtype=torch.float32)
    arg_8 = rand_strided((160,), (1,), device='cuda:0', dtype=torch.float32)
    arg_9 = rand_strided((128, 160, 14, 14), (31360, 196, 14, 1), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7, arg_8, arg_9,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_88.run(*args, 4014080, grid=grid(4014080), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_88.benchmark_all_configs(*args, 4014080, grid=grid(4014080))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/33/c33mk7jbf3hbussw7bzc7k6jlfrdr6q6k2jcx5wwhkb43usq3f3k.py
# Original ATen: aten.convolution_backward
# aten.convolution_backward => convolution_backward_56
triton_poi_fused_convolution_backward_89 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[2097152], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_convolution_backward_89(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 2007040
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex % 15680
    x1 = (xindex // 15680)
    x2 = xindex
    tmp0 = tl.load(in_ptr0 + (15680 + x0 + (31360*x1)), None)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp1, None)
def get_args():
    arg_0 = rand_strided((128, 160, 14, 14), (31360, 196, 14, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((128, 80, 14, 14), (15680, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused_convolution_backward_89.run(*args, 2007040, grid=grid(2007040), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused_convolution_backward_89.benchmark_all_configs(*args, 2007040, grid=grid(2007040))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/qs/cqsgmcz5zjvb6ko7kxhkl5io7i4gmvcpy5zxwjwu7qjx5oya6hd3.py
# Original ATen: aten.convolution_backward
# aten.convolution_backward => convolution_backward_57
triton_poi_fused_convolution_backward_90 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[2097152], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_convolution_backward_90(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 2007040
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex % 15680
    x1 = (xindex // 15680)
    x2 = xindex
    tmp0 = tl.load(in_ptr0 + (x0 + (31360*x1)), None)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp1, None)
def get_args():
    arg_0 = rand_strided((128, 160, 14, 14), (31360, 196, 14, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((128, 80, 14, 14), (15680, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused_convolution_backward_90.run(*args, 2007040, grid=grid(2007040), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused_convolution_backward_90.benchmark_all_configs(*args, 2007040, grid=grid(2007040))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
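# Illustrative note (ours, not inductor output): kernels 89 and 90 are the channel
# split feeding convolution_backward of a split (2 x 80 channel) convolution. Each
# copies one 80-channel half of the 160-channel fp32 gradient into a contiguous
# fp16 tensor; kernel 89 takes channels 80..159 (the 15680 = 80*196 element
# offset), kernel 90 takes channels 0..79. A rough eager equivalent (helper name
# is our assumption):
import torch
def _sketch_channel_split_halves(grad):
    # grad: (128, 160, 14, 14) fp32 -> two contiguous (128, 80, 14, 14) fp16
    hi = grad[:, 80:].half().contiguous()   # kernel 89
    lo = grad[:, :80].half().contiguous()   # kernel 90
    return hi, lo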
# kernel path: /tmp/torchinductor_shunting/is/cis35c7i52ytm3zlhduqhkylrq33chwilt3nh2nr2nfagpbn72d4.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.add, aten.native_batch_norm_backward
# aten._native_batch_norm_legit_functional => convert_element_type_232
# aten.add => add_327, add_332, add_337
# aten.native_batch_norm_backward => convert_element_type_571, mul_759, sub_174, sum_67, sum_68
triton_red_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_91 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@reduction(
    size_hints=[1024, 8192],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp16', 4: '*fp16', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: 'i32', 9: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_91(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, out_ptr0, out_ptr1, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
    xnumel = 640
    rnumel = 6272
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rbase = tl.arange(0, RBLOCK)[None, :]
    x0 = xindex % 160
    x1 = (xindex // 160)
    _tmp8 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
    x3 = xindex
    tmp11 = tl.load(in_ptr5 + (x0), xmask)
    _tmp14 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
    for roffset in range(0, rnumel, RBLOCK):
        rindex = roffset + rbase
        rmask = rindex < rnumel
        r2 = rindex
        tmp0 = tl.load(in_ptr0 + ((196*x0) + (31360*(r2 // 196)) + (1003520*x1) + (r2 % 196)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp1 = tl.load(in_ptr1 + ((196*x0) + (31360*(r2 // 196)) + (1003520*x1) + (r2 % 196)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp3 = tl.load(in_ptr2 + ((196*x0) + (31360*(r2 // 196)) + (1003520*x1) + (r2 % 196)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp5 = tl.load(in_ptr3 + ((196*x0) + (31360*(r2 // 196)) + (1003520*x1) + (r2 % 196)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp9 = tl.load(in_ptr4 + ((196*x0) + (31360*(r2 // 196)) + (1003520*x1) + (r2 % 196)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp2 = tmp0 + tmp1
        tmp4 = tmp2 + tmp3
        tmp6 = tmp4 + tmp5
        tmp7 = tmp6.to(tl.float32)
        _tmp8 = tl.where(rmask & xmask, _tmp8 + tmp7, _tmp8)
        tmp10 = tmp9.to(tl.float32)
        tmp12 = tmp10 - tmp11
        tmp13 = tmp7 * tmp12
        _tmp14 = tl.where(rmask & xmask, _tmp14 + tmp13, _tmp14)
    tmp8 = tl.sum(_tmp8, 1)[:, None]
    tl.store(out_ptr0 + x3, tmp8, xmask)
    tmp14 = tl.sum(_tmp14, 1)[:, None]
    tl.store(out_ptr1 + x3, tmp14, xmask)
def get_args():
    arg_0 = rand_strided((128, 160, 14, 14), (31360, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 160, 14, 14), (31360, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_2 = rand_strided((128, 160, 14, 14), (31360, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_3 = rand_strided((128, 160, 14, 14), (31360, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_4 = rand_strided((128, 160, 14, 14), (31360, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_5 = rand_strided((1, 160, 1, 1), (160, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    arg_6 = rand_strided((160, 4), (1, 160), device='cuda:0', dtype=torch.float32)
    arg_7 = rand_strided((160, 4), (1, 160), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_red_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_91.run(*args, 640, 6272, grid=grid(640), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_red_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_91.benchmark_all_configs(*args, 640, 6272, grid=grid(640))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/lj/cljkpj67jhpiomhay7l2fuqfmjz2fox7ndvjbixox242vrhfypjf.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.add, aten.convolution_backward, aten.native_batch_norm_backward
# aten._native_batch_norm_legit_functional => convert_element_type_232
# aten.add => add_327, add_332, add_337
# aten.convolution_backward => convolution_backward_66
# aten.native_batch_norm_backward => convert_element_type_571, convert_element_type_573, mul_765, mul_766, sub_174, sub_176, sub_177
triton_poi_fused__native_batch_norm_legit_functional_add_convolution_backward_native_batch_norm_backward_92 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp16', 4: '*fp16', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: '*fp32', 9: '*fp32', 10: '*fp16', 11: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11), equal_to_1=())]})
@triton.jit
def triton_poi_fused__native_batch_norm_legit_functional_add_convolution_backward_native_batch_norm_backward_92(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, in_ptr6, in_ptr7, in_ptr8, in_ptr9, out_ptr1, xnumel, XBLOCK : tl.constexpr):
    xnumel = 4014080
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x3 = xindex
    x1 = (xindex // 196) % 160
    tmp0 = tl.load(in_ptr0 + (x3), None).to(tl.float32)
    tmp1 = tl.load(in_ptr1 + (x3), None).to(tl.float32)
    tmp3 = tl.load(in_ptr2 + (x3), None).to(tl.float32)
    tmp5 = tl.load(in_ptr3 + (x3), None).to(tl.float32)
    tmp8 = tl.load(in_ptr4 + (x3), None).to(tl.float32)
    tmp10 = tl.load(in_ptr5 + (x1), None)
    tmp12 = tl.load(in_ptr6 + (x1), None)
    tmp15 = tl.load(in_ptr7 + (x1), None)
    tmp20 = tl.load(in_ptr8 + (x1), None)
    tmp23 = tl.load(in_ptr9 + (x1), None)
    tmp2 = tmp0 + tmp1
    tmp4 = tmp2 + tmp3
    tmp6 = tmp4 + tmp5
    tmp7 = tmp6.to(tl.float32)
    tmp9 = tmp8.to(tl.float32)
    tmp11 = tmp9 - tmp10
    tmp13 = 3.985969387755102e-05
    tmp14 = tmp12 * tmp13
    tmp16 = tmp15 * tmp15
    tmp17 = tmp14 * tmp16
    tmp18 = tmp11 * tmp17
    tmp19 = tmp7 - tmp18
    tmp21 = tmp20 * tmp13
    tmp22 = tmp19 - tmp21
    tmp24 = tmp15 * tmp23
    tmp25 = tmp22 * tmp24
    tmp26 = tmp25.to(tl.float32)
    tl.store(out_ptr1 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp26, None)
def get_args():
    arg_0 = rand_strided((128, 160, 14, 14), (31360, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 160, 14, 14), (31360, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_2 = rand_strided((128, 160, 14, 14), (31360, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_3 = rand_strided((128, 160, 14, 14), (31360, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_4 = rand_strided((128, 160, 14, 14), (31360, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_5 = rand_strided((1, 160, 1, 1), (160, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    arg_6 = rand_strided((160,), (1,), device='cuda:0', dtype=torch.float32)
    arg_7 = rand_strided((160,), (1,), device='cuda:0', dtype=torch.float32)
    arg_8 = rand_strided((160,), (1,), device='cuda:0', dtype=torch.float32)
    arg_9 = rand_strided((160,), (1,), device='cuda:0', dtype=torch.float32)
    arg_10 = rand_strided((128, 160, 14, 14), (31360, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7, arg_8, arg_9, arg_10,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__native_batch_norm_legit_functional_add_convolution_backward_native_batch_norm_backward_92.run(*args, 4014080, grid=grid(4014080), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__native_batch_norm_legit_functional_add_convolution_backward_native_batch_norm_backward_92.benchmark_all_configs(*args, 4014080, grid=grid(4014080))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/af/cafjjp3a3kwtimi2i4kv3zpwxjdfuxdqkq7szn2vypz2ywexppw7.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_574
triton_poi_fused__to_copy_93 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[131072], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_93(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 99840
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
    arg_0 = rand_strided((160, 624, 1, 1), (624, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((160, 624, 1, 1), (624, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_93.run(*args, 99840, grid=grid(99840), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_93.benchmark_all_configs(*args, 99840, grid=grid(99840))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
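# Illustrative note (ours, not inductor output): the cast kernel above materializes
# an fp32 copy of a (160, 624, 1, 1) fp16 weight before it is handed to
# convolution_backward. In eager terms it is simply:
import torch
def _sketch_weight_to_fp32(w16):
    # equivalent to aten._to_copy with dtype=torch.float32
    return w16.float()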
# kernel path: /tmp/torchinductor_shunting/ic/cicgaxe2oqufzttvfjscmbopri2uxa3mejzwrav4jbahpmbb4rby.py
# Original ATen: aten.mul, aten.sigmoid, aten.sigmoid_backward, aten.silu, aten.sum
# aten.mul => mul_768
# aten.sigmoid => sigmoid_35
# aten.sigmoid_backward => convert_element_type_575, convert_element_type_576, convert_element_type_577, mul_770, mul_771, sub_178
# aten.silu => convert_element_type_223, convert_element_type_224, mul_278, sigmoid_33
# aten.sum => sum_69
triton_per_fused_mul_sigmoid_sigmoid_backward_silu_sum_94 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import persistent_reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@persistent_reduction(
    size_hints=[131072, 256],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp16', 4: 'i32', 5: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4), equal_to_1=())]}
)
@triton.jit
def triton_per_fused_mul_sigmoid_sigmoid_backward_silu_sum_94(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, xnumel, rnumel, XBLOCK : tl.constexpr):
    xnumel = 79872
    rnumel = 196
    RBLOCK: tl.constexpr = 256
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rindex = tl.arange(0, RBLOCK)[None, :]
    rmask = rindex < rnumel
    r1 = rindex
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (r1 + (196*x0)), rmask, other=0).to(tl.float32)
    tmp1 = tl.load(in_ptr1 + (r1 + (196*x0)), rmask, other=0).to(tl.float32)
    tmp11 = tl.load(in_ptr2 + (x0), None).to(tl.float32)
    tmp2 = tmp1.to(tl.float32)
    tmp3 = tl.sigmoid(tmp2)
    tmp4 = tmp2 * tmp3
    tmp5 = tmp4.to(tl.float32)
    tmp6 = tmp0 * tmp5
    tmp8 = tl.where(rmask, tmp6, 0)
    tmp9 = tl.sum(tmp8, 1)[:, None]
    tmp10 = tmp9.to(tl.float32)
    tmp12 = tl.sigmoid(tmp11)
    tmp13 = tmp12.to(tl.float32)
    tmp14 = 1.0
    tmp15 = tmp14 - tmp13
    tmp16 = tmp13 * tmp15
    tmp17 = tmp10 * tmp16
    tmp18 = tmp17.to(tl.float32)
    tl.store(in_out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp18, None)
def get_args():
    arg_0 = rand_strided((128, 624, 1, 1), (624, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_2 = rand_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_3 = rand_strided((128, 624, 1, 1), (624, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1, arg_2, arg_3,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_per_fused_mul_sigmoid_sigmoid_backward_silu_sum_94.run(*args, 79872, 196, grid=grid(79872), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_per_fused_mul_sigmoid_sigmoid_backward_silu_sum_94.benchmark_all_configs(*args, 79872, 196, grid=grid(79872))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
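# Illustrative note (ours, not inductor output): the persistent reduction above is
# the squeeze-and-excite gate backward. Per (image, channel) it reduces
# grad_out * silu(pre_act) over the 196 spatial positions, then multiplies by
# sigmoid'(gate) = sigmoid(gate) * (1 - sigmoid(gate)). Eager sketch (helper
# name and argument names are our assumptions):
import torch
def _sketch_se_gate_backward(grad_out, pre_act, gate_logits):
    # grad_out, pre_act: (N, C, 14, 14) fp16; gate_logits: (N, C, 1, 1) fp16
    s = (grad_out.float() * torch.nn.functional.silu(pre_act.float())).sum(
        dim=(2, 3), keepdim=True)
    sig = torch.sigmoid(gate_logits.float())
    return (s * sig * (1 - sig)).half()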
# kernel path: /tmp/torchinductor_shunting/sw/cswrgzqc55nc76ttfypz7ruvqwdkjksusjytknavxbcop6g2otp2.py
# Original ATen: aten._to_copy, aten.convolution_backward
# aten._to_copy => convert_element_type_579
# aten.convolution_backward => sum_70
triton_per_fused__to_copy_convolution_backward_95 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import persistent_reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@persistent_reduction(
    size_hints=[1024, 128],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32', 3: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3), equal_to_1=())]}
)
@triton.jit
def triton_per_fused__to_copy_convolution_backward_95(in_ptr0, out_ptr1, xnumel, rnumel, XBLOCK : tl.constexpr):
    xnumel = 624
    rnumel = 128
    RBLOCK: tl.constexpr = 128
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rindex = tl.arange(0, RBLOCK)[None, :]
    rmask = rindex < rnumel
    r1 = rindex
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0 + (624*r1)), rmask & xmask, other=0).to(tl.float32)
    tmp2 = tl.where(rmask & xmask, tmp0, 0)
    tmp3 = tl.sum(tmp2, 1)[:, None]
    tmp4 = tmp3.to(tl.float32)
    tl.store(out_ptr1 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp4, xmask)
def get_args():
    arg_0 = rand_strided((128, 624, 1, 1), (624, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((624,), (1,), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_per_fused__to_copy_convolution_backward_95.run(*args, 624, 128, grid=grid(624), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_per_fused__to_copy_convolution_backward_95.benchmark_all_configs(*args, 624, 128, grid=grid(624))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
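# Illustrative note (ours, not inductor output): this reduction is the
# bias-gradient piece of convolution_backward for a 1x1 conv in the SE block;
# it sums the (128, 624, 1, 1) fp16 gradient over the batch and casts to fp32:
import torch
def _sketch_conv_bias_grad(grad):
    return grad.float().sum(dim=(0, 2, 3))   # -> (624,) fp32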
# kernel path: /tmp/torchinductor_shunting/rk/crkgwzo3s6rzbbnalsvhwyanngv4n2zgzifhbhpcnc5ysqffl6ac.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_578
triton_poi_fused__to_copy_96 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[32768], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_96(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 32448
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
    arg_0 = rand_strided((624, 52, 1, 1), (52, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((624, 52, 1, 1), (52, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_96.run(*args, 32448, grid=grid(32448), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_96.benchmark_all_configs(*args, 32448, grid=grid(32448))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/wf/cwfpzmbbmgs237i4rn23twa6vqls763iyjrp76njdwwsj3et2uaf.py
# Original ATen: aten.add, aten.clone, aten.fill, aten.mul, aten.sigmoid, aten.sub
# aten.add => add_338
# aten.clone => clone_26
# aten.fill => full_like_21
# aten.mul => mul_772, mul_773, mul_774
# aten.sigmoid => sigmoid_85
# aten.sub => sub_179
triton_poi_fused_add_clone_fill_mul_sigmoid_sub_97 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[8192], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_add_clone_fill_mul_sigmoid_sub_97(in_out_ptr0, in_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 6656
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_out_ptr0 + (x0), xmask).to(tl.float32)
    tmp1 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
    tmp2 = tl.sigmoid(tmp1)
    tmp3 = 1.0
    tmp4 = tmp3 - tmp2
    tmp5 = tmp1 * tmp4
    tmp6 = tmp5 + tmp3
    tmp7 = tmp2 * tmp6
    tmp8 = tmp0 * tmp7
    tl.store(in_out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp8, xmask)
def get_args():
    arg_0 = rand_strided((128, 52, 1, 1), (52, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 52, 1, 1), (52, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused_add_clone_fill_mul_sigmoid_sub_97.run(*args, 6656, grid=grid(6656), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused_add_clone_fill_mul_sigmoid_sub_97.benchmark_all_configs(*args, 6656, grid=grid(6656))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
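# Illustrative note (ours, not inductor output): the pointwise kernel above is the
# SiLU backward for the SE bottleneck activation, applied in place to the
# (128, 52, 1, 1) gradient: grad_in = grad_out * sigmoid(x) * (1 + x * (1 -
# sigmoid(x))). Eager sketch (helper name is our assumption):
import torch
def _sketch_silu_backward(grad_out, x):
    sig = torch.sigmoid(x)
    return grad_out * sig * (1 + x * (1 - sig))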
# kernel path: /tmp/torchinductor_shunting/xt/cxt55nhjkmnbxqvfatybarolu6uxihwoo3cct34gda356pn7scly.py
# Original ATen: aten._to_copy, aten.convolution_backward
# aten._to_copy => convert_element_type_581
# aten.convolution_backward => sum_71
triton_per_fused__to_copy_convolution_backward_98 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import persistent_reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@persistent_reduction(
    size_hints=[64, 128],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32', 3: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 3), equal_to_1=())]}
)
@triton.jit
def triton_per_fused__to_copy_convolution_backward_98(in_ptr0, out_ptr1, xnumel, rnumel, XBLOCK : tl.constexpr):
    xnumel = 52
    rnumel = 128
    RBLOCK: tl.constexpr = 128
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rindex = tl.arange(0, RBLOCK)[None, :]
    rmask = rindex < rnumel
    r1 = rindex
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0 + (52*r1)), rmask & xmask, other=0).to(tl.float32)
    tmp2 = tl.where(rmask & xmask, tmp0, 0)
    tmp3 = tl.sum(tmp2, 1)[:, None]
    tmp4 = tmp3.to(tl.float32)
    tl.store(out_ptr1 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp4, xmask)
def get_args():
    arg_0 = rand_strided((128, 52, 1, 1), (52, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((52,), (1,), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_per_fused__to_copy_convolution_backward_98.run(*args, 52, 128, grid=grid(52), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_per_fused__to_copy_convolution_backward_98.benchmark_all_configs(*args, 52, 128, grid=grid(52))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/tw/ctwfvbwslvyki3iv4byjhnpgwt4ujawez7njntlfraulnuse3ddu.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_580
triton_poi_fused__to_copy_99 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[32768], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_99(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 32448
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
    arg_0 = rand_strided((52, 624, 1, 1), (624, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((52, 624, 1, 1), (624, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_99.run(*args, 32448, grid=grid(32448), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_99.benchmark_all_configs(*args, 32448, grid=grid(32448))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/6w/c6wzolzrfmxblwjxnpxkgqezmro2wot3nkz62hajpunjmpkn6enk.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.add, aten.clone, aten.div, aten.fill, aten.mul, aten.native_batch_norm_backward, aten.sigmoid, aten.sub
# aten._native_batch_norm_legit_functional => convert_element_type_221
# aten.add => add_339, add_340
# aten.clone => clone_25
# aten.div => div_8
# aten.fill => full_like_22
# aten.mul => mul_769, mul_775, mul_776, mul_777
# aten.native_batch_norm_backward => convert_element_type_582, mul_778, mul_786, sub_181, sum_72, sum_73
# aten.sigmoid => sigmoid_35, sigmoid_86
# aten.sub => sub_180
triton_red_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_100 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@reduction(
    size_hints=[1024, 32768],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp16', 4: '*fp16', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: '*fp32', 9: '*fp32', 10: 'i32', 11: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_100(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, in_ptr6, out_ptr0, out_ptr1, out_ptr2, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
    xnumel = 624
    rnumel = 25088
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rbase = tl.arange(0, RBLOCK)[None, :]
    x0 = xindex
    _tmp17 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
    tmp20 = tl.load(in_ptr5 + (x0), xmask)
    _tmp23 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
    for roffset in range(0, rnumel, RBLOCK):
        rindex = roffset + rbase
        rmask = rindex < rnumel
        r1 = rindex % 196
        r2 = (rindex // 196)
        tmp0 = tl.load(in_ptr0 + (r1 + (196*x0) + (122304*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp1 = tl.load(in_ptr1 + (x0 + (624*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp4 = tl.load(in_ptr2 + (x0 + (624*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp8 = tl.load(in_ptr3 + (r1 + (196*x0) + (122304*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp18 = tl.load(in_ptr4 + (r1 + (196*x0) + (122304*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp2 = tl.sigmoid(tmp1)
        tmp3 = tmp0 * tmp2
        tmp5 = 196.0
        tmp6 = tmp4 / tmp5
        tmp7 = tmp3 + tmp6
        tmp9 = tl.sigmoid(tmp8)
        tmp10 = 1.0
        tmp11 = tmp10 - tmp9
        tmp12 = tmp8 * tmp11
        tmp13 = tmp12 + tmp10
        tmp14 = tmp9 * tmp13
        tmp15 = tmp7 * tmp14
        tmp16 = tmp15.to(tl.float32)
        _tmp17 = tl.where(rmask & xmask, _tmp17 + tmp16, _tmp17)
        tmp19 = tmp18.to(tl.float32)
        tmp21 = tmp19 - tmp20
        tmp22 = tmp16 * tmp21
        _tmp23 = tl.where(rmask & xmask, _tmp23 + tmp22, _tmp23)
    tmp17 = tl.sum(_tmp17, 1)[:, None]
    tl.store(out_ptr0 + x0, tmp17, xmask)
    tmp23 = tl.sum(_tmp23, 1)[:, None]
    tl.store(out_ptr1 + x0, tmp23, xmask)
    tmp24 = tl.load(in_ptr6 + (x0), xmask)
    tmp25 = tmp23 * tmp24
    tl.store(out_ptr2 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp25, xmask)
def get_args():
    arg_0 = rand_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 624, 1, 1), (624, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    arg_2 = rand_strided((128, 624, 1, 1), (624, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    arg_3 = rand_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_4 = rand_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_5 = rand_strided((1, 624, 1, 1), (624, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    arg_6 = rand_strided((624,), (1,), device='cuda:0', dtype=torch.float32)
    arg_7 = rand_strided((624,), (1,), device='cuda:0', dtype=torch.float32)
    arg_8 = rand_strided((624,), (1,), device='cuda:0', dtype=torch.float32)
    arg_9 = rand_strided((624,), (1,), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7, arg_8, arg_9,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_red_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_100.run(*args, 624, 25088, grid=grid(624), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_red_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_100.benchmark_all_configs(*args, 624, 25088, grid=grid(624))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
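# Illustrative note (ours, not inductor output): the reduction above fuses the SE
# re-scale backward with the batch-norm backward sums for the 624-channel tensor.
# Per element it reconstructs the gradient flowing into the main branch
# (grad_out * sigmoid(gate) plus the average-pool gradient divided by 196),
# pushes it through SiLU backward, then reduces sum(g) and sum(g * (x - mean));
# out_ptr2 additionally pre-multiplies the latter by in_ptr6 (which we take to
# be the saved invstd). A hedged eager-mode sketch (names are our assumptions):
import torch
def _sketch_se_bn_backward_sums(grad_out, gate, gate_grad, pre_act, x, mean, invstd):
    g = grad_out.float() * torch.sigmoid(gate.float()) + gate_grad.float() / 196.0
    sig = torch.sigmoid(pre_act.float())
    g = g * sig * (1 + pre_act.float() * (1 - sig))       # SiLU backward
    sum_g = g.sum(dim=(0, 2, 3))                          # -> out_ptr0
    sum_gx = (g * (x.float() - mean)).sum(dim=(0, 2, 3))  # -> out_ptr1
    return sum_g, sum_gx, sum_gx * invstd                 # -> out_ptr2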
# kernel path: /tmp/torchinductor_shunting/d3/cd3muetrknv5a272355llgd7oce4qcl4nznkklo4quwqquswhe2k.py | |
# Original ATen: aten._native_batch_norm_legit_functional, aten.add, aten.clone, aten.convolution_backward, aten.div, aten.fill, aten.mul, aten.native_batch_norm_backward, aten.sigmoid, aten.sub | |
# aten._native_batch_norm_legit_functional => convert_element_type_221 | |
# aten.add => add_339, add_340 | |
# aten.clone => clone_25 | |
# aten.convolution_backward => convolution_backward_69 | |
# aten.div => div_8 | |
# aten.fill => full_like_22 | |
# aten.mul => mul_769, mul_775, mul_776, mul_777 | |
# aten.native_batch_norm_backward => convert_element_type_582, convert_element_type_584, mul_784, mul_785, sub_181, sub_183, sub_184 | |
# aten.sigmoid => sigmoid_35, sigmoid_86 | |
# aten.sub => sub_180 | |
triton_poi_fused__native_batch_norm_legit_functional_add_clone_convolution_backward_div_fill_mul_native_batch_norm_backward_sigmoid_sub_101 = async_compile.triton(''' | |
import triton | |
import triton.language as tl | |
from torch._inductor.ir import ReductionHint | |
from torch._inductor.ir import TileHint | |
from torch._inductor.triton_heuristics import pointwise | |
from torch._inductor.utils import instance_descriptor | |
from torch._dynamo.testing import rand_strided | |
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
import torch | |
from torch._inductor.triton_heuristics import grid | |
@pointwise(size_hints=[16777216], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp16', 4: '*fp16', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: '*fp32', 9: '*fp32', 10: '*fp16', 11: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11), equal_to_1=())]}) | |
@triton.jit | |
def triton_poi_fused__native_batch_norm_legit_functional_add_clone_convolution_backward_div_fill_mul_native_batch_norm_backward_sigmoid_sub_101(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, in_ptr6, in_ptr7, in_ptr8, in_ptr9, out_ptr1, xnumel, XBLOCK : tl.constexpr): | |
xnumel = 15654912 | |
xoffset = tl.program_id(0) * XBLOCK | |
xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
xmask = xindex < xnumel | |
x3 = xindex | |
x4 = (xindex // 196) | |
x1 = (xindex // 196) % 624 | |
tmp0 = tl.load(in_ptr0 + (x3), None).to(tl.float32) | |
tmp1 = tl.load(in_ptr1 + (x4), None).to(tl.float32) | |
tmp4 = tl.load(in_ptr2 + (x4), None).to(tl.float32) | |
tmp8 = tl.load(in_ptr3 + (x3), None).to(tl.float32) | |
tmp17 = tl.load(in_ptr4 + (x3), None).to(tl.float32) | |
tmp19 = tl.load(in_ptr5 + (x1), None) | |
tmp21 = tl.load(in_ptr6 + (x1), None) | |
tmp24 = tl.load(in_ptr7 + (x1), None) | |
tmp29 = tl.load(in_ptr8 + (x1), None) | |
tmp32 = tl.load(in_ptr9 + (x1), None) | |
tmp2 = tl.sigmoid(tmp1) | |
tmp3 = tmp0 * tmp2 | |
tmp5 = 196.0 | |
tmp6 = tmp4 / tmp5 | |
tmp7 = tmp3 + tmp6 | |
tmp9 = tl.sigmoid(tmp8) | |
tmp10 = 1.0 | |
tmp11 = tmp10 - tmp9 | |
tmp12 = tmp8 * tmp11 | |
tmp13 = tmp12 + tmp10 | |
tmp14 = tmp9 * tmp13 | |
tmp15 = tmp7 * tmp14 | |
tmp16 = tmp15.to(tl.float32) | |
tmp18 = tmp17.to(tl.float32) | |
tmp20 = tmp18 - tmp19 | |
tmp22 = 3.985969387755102e-05 | |
tmp23 = tmp21 * tmp22 | |
tmp25 = tmp24 * tmp24 | |
tmp26 = tmp23 * tmp25 | |
tmp27 = tmp20 * tmp26 | |
tmp28 = tmp16 - tmp27 | |
tmp30 = tmp29 * tmp22 | |
tmp31 = tmp28 - tmp30 | |
tmp33 = tmp24 * tmp32 | |
tmp34 = tmp31 * tmp33 | |
tmp35 = tmp34.to(tl.float32) | |
tl.store(out_ptr1 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp35, None) | |
def get_args(): | |
arg_0 = rand_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda:0', dtype=torch.float16) | |
arg_1 = rand_strided((128, 624, 1, 1), (624, 1, 1, 1), device='cuda:0', dtype=torch.float16) | |
arg_2 = rand_strided((128, 624, 1, 1), (624, 1, 1, 1), device='cuda:0', dtype=torch.float16) | |
arg_3 = rand_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda:0', dtype=torch.float16) | |
arg_4 = rand_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda:0', dtype=torch.float16) | |
arg_5 = rand_strided((1, 624, 1, 1), (624, 1, 1, 1), device='cuda:0', dtype=torch.float32) | |
arg_6 = rand_strided((624,), (1,), device='cuda:0', dtype=torch.float32) | |
arg_7 = rand_strided((624,), (1,), device='cuda:0', dtype=torch.float32) | |
arg_8 = rand_strided((624,), (1,), device='cuda:0', dtype=torch.float32) | |
arg_9 = rand_strided((624,), (1,), device='cuda:0', dtype=torch.float32) | |
arg_10 = rand_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda:0', dtype=torch.float16) | |
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7, arg_8, arg_9, arg_10, | |
def call(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
stream0 = get_cuda_stream(0) | |
triton_poi_fused__native_batch_norm_legit_functional_add_clone_convolution_backward_div_fill_mul_native_batch_norm_backward_sigmoid_sub_101.run(*args, 15654912, grid=grid(15654912), stream=stream0) | |
def benchmark_all_configs(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
return triton_poi_fused__native_batch_norm_legit_functional_add_clone_convolution_backward_div_fill_mul_native_batch_norm_backward_sigmoid_sub_101.benchmark_all_configs(*args, 15654912, grid=grid(15654912)) | |
if __name__ == '__main__': | |
from torch._inductor.utils import get_num_bytes | |
from triton.testing import do_bench | |
args = get_args() | |
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9 | |
gb_per_s = num_gb / (ms / 1e3) | |
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
''') | |
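# NOTE (hand-written sketch, not generated code): the pointwise kernel above
# chains four backward steps over the (128, 624, 14, 14) activation: the SE
# gate (grad * sigmoid of the gate logits), the 1/196 average-pool backward,
# the SiLU derivative sigmoid(x)*(1 + x*(1 - sigmoid(x))), and the
# input-gradient half of batch-norm backward. The constant
# 3.985969387755102e-05 is 1/25088 = 1/(N*H*W) for N=128, H=W=14. An
# eager-mode sketch of the same math, assuming in_ptr5..in_ptr9 hold the saved
# mean, sum((x-mean)*dy), invstd, sum(dy), and weight (names hypothetical):
def _fused_bn_silu_backward_sketch(g, gate, gate_grad, pre_act, bn_in,
                                   mean, sum_dy_xmu, invstd, sum_dy, weight):
    n_inv = 1.0 / 25088.0                        # 1/(N*H*W)
    bc = lambda t: t.reshape(1, -1, 1, 1)        # broadcast (C,) over NCHW
    s = torch.sigmoid(pre_act.float())
    silu_grad = s * (1 + pre_act.float() * (1 - s))
    dy = (g.float() * torch.sigmoid(gate.float())
          + gate_grad.float() / 196.0) * silu_grad
    proj = (bn_in.float() - bc(mean)) * bc(sum_dy_xmu * n_inv * invstd * invstd)
    return ((dy - proj - bc(sum_dy * n_inv)) * bc(invstd * weight)).half()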
# kernel path: /tmp/torchinductor_shunting/23/c23cf4zcm2zsqnix7cwevl5uqnq6e7k4nrleu5hxvvucflqj7ztk.py | |
# Original ATen: aten._to_copy | |
# aten._to_copy => convert_element_type_585 | |
triton_poi_fused__to_copy_102 = async_compile.triton(''' | |
import triton | |
import triton.language as tl | |
from torch._inductor.ir import ReductionHint | |
from torch._inductor.ir import TileHint | |
from torch._inductor.triton_heuristics import pointwise | |
from torch._inductor.utils import instance_descriptor | |
from torch._dynamo.testing import rand_strided | |
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
import torch | |
from torch._inductor.triton_heuristics import grid | |
@pointwise(size_hints=[8192], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]}) | |
@triton.jit | |
def triton_poi_fused__to_copy_102(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr): | |
xnumel = 5616 | |
xoffset = tl.program_id(0) * XBLOCK | |
xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
xmask = xindex < xnumel | |
x0 = xindex | |
tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32) | |
tmp1 = tmp0.to(tl.float32) | |
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask) | |
def get_args(): | |
arg_0 = rand_strided((624, 1, 3, 3), (9, 9, 3, 1), device='cuda:0', dtype=torch.float16) | |
arg_1 = rand_strided((624, 1, 3, 3), (9, 9, 3, 1), device='cuda:0', dtype=torch.float32) | |
return arg_0, arg_1, | |
def call(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
stream0 = get_cuda_stream(0) | |
triton_poi_fused__to_copy_102.run(*args, 5616, grid=grid(5616), stream=stream0) | |
def benchmark_all_configs(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
return triton_poi_fused__to_copy_102.benchmark_all_configs(*args, 5616, grid=grid(5616)) | |
if __name__ == '__main__': | |
from torch._inductor.utils import get_num_bytes | |
from triton.testing import do_bench | |
args = get_args() | |
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9 | |
gb_per_s = num_gb / (ms / 1e3) | |
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
''') | |
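# NOTE (hand-written sketch, not generated code): the *_to_copy kernels in
# this file (102, 105, 110, 112, 115, 118, ...) are all the same element-wise
# fp16 -> fp32 upcast of a weight-gradient buffer, differing only in numel.
# Eager equivalent:
def _to_copy_sketch(src):
    # src: contiguous fp16 CUDA tensor -> fresh fp32 tensor, same layout
    return src.to(torch.float32)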
# kernel path: /tmp/torchinductor_shunting/xe/cxe5wowu7imk523mjj4gviz3cjwfkjjg7yenewa23cv4xlyi265g.py | |
# Original ATen: aten._native_batch_norm_legit_functional, aten.mul, aten.native_batch_norm_backward | |
# aten._native_batch_norm_legit_functional => convert_element_type_216 | |
# aten.mul => mul_789 | |
# aten.native_batch_norm_backward => convert_element_type_586, mul_790, mul_798, sub_186, sum_74, sum_75 | |
triton_red_fused__native_batch_norm_legit_functional_mul_native_batch_norm_backward_103 = async_compile.triton(''' | |
import triton | |
import triton.language as tl | |
from torch._inductor.ir import ReductionHint | |
from torch._inductor.ir import TileHint | |
from torch._inductor.triton_heuristics import reduction | |
from torch._inductor.utils import instance_descriptor | |
from torch._dynamo.testing import rand_strided | |
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
import torch | |
from torch._inductor.triton_heuristics import grid | |
@reduction( | |
size_hints=[1024, 32768], | |
reduction_hint=ReductionHint.INNER, | |
filename=__file__, | |
meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp32', 4: '*fp32', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: 'i32', 9: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9), equal_to_1=())]} | |
) | |
@triton.jit | |
def triton_red_fused__native_batch_norm_legit_functional_mul_native_batch_norm_backward_103(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, out_ptr0, out_ptr1, out_ptr2, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr): | |
xnumel = 624 | |
rnumel = 25088 | |
xoffset = tl.program_id(0) * XBLOCK | |
xindex = xoffset + tl.arange(0, XBLOCK)[:, None] | |
xmask = xindex < xnumel | |
rbase = tl.arange(0, RBLOCK)[None, :] | |
x0 = xindex | |
_tmp4 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0 | |
tmp7 = tl.load(in_ptr3 + (x0), xmask) | |
_tmp10 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0 | |
for roffset in range(0, rnumel, RBLOCK): | |
rindex = roffset + rbase | |
rmask = rindex < rnumel | |
r1 = rindex % 196 | |
r2 = (rindex // 196) | |
tmp0 = tl.load(in_ptr0 + (r1 + (196*x0) + (122304*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32) | |
tmp1 = tl.load(in_ptr1 + (r1 + (196*x0) + (122304*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32) | |
tmp5 = tl.load(in_ptr2 + (r1 + (196*x0) + (122304*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32) | |
tmp2 = tmp0 * tmp1 | |
tmp3 = tmp2.to(tl.float32) | |
_tmp4 = tl.where(rmask & xmask, _tmp4 + tmp3, _tmp4) | |
tmp6 = tmp5.to(tl.float32) | |
tmp8 = tmp6 - tmp7 | |
tmp9 = tmp3 * tmp8 | |
_tmp10 = tl.where(rmask & xmask, _tmp10 + tmp9, _tmp10) | |
tmp4 = tl.sum(_tmp4, 1)[:, None] | |
tl.store(out_ptr0 + x0, tmp4, xmask) | |
tmp10 = tl.sum(_tmp10, 1)[:, None] | |
tl.store(out_ptr1 + x0, tmp10, xmask) | |
tmp11 = tl.load(in_ptr4 + (x0), xmask) | |
tmp12 = tmp10 * tmp11 | |
tl.store(out_ptr2 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp12, xmask) | |
def get_args(): | |
arg_0 = rand_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda:0', dtype=torch.float16) | |
arg_1 = rand_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda:0', dtype=torch.float16) | |
arg_2 = rand_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda:0', dtype=torch.float16) | |
arg_3 = rand_strided((1, 624, 1, 1), (624, 1, 1, 1), device='cuda:0', dtype=torch.float32) | |
arg_4 = rand_strided((624,), (1,), device='cuda:0', dtype=torch.float32) | |
arg_5 = rand_strided((624,), (1,), device='cuda:0', dtype=torch.float32) | |
arg_6 = rand_strided((624,), (1,), device='cuda:0', dtype=torch.float32) | |
arg_7 = rand_strided((624,), (1,), device='cuda:0', dtype=torch.float32) | |
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7, | |
def call(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
stream0 = get_cuda_stream(0) | |
triton_red_fused__native_batch_norm_legit_functional_mul_native_batch_norm_backward_103.run(*args, 624, 25088, grid=grid(624), stream=stream0) | |
def benchmark_all_configs(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
return triton_red_fused__native_batch_norm_legit_functional_mul_native_batch_norm_backward_103.benchmark_all_configs(*args, 624, 25088, grid=grid(624)) | |
if __name__ == '__main__': | |
from torch._inductor.utils import get_num_bytes | |
from triton.testing import do_bench | |
args = get_args() | |
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9 | |
gb_per_s = num_gb / (ms / 1e3) | |
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
''') | |
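# NOTE (hand-written sketch, not generated code): this reduction emits the
# three per-channel sums batch-norm backward needs, folding the elementwise
# gate multiply into the same pass over the data. Eager sketch, assuming
# in_ptr3/in_ptr4 hold the saved per-channel mean and invstd:
def _bn_backward_sums_sketch(g, gate, x, mean, invstd):
    dy = g.float() * gate.float()
    sum_dy = dy.sum(dim=(0, 2, 3))                                  # out_ptr0
    sum_dy_xmu = (dy * (x.float() - mean.reshape(1, -1, 1, 1))).sum(dim=(0, 2, 3))
    # out_ptr1, out_ptr2 (the latter is the weight gradient)
    return sum_dy, sum_dy_xmu, sum_dy_xmu * invstd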
# kernel path: /tmp/torchinductor_shunting/n6/cn6vgxz3swb2xjokxek7wjtmjhdz6hdzplejuatnteeyvplenl3a.py | |
# Original ATen: aten._native_batch_norm_legit_functional, aten.convolution_backward, aten.mul, aten.native_batch_norm_backward | |
# aten._native_batch_norm_legit_functional => convert_element_type_216 | |
# aten.convolution_backward => convolution_backward_70 | |
# aten.mul => mul_789 | |
# aten.native_batch_norm_backward => convert_element_type_586, convert_element_type_588, mul_796, mul_797, sub_186, sub_188, sub_189 | |
triton_poi_fused__native_batch_norm_legit_functional_convolution_backward_mul_native_batch_norm_backward_104 = async_compile.triton(''' | |
import triton | |
import triton.language as tl | |
from torch._inductor.ir import ReductionHint | |
from torch._inductor.ir import TileHint | |
from torch._inductor.triton_heuristics import pointwise | |
from torch._inductor.utils import instance_descriptor | |
from torch._dynamo.testing import rand_strided | |
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
import torch | |
from torch._inductor.triton_heuristics import grid | |
@pointwise(size_hints=[16777216], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp32', 4: '*fp32', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8), equal_to_1=())]}) | |
@triton.jit | |
def triton_poi_fused__native_batch_norm_legit_functional_convolution_backward_mul_native_batch_norm_backward_104(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, in_ptr6, xnumel, XBLOCK : tl.constexpr): | |
xnumel = 15654912 | |
xoffset = tl.program_id(0) * XBLOCK | |
xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
xmask = xindex < xnumel | |
x3 = xindex | |
x1 = (xindex // 196) % 624 | |
tmp0 = tl.load(in_out_ptr0 + (x3), None).to(tl.float32) | |
tmp1 = tl.load(in_ptr0 + (x3), None).to(tl.float32) | |
tmp4 = tl.load(in_ptr1 + (x3), None).to(tl.float32) | |
tmp6 = tl.load(in_ptr2 + (x1), None) | |
tmp8 = tl.load(in_ptr3 + (x1), None) | |
tmp11 = tl.load(in_ptr4 + (x1), None) | |
tmp16 = tl.load(in_ptr5 + (x1), None) | |
tmp19 = tl.load(in_ptr6 + (x1), None) | |
tmp2 = tmp0 * tmp1 | |
tmp3 = tmp2.to(tl.float32) | |
tmp5 = tmp4.to(tl.float32) | |
tmp7 = tmp5 - tmp6 | |
tmp9 = 3.985969387755102e-05 | |
tmp10 = tmp8 * tmp9 | |
tmp12 = tmp11 * tmp11 | |
tmp13 = tmp10 * tmp12 | |
tmp14 = tmp7 * tmp13 | |
tmp15 = tmp3 - tmp14 | |
tmp17 = tmp16 * tmp9 | |
tmp18 = tmp15 - tmp17 | |
tmp20 = tmp11 * tmp19 | |
tmp21 = tmp18 * tmp20 | |
tmp22 = tmp21.to(tl.float32) | |
tl.store(in_out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp22, None) | |
def get_args(): | |
arg_0 = rand_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda:0', dtype=torch.float16) | |
arg_1 = rand_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda:0', dtype=torch.float16) | |
arg_2 = rand_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda:0', dtype=torch.float16) | |
arg_3 = rand_strided((1, 624, 1, 1), (624, 1, 1, 1), device='cuda:0', dtype=torch.float32) | |
arg_4 = rand_strided((624,), (1,), device='cuda:0', dtype=torch.float32) | |
arg_5 = rand_strided((624,), (1,), device='cuda:0', dtype=torch.float32) | |
arg_6 = rand_strided((624,), (1,), device='cuda:0', dtype=torch.float32) | |
arg_7 = rand_strided((624,), (1,), device='cuda:0', dtype=torch.float32) | |
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7, | |
def call(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
stream0 = get_cuda_stream(0) | |
triton_poi_fused__native_batch_norm_legit_functional_convolution_backward_mul_native_batch_norm_backward_104.run(*args, 15654912, grid=grid(15654912), stream=stream0) | |
def benchmark_all_configs(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
return triton_poi_fused__native_batch_norm_legit_functional_convolution_backward_mul_native_batch_norm_backward_104.benchmark_all_configs(*args, 15654912, grid=grid(15654912)) | |
if __name__ == '__main__': | |
from torch._inductor.utils import get_num_bytes | |
from triton.testing import do_bench | |
args = get_args() | |
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9 | |
gb_per_s = num_gb / (ms / 1e3) | |
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
''') | |
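# NOTE (hand-written sketch, not generated code): unlike its siblings, this
# kernel mutates its first argument in place (mutated_arg_names=
# ['in_out_ptr0']); accordingly the harness passes num_in_out_args=1 so that
# buffer is counted twice -- once read, once written -- when estimating bytes
# moved. A sketch of that accounting (hypothetical helper):
def _traffic_bytes_sketch(args, num_in_out_args=0):
    sizes = [t.numel() * t.element_size() for t in args]
    return sum(sizes) + sum(sizes[:num_in_out_args])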
# kernel path: /tmp/torchinductor_shunting/iv/civ2oft43774ffyg3v5kh23rmteh5qvbuafa2jrfla6exeuvk3sq.py | |
# Original ATen: aten._to_copy | |
# aten._to_copy => convert_element_type_589 | |
triton_poi_fused__to_copy_105 = async_compile.triton(''' | |
import triton | |
import triton.language as tl | |
from torch._inductor.ir import ReductionHint | |
from torch._inductor.ir import TileHint | |
from torch._inductor.triton_heuristics import pointwise | |
from torch._inductor.utils import instance_descriptor | |
from torch._dynamo.testing import rand_strided | |
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
import torch | |
from torch._inductor.triton_heuristics import grid | |
@pointwise(size_hints=[65536], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]}) | |
@triton.jit | |
def triton_poi_fused__to_copy_105(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr): | |
xnumel = 64896 | |
xoffset = tl.program_id(0) * XBLOCK | |
xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
xmask = xindex < xnumel | |
x0 = xindex | |
tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32) | |
tmp1 = tmp0.to(tl.float32) | |
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask) | |
def get_args(): | |
arg_0 = rand_strided((624, 104, 1, 1), (104, 1, 1, 1), device='cuda:0', dtype=torch.float16) | |
arg_1 = rand_strided((624, 104, 1, 1), (104, 1, 1, 1), device='cuda:0', dtype=torch.float32) | |
return arg_0, arg_1, | |
def call(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
stream0 = get_cuda_stream(0) | |
triton_poi_fused__to_copy_105.run(*args, 64896, grid=grid(64896), stream=stream0) | |
def benchmark_all_configs(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
return triton_poi_fused__to_copy_105.benchmark_all_configs(*args, 64896, grid=grid(64896)) | |
if __name__ == '__main__': | |
from torch._inductor.utils import get_num_bytes | |
from triton.testing import do_bench | |
args = get_args() | |
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9 | |
gb_per_s = num_gb / (ms / 1e3) | |
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
''') | |
# kernel path: /tmp/torchinductor_shunting/j7/cj7ui4cmg7zcaen4xj4vc2j54wf3j5vlszxmshcee37dzcpqxdcd.py | |
# Original ATen: aten._native_batch_norm_legit_functional, aten.native_batch_norm_backward | |
# aten._native_batch_norm_legit_functional => convert_element_type_213 | |
# aten.native_batch_norm_backward => convert_element_type_590, mul_799, sub_190, sum_76, sum_77 | |
triton_red_fused__native_batch_norm_legit_functional_native_batch_norm_backward_106 = async_compile.triton(''' | |
import triton | |
import triton.language as tl | |
from torch._inductor.ir import ReductionHint | |
from torch._inductor.ir import TileHint | |
from torch._inductor.triton_heuristics import reduction | |
from torch._inductor.utils import instance_descriptor | |
from torch._dynamo.testing import rand_strided | |
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
import torch | |
from torch._inductor.triton_heuristics import grid | |
@reduction( | |
size_hints=[512, 8192], | |
reduction_hint=ReductionHint.INNER, | |
filename=__file__, | |
meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: 'i32', 6: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6), equal_to_1=())]} | |
) | |
@triton.jit | |
def triton_red_fused__native_batch_norm_legit_functional_native_batch_norm_backward_106(in_ptr0, in_ptr1, in_ptr2, out_ptr0, out_ptr1, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr): | |
xnumel = 416 | |
rnumel = 6272 | |
xoffset = tl.program_id(0) * XBLOCK | |
xindex = xoffset + tl.arange(0, XBLOCK)[:, None] | |
xmask = xindex < xnumel | |
rbase = tl.arange(0, RBLOCK)[None, :] | |
x0 = xindex % 104 | |
x1 = (xindex // 104) | |
_tmp2 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0 | |
x3 = xindex | |
tmp5 = tl.load(in_ptr2 + (x0), xmask) | |
_tmp8 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0 | |
for roffset in range(0, rnumel, RBLOCK): | |
rindex = roffset + rbase | |
rmask = rindex < rnumel | |
r2 = rindex | |
tmp0 = tl.load(in_ptr0 + ((196*x0) + (20384*(r2 // 196)) + (652288*x1) + (r2 % 196)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32) | |
tmp3 = tl.load(in_ptr1 + ((196*x0) + (20384*(r2 // 196)) + (652288*x1) + (r2 % 196)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32) | |
tmp1 = tmp0.to(tl.float32) | |
_tmp2 = tl.where(rmask & xmask, _tmp2 + tmp1, _tmp2) | |
tmp4 = tmp3.to(tl.float32) | |
tmp6 = tmp4 - tmp5 | |
tmp7 = tmp1 * tmp6 | |
_tmp8 = tl.where(rmask & xmask, _tmp8 + tmp7, _tmp8) | |
tmp2 = tl.sum(_tmp2, 1)[:, None] | |
tl.store(out_ptr0 + x3, tmp2, xmask) | |
tmp8 = tl.sum(_tmp8, 1)[:, None] | |
tl.store(out_ptr1 + x3, tmp8, xmask) | |
def get_args(): | |
arg_0 = rand_strided((128, 104, 14, 14), (20384, 196, 14, 1), device='cuda:0', dtype=torch.float16) | |
arg_1 = rand_strided((128, 104, 14, 14), (20384, 196, 14, 1), device='cuda:0', dtype=torch.float16) | |
arg_2 = rand_strided((1, 104, 1, 1), (104, 1, 1, 1), device='cuda:0', dtype=torch.float32) | |
arg_3 = rand_strided((104, 4), (1, 104), device='cuda:0', dtype=torch.float32) | |
arg_4 = rand_strided((104, 4), (1, 104), device='cuda:0', dtype=torch.float32) | |
return arg_0, arg_1, arg_2, arg_3, arg_4, | |
def call(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
stream0 = get_cuda_stream(0) | |
triton_red_fused__native_batch_norm_legit_functional_native_batch_norm_backward_106.run(*args, 416, 6272, grid=grid(416), stream=stream0) | |
def benchmark_all_configs(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
return triton_red_fused__native_batch_norm_legit_functional_native_batch_norm_backward_106.benchmark_all_configs(*args, 416, 6272, grid=grid(416)) | |
if __name__ == '__main__': | |
from torch._inductor.utils import get_num_bytes | |
from triton.testing import do_bench | |
args = get_args() | |
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9 | |
gb_per_s = num_gb / (ms / 1e3) | |
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
''') | |
# kernel path: /tmp/torchinductor_shunting/at/catlmgmveca6eegfxgggc25rzm3umawxz52w4d3665iwc5nkqdhv.py | |
# Original ATen: aten.native_batch_norm_backward | |
# aten.native_batch_norm_backward => convert_element_type_590, sum_76 | |
triton_per_fused_native_batch_norm_backward_107 = async_compile.triton(''' | |
import triton | |
import triton.language as tl | |
from torch._inductor.ir import ReductionHint | |
from torch._inductor.ir import TileHint | |
from torch._inductor.triton_heuristics import persistent_reduction | |
from torch._inductor.utils import instance_descriptor | |
from torch._dynamo.testing import rand_strided | |
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
import torch | |
from torch._inductor.triton_heuristics import grid | |
@persistent_reduction( | |
size_hints=[128, 4], | |
reduction_hint=ReductionHint.INNER, | |
filename=__file__, | |
meta={'signature': {0: '*fp32', 1: '*fp32', 2: 'i32', 3: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1), equal_to_1=())]} | |
) | |
@triton.jit | |
def triton_per_fused_native_batch_norm_backward_107(in_ptr0, out_ptr0, xnumel, rnumel, XBLOCK : tl.constexpr): | |
xnumel = 104 | |
rnumel = 4 | |
RBLOCK: tl.constexpr = 4 | |
xoffset = tl.program_id(0) * XBLOCK | |
xindex = xoffset + tl.arange(0, XBLOCK)[:, None] | |
xmask = xindex < xnumel | |
rindex = tl.arange(0, RBLOCK)[None, :] | |
rmask = rindex < rnumel | |
r1 = rindex | |
x0 = xindex | |
tmp0 = tl.load(in_ptr0 + (x0 + (104*r1)), rmask & xmask, other=0) | |
tmp2 = tl.where(rmask & xmask, tmp0, 0) | |
tmp3 = tl.sum(tmp2, 1)[:, None] | |
tl.store(out_ptr0 + x0, tmp3, xmask) | |
def get_args(): | |
arg_0 = rand_strided((104, 4), (1, 104), device='cuda:0', dtype=torch.float32) | |
arg_1 = rand_strided((104,), (1,), device='cuda:0', dtype=torch.float32) | |
return arg_0, arg_1, | |
def call(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
stream0 = get_cuda_stream(0) | |
triton_per_fused_native_batch_norm_backward_107.run(*args, 104, 4, grid=grid(104), stream=stream0) | |
def benchmark_all_configs(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
return triton_per_fused_native_batch_norm_backward_107.benchmark_all_configs(*args, 104, 4, grid=grid(104)) | |
if __name__ == '__main__': | |
from torch._inductor.utils import get_num_bytes | |
from triton.testing import do_bench | |
args = get_args() | |
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9 | |
gb_per_s = num_gb / (ms / 1e3) | |
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
''') | |
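# NOTE (hand-written sketch, not generated code): kernel 106 above splits the
# 25088-element per-channel reduction into 4 chunks of 6272, leaving (104, 4)
# partial sums; this persistent reduction folds the 4 partials into the final
# (104,) result. Eager sketch of the second stage:
def _combine_partials_sketch(partials):
    # partials: (104, 4) fp32 -> (104,) fp32
    return partials.sum(dim=1)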
# kernel path: /tmp/torchinductor_shunting/sw/cswvpkxc2jcvqnphnzcluqix5sdpoevu3y2llapfjncqthtvmuif.py | |
# Original ATen: aten._native_batch_norm_legit_functional, aten.native_batch_norm_backward | |
# aten._native_batch_norm_legit_functional => convert_element_type_213 | |
# aten.native_batch_norm_backward => convert_element_type_590, mul_799, mul_807, sub_190, sum_77 | |
triton_per_fused__native_batch_norm_legit_functional_native_batch_norm_backward_108 = async_compile.triton(''' | |
import triton | |
import triton.language as tl | |
from torch._inductor.ir import ReductionHint | |
from torch._inductor.ir import TileHint | |
from torch._inductor.triton_heuristics import persistent_reduction | |
from torch._inductor.utils import instance_descriptor | |
from torch._dynamo.testing import rand_strided | |
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
import torch | |
from torch._inductor.triton_heuristics import grid | |
@persistent_reduction( | |
size_hints=[128, 4], | |
reduction_hint=ReductionHint.INNER, | |
filename=__file__, | |
meta={'signature': {0: '*fp32', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: 'i32', 5: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3), equal_to_1=())]} | |
) | |
@triton.jit | |
def triton_per_fused__native_batch_norm_legit_functional_native_batch_norm_backward_108(in_ptr0, in_ptr1, out_ptr0, out_ptr1, xnumel, rnumel, XBLOCK : tl.constexpr): | |
xnumel = 104 | |
rnumel = 4 | |
RBLOCK: tl.constexpr = 4 | |
xoffset = tl.program_id(0) * XBLOCK | |
xindex = xoffset + tl.arange(0, XBLOCK)[:, None] | |
xmask = xindex < xnumel | |
rindex = tl.arange(0, RBLOCK)[None, :] | |
rmask = rindex < rnumel | |
r1 = rindex | |
x0 = xindex | |
tmp0 = tl.load(in_ptr0 + (x0 + (104*r1)), rmask & xmask, other=0) | |
tmp4 = tl.load(in_ptr1 + (x0), xmask) | |
tmp2 = tl.where(rmask & xmask, tmp0, 0) | |
tmp3 = tl.sum(tmp2, 1)[:, None] | |
tmp5 = tmp3 * tmp4 | |
tl.store(out_ptr1 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp5, xmask) | |
tl.store(out_ptr0 + x0, tmp3, xmask) | |
def get_args(): | |
arg_0 = rand_strided((104, 4), (1, 104), device='cuda:0', dtype=torch.float32) | |
arg_1 = rand_strided((104,), (1,), device='cuda:0', dtype=torch.float32) | |
arg_2 = rand_strided((104,), (1,), device='cuda:0', dtype=torch.float32) | |
arg_3 = rand_strided((104,), (1,), device='cuda:0', dtype=torch.float32) | |
return arg_0, arg_1, arg_2, arg_3, | |
def call(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
stream0 = get_cuda_stream(0) | |
triton_per_fused__native_batch_norm_legit_functional_native_batch_norm_backward_108.run(*args, 104, 4, grid=grid(104), stream=stream0) | |
def benchmark_all_configs(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
return triton_per_fused__native_batch_norm_legit_functional_native_batch_norm_backward_108.benchmark_all_configs(*args, 104, 4, grid=grid(104)) | |
if __name__ == '__main__': | |
from torch._inductor.utils import get_num_bytes | |
from triton.testing import do_bench | |
args = get_args() | |
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9 | |
gb_per_s = num_gb / (ms / 1e3) | |
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
''') | |
# kernel path: /tmp/torchinductor_shunting/6d/c6dicjfqwciclb5imlva4adfrsslgvcv4kzchyl4zcmedltgvjfk.py | |
# Original ATen: aten._native_batch_norm_legit_functional, aten.native_batch_norm_backward | |
# aten._native_batch_norm_legit_functional => convert_element_type_213 | |
# aten.native_batch_norm_backward => convert_element_type_590, convert_element_type_592, mul_805, mul_806, sub_190, sub_192, sub_193 | |
triton_poi_fused__native_batch_norm_legit_functional_native_batch_norm_backward_109 = async_compile.triton(''' | |
import triton | |
import triton.language as tl | |
from torch._inductor.ir import ReductionHint | |
from torch._inductor.ir import TileHint | |
from torch._inductor.triton_heuristics import pointwise | |
from torch._inductor.utils import instance_descriptor | |
from torch._dynamo.testing import rand_strided | |
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
import torch | |
from torch._inductor.triton_heuristics import grid | |
@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp32', 6: '*fp32', 7: '*fp16', 8: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8), equal_to_1=())]}) | |
@triton.jit | |
def triton_poi_fused__native_batch_norm_legit_functional_native_batch_norm_backward_109(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, in_ptr6, out_ptr0, xnumel, XBLOCK : tl.constexpr): | |
xnumel = 2609152 | |
xoffset = tl.program_id(0) * XBLOCK | |
xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
xmask = xindex < xnumel | |
x3 = xindex | |
x1 = (xindex // 196) % 104 | |
tmp0 = tl.load(in_ptr0 + (x3), None).to(tl.float32) | |
tmp2 = tl.load(in_ptr1 + (x3), None).to(tl.float32) | |
tmp4 = tl.load(in_ptr2 + (x1), None) | |
tmp6 = tl.load(in_ptr3 + (x1), None) | |
tmp9 = tl.load(in_ptr4 + (x1), None) | |
tmp14 = tl.load(in_ptr5 + (x1), None) | |
tmp17 = tl.load(in_ptr6 + (x1), None) | |
tmp1 = tmp0.to(tl.float32) | |
tmp3 = tmp2.to(tl.float32) | |
tmp5 = tmp3 - tmp4 | |
tmp7 = 3.985969387755102e-05 | |
tmp8 = tmp6 * tmp7 | |
tmp10 = tmp9 * tmp9 | |
tmp11 = tmp8 * tmp10 | |
tmp12 = tmp5 * tmp11 | |
tmp13 = tmp1 - tmp12 | |
tmp15 = tmp14 * tmp7 | |
tmp16 = tmp13 - tmp15 | |
tmp18 = tmp9 * tmp17 | |
tmp19 = tmp16 * tmp18 | |
tmp20 = tmp19.to(tl.float32) | |
tl.store(out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp20, None) | |
def get_args(): | |
arg_0 = rand_strided((128, 104, 14, 14), (20384, 196, 14, 1), device='cuda:0', dtype=torch.float16) | |
arg_1 = rand_strided((128, 104, 14, 14), (20384, 196, 14, 1), device='cuda:0', dtype=torch.float16) | |
arg_2 = rand_strided((1, 104, 1, 1), (104, 1, 1, 1), device='cuda:0', dtype=torch.float32) | |
arg_3 = rand_strided((104,), (1,), device='cuda:0', dtype=torch.float32) | |
arg_4 = rand_strided((104,), (1,), device='cuda:0', dtype=torch.float32) | |
arg_5 = rand_strided((104,), (1,), device='cuda:0', dtype=torch.float32) | |
arg_6 = rand_strided((104,), (1,), device='cuda:0', dtype=torch.float32) | |
arg_7 = rand_strided((128, 104, 14, 14), (20384, 196, 14, 1), device='cuda:0', dtype=torch.float16) | |
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7, | |
def call(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
stream0 = get_cuda_stream(0) | |
triton_poi_fused__native_batch_norm_legit_functional_native_batch_norm_backward_109.run(*args, 2609152, grid=grid(2609152), stream=stream0) | |
def benchmark_all_configs(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
return triton_poi_fused__native_batch_norm_legit_functional_native_batch_norm_backward_109.benchmark_all_configs(*args, 2609152, grid=grid(2609152)) | |
if __name__ == '__main__': | |
from torch._inductor.utils import get_num_bytes | |
from triton.testing import do_bench | |
args = get_args() | |
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9 | |
gb_per_s = num_gb / (ms / 1e3) | |
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
''') | |
# kernel path: /tmp/torchinductor_shunting/u6/cu6gj4lj3w4o4rbwgegqtcfzcryyabv5bfn4fymj6h576kucjrrk.py | |
# Original ATen: aten._to_copy | |
# aten._to_copy => convert_element_type_593 | |
triton_poi_fused__to_copy_110 = async_compile.triton(''' | |
import triton | |
import triton.language as tl | |
from torch._inductor.ir import ReductionHint | |
from torch._inductor.ir import TileHint | |
from torch._inductor.triton_heuristics import pointwise | |
from torch._inductor.utils import instance_descriptor | |
from torch._dynamo.testing import rand_strided | |
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
import torch | |
from torch._inductor.triton_heuristics import grid | |
@pointwise(size_hints=[16384], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]}) | |
@triton.jit | |
def triton_poi_fused__to_copy_110(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr): | |
xnumel = 16224 | |
xoffset = tl.program_id(0) * XBLOCK | |
xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
xmask = xindex < xnumel | |
x0 = xindex | |
tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32) | |
tmp1 = tmp0.to(tl.float32) | |
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask) | |
def get_args(): | |
arg_0 = rand_strided((52, 312, 1, 1), (312, 1, 1, 1), device='cuda:0', dtype=torch.float16) | |
arg_1 = rand_strided((52, 312, 1, 1), (312, 1, 1, 1), device='cuda:0', dtype=torch.float32) | |
return arg_0, arg_1, | |
def call(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
stream0 = get_cuda_stream(0) | |
triton_poi_fused__to_copy_110.run(*args, 16224, grid=grid(16224), stream=stream0) | |
def benchmark_all_configs(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
return triton_poi_fused__to_copy_110.benchmark_all_configs(*args, 16224, grid=grid(16224)) | |
if __name__ == '__main__': | |
from torch._inductor.utils import get_num_bytes | |
from triton.testing import do_bench | |
args = get_args() | |
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9 | |
gb_per_s = num_gb / (ms / 1e3) | |
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
''') | |
# kernel path: /tmp/torchinductor_shunting/cq/ccq5bqvz5gysfuffc2onzn5q2itdz5cceokvaxh3r7x3536x7ffr.py | |
# Original ATen: aten.cat | |
# aten.cat => cat_57 | |
triton_poi_fused_cat_111 = async_compile.triton(''' | |
import triton | |
import triton.language as tl | |
from torch._inductor.ir import ReductionHint | |
from torch._inductor.ir import TileHint | |
from torch._inductor.triton_heuristics import pointwise | |
from torch._inductor.utils import instance_descriptor | |
from torch._dynamo.testing import rand_strided | |
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
import torch | |
from torch._inductor.triton_heuristics import grid | |
@pointwise(size_hints=[8388608], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]}) | |
@triton.jit | |
def triton_poi_fused_cat_111(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr): | |
xnumel = 7827456 | |
xoffset = tl.program_id(0) * XBLOCK | |
xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
xmask = xindex < xnumel | |
x2 = xindex | |
x0 = xindex % 61152 | |
x1 = (xindex // 61152) | |
tmp0 = tl.load(in_ptr0 + (x2), None).to(tl.float32) | |
tl.store(out_ptr0 + (x0 + (122304*x1) + tl.zeros([XBLOCK], tl.int32)), tmp0, None) | |
def get_args(): | |
arg_0 = rand_strided((128, 312, 14, 14), (61152, 196, 14, 1), device='cuda:0', dtype=torch.float16) | |
arg_1 = rand_strided((128, 312, 14, 14), (122304, 196, 14, 1), device='cuda:0', dtype=torch.float16) | |
return arg_0, arg_1, | |
def call(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
stream0 = get_cuda_stream(0) | |
triton_poi_fused_cat_111.run(*args, 7827456, grid=grid(7827456), stream=stream0) | |
def benchmark_all_configs(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
return triton_poi_fused_cat_111.benchmark_all_configs(*args, 7827456, grid=grid(7827456)) | |
if __name__ == '__main__': | |
from torch._inductor.utils import get_num_bytes | |
from triton.testing import do_bench | |
args = get_args() | |
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9 | |
gb_per_s = num_gb / (ms / 1e3) | |
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
''') | |
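# NOTE (hand-written sketch, not generated code): this cat kernel copies a
# (128, 312, 14, 14) gradient into the first 312 channels of a
# (128, 624, 14, 14) buffer (the write stride 122304 is one 624-channel batch
# step); the remaining channels are presumably written by a sibling copy.
# Eager sketch:
def _cat_first_half_sketch(src, dst):
    dst[:, :312].copy_(src)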
# kernel path: /tmp/torchinductor_shunting/wo/cwolgc4dw6izbizjkqhmxp5s3e2v7ufjibf3admpysa3tcfcvbwi.py | |
# Original ATen: aten._to_copy | |
# aten._to_copy => convert_element_type_598 | |
triton_poi_fused__to_copy_112 = async_compile.triton(''' | |
import triton | |
import triton.language as tl | |
from torch._inductor.ir import ReductionHint | |
from torch._inductor.ir import TileHint | |
from torch._inductor.triton_heuristics import pointwise | |
from torch._inductor.utils import instance_descriptor | |
from torch._dynamo.testing import rand_strided | |
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
import torch | |
from torch._inductor.triton_heuristics import grid | |
@pointwise(size_hints=[16384], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]}) | |
@triton.jit | |
def triton_poi_fused__to_copy_112(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr): | |
xnumel = 16224 | |
xoffset = tl.program_id(0) * XBLOCK | |
xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
xmask = xindex < xnumel | |
x0 = xindex | |
tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32) | |
tmp1 = tmp0.to(tl.float32) | |
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask) | |
def get_args(): | |
arg_0 = rand_strided((624, 26, 1, 1), (26, 1, 1, 1), device='cuda:0', dtype=torch.float16) | |
arg_1 = rand_strided((624, 26, 1, 1), (26, 1, 1, 1), device='cuda:0', dtype=torch.float32) | |
return arg_0, arg_1, | |
def call(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
stream0 = get_cuda_stream(0) | |
triton_poi_fused__to_copy_112.run(*args, 16224, grid=grid(16224), stream=stream0) | |
def benchmark_all_configs(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
return triton_poi_fused__to_copy_112.benchmark_all_configs(*args, 16224, grid=grid(16224)) | |
if __name__ == '__main__': | |
from torch._inductor.utils import get_num_bytes | |
from triton.testing import do_bench | |
args = get_args() | |
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9 | |
gb_per_s = num_gb / (ms / 1e3) | |
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
''') | |
# kernel path: /tmp/torchinductor_shunting/zs/czsqkpgzlfpo7vf22txwb4vktp6li35xdtdplz23vaooelx52mgz.py | |
# Original ATen: aten.add, aten.clone, aten.fill, aten.mul, aten.sigmoid, aten.sub | |
# aten.add => add_342 | |
# aten.clone => clone_23 | |
# aten.fill => full_like_24 | |
# aten.mul => mul_812, mul_813, mul_814 | |
# aten.sigmoid => sigmoid_88 | |
# aten.sub => sub_195 | |
triton_poi_fused_add_clone_fill_mul_sigmoid_sub_113 = async_compile.triton(''' | |
import triton | |
import triton.language as tl | |
from torch._inductor.ir import ReductionHint | |
from torch._inductor.ir import TileHint | |
from torch._inductor.triton_heuristics import pointwise | |
from torch._inductor.utils import instance_descriptor | |
from torch._dynamo.testing import rand_strided | |
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
import torch | |
from torch._inductor.triton_heuristics import grid | |
@pointwise(size_hints=[4096], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]}) | |
@triton.jit | |
def triton_poi_fused_add_clone_fill_mul_sigmoid_sub_113(in_out_ptr0, in_ptr0, xnumel, XBLOCK : tl.constexpr): | |
xnumel = 3328 | |
xoffset = tl.program_id(0) * XBLOCK | |
xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
xmask = xindex < xnumel | |
x0 = xindex | |
tmp0 = tl.load(in_out_ptr0 + (x0), xmask).to(tl.float32) | |
tmp1 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32) | |
tmp2 = tl.sigmoid(tmp1) | |
tmp3 = 1.0 | |
tmp4 = tmp3 - tmp2 | |
tmp5 = tmp1 * tmp4 | |
tmp6 = tmp5 + tmp3 | |
tmp7 = tmp2 * tmp6 | |
tmp8 = tmp0 * tmp7 | |
tl.store(in_out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp8, xmask) | |
def get_args(): | |
arg_0 = rand_strided((128, 26, 1, 1), (26, 1, 1, 1), device='cuda:0', dtype=torch.float16) | |
arg_1 = rand_strided((128, 26, 1, 1), (26, 1, 1, 1), device='cuda:0', dtype=torch.float16) | |
return arg_0, arg_1, | |
def call(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
stream0 = get_cuda_stream(0) | |
triton_poi_fused_add_clone_fill_mul_sigmoid_sub_113.run(*args, 3328, grid=grid(3328), stream=stream0) | |
def benchmark_all_configs(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
return triton_poi_fused_add_clone_fill_mul_sigmoid_sub_113.benchmark_all_configs(*args, 3328, grid=grid(3328)) | |
if __name__ == '__main__': | |
from torch._inductor.utils import get_num_bytes | |
from triton.testing import do_bench | |
args = get_args() | |
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9 | |
gb_per_s = num_gb / (ms / 1e3) | |
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
''') | |
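# NOTE (hand-written sketch, not generated code): this kernel applies the
# SiLU/swish derivative, d/dx [x*sigmoid(x)] = sigmoid(x)*(1 + x*(1 -
# sigmoid(x))), in place to the SE-bottleneck gradient. Eager sketch:
def _silu_backward_sketch(grad_out, x):
    s = torch.sigmoid(x)
    return grad_out * s * (1 + x * (1 - s))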
# kernel path: /tmp/torchinductor_shunting/br/cbrdby4wbj6lkywp7ihnm6qs7bmx7qo2e45i7oxtqq4xbl3oog4o.py | |
# Original ATen: aten._to_copy, aten.convolution_backward | |
# aten._to_copy => convert_element_type_601 | |
# aten.convolution_backward => sum_80 | |
triton_per_fused__to_copy_convolution_backward_114 = async_compile.triton(''' | |
import triton | |
import triton.language as tl | |
from torch._inductor.ir import ReductionHint | |
from torch._inductor.ir import TileHint | |
from torch._inductor.triton_heuristics import persistent_reduction | |
from torch._inductor.utils import instance_descriptor | |
from torch._dynamo.testing import rand_strided | |
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
import torch | |
from torch._inductor.triton_heuristics import grid | |
@persistent_reduction( | |
size_hints=[32, 128], | |
reduction_hint=ReductionHint.INNER, | |
filename=__file__, | |
meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32', 3: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 3), equal_to_1=())]} | |
) | |
@triton.jit | |
def triton_per_fused__to_copy_convolution_backward_114(in_ptr0, out_ptr1, xnumel, rnumel, XBLOCK : tl.constexpr): | |
xnumel = 26 | |
rnumel = 128 | |
RBLOCK: tl.constexpr = 128 | |
xoffset = tl.program_id(0) * XBLOCK | |
xindex = xoffset + tl.arange(0, XBLOCK)[:, None] | |
xmask = xindex < xnumel | |
rindex = tl.arange(0, RBLOCK)[None, :] | |
rmask = rindex < rnumel | |
r1 = rindex | |
x0 = xindex | |
tmp0 = tl.load(in_ptr0 + (x0 + (26*r1)), rmask & xmask, other=0).to(tl.float32) | |
tmp2 = tl.where(rmask & xmask, tmp0, 0) | |
tmp3 = tl.sum(tmp2, 1)[:, None] | |
tmp4 = tmp3.to(tl.float32) | |
tl.store(out_ptr1 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp4, xmask) | |
def get_args(): | |
arg_0 = rand_strided((128, 26, 1, 1), (26, 1, 1, 1), device='cuda:0', dtype=torch.float16) | |
arg_1 = rand_strided((26,), (1,), device='cuda:0', dtype=torch.float32) | |
return arg_0, arg_1, | |
def call(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
stream0 = get_cuda_stream(0) | |
triton_per_fused__to_copy_convolution_backward_114.run(*args, 26, 128, grid=grid(26), stream=stream0) | |
def benchmark_all_configs(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
return triton_per_fused__to_copy_convolution_backward_114.benchmark_all_configs(*args, 26, 128, grid=grid(26)) | |
if __name__ == '__main__': | |
from torch._inductor.utils import get_num_bytes | |
from triton.testing import do_bench | |
args = get_args() | |
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9 | |
gb_per_s = num_gb / (ms / 1e3) | |
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
''') | |
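# NOTE (hand-written sketch, not generated code): this is the bias gradient of
# the SE 1x1 convolution -- a batch-dimension sum of the (128, 26, 1, 1)
# gradient, accumulated in fp32. Eager sketch:
def _bias_grad_sketch(grad_out):
    # grad_out: (128, 26, 1, 1) fp16 -> (26,) fp32
    return grad_out.float().sum(dim=(0, 2, 3))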
# kernel path: /tmp/torchinductor_shunting/4u/c4uahgnz2ykwhxsj7bueioargms6lhe32ffh6vzmxhridx2iurdm.py | |
# Original ATen: aten._to_copy | |
# aten._to_copy => convert_element_type_600 | |
triton_poi_fused__to_copy_115 = async_compile.triton(''' | |
import triton | |
import triton.language as tl | |
from torch._inductor.ir import ReductionHint | |
from torch._inductor.ir import TileHint | |
from torch._inductor.triton_heuristics import pointwise | |
from torch._inductor.utils import instance_descriptor | |
from torch._dynamo.testing import rand_strided | |
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
import torch | |
from torch._inductor.triton_heuristics import grid | |
@pointwise(size_hints=[16384], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]}) | |
@triton.jit | |
def triton_poi_fused__to_copy_115(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr): | |
xnumel = 16224 | |
xoffset = tl.program_id(0) * XBLOCK | |
xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
xmask = xindex < xnumel | |
x0 = xindex | |
tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32) | |
tmp1 = tmp0.to(tl.float32) | |
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask) | |
def get_args(): | |
arg_0 = rand_strided((26, 624, 1, 1), (624, 1, 1, 1), device='cuda:0', dtype=torch.float16) | |
arg_1 = rand_strided((26, 624, 1, 1), (624, 1, 1, 1), device='cuda:0', dtype=torch.float32) | |
return arg_0, arg_1, | |
def call(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
stream0 = get_cuda_stream(0) | |
triton_poi_fused__to_copy_115.run(*args, 16224, grid=grid(16224), stream=stream0) | |
def benchmark_all_configs(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
return triton_poi_fused__to_copy_115.benchmark_all_configs(*args, 16224, grid=grid(16224)) | |
if __name__ == '__main__': | |
from torch._inductor.utils import get_num_bytes | |
from triton.testing import do_bench | |
args = get_args() | |
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9 | |
gb_per_s = num_gb / (ms / 1e3) | |
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
''') | |
# kernel path: /tmp/torchinductor_shunting/ke/cke6ikrfgpb6sjssa2vwqbbb346zboallywrrkyxxcehakhefz5y.py | |
# Original ATen: aten._native_batch_norm_legit_functional, aten.add, aten.clone, aten.div, aten.fill, aten.mul, aten.native_batch_norm_backward, aten.sigmoid, aten.sub | |
# aten._native_batch_norm_legit_functional => convert_element_type_201 | |
# aten.add => add_343, add_344 | |
# aten.clone => clone_22 | |
# aten.div => div_9 | |
# aten.fill => full_like_25 | |
# aten.mul => mul_809, mul_815, mul_816, mul_817 | |
# aten.native_batch_norm_backward => convert_element_type_602, mul_824, sub_197, sub_199, sub_200 | |
# aten.sigmoid => sigmoid_31, sigmoid_89 | |
# aten.sub => sub_196 | |
triton_poi_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_116 = async_compile.triton(''' | |
import triton | |
import triton.language as tl | |
from torch._inductor.ir import ReductionHint | |
from torch._inductor.ir import TileHint | |
from torch._inductor.triton_heuristics import pointwise | |
from torch._inductor.utils import instance_descriptor | |
from torch._dynamo.testing import rand_strided | |
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
import torch | |
from torch._inductor.triton_heuristics import grid | |
@pointwise(size_hints=[16777216], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp16', 4: '*fp16', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: '*fp32', 9: '*fp32', 10: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10), equal_to_1=())]}) | |
@triton.jit | |
def triton_poi_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_116(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, in_ptr6, in_ptr7, in_ptr8, out_ptr0, xnumel, XBLOCK : tl.constexpr): | |
xnumel = 15654912 | |
xoffset = tl.program_id(0) * XBLOCK | |
xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
xmask = xindex < xnumel | |
x3 = xindex | |
x4 = (xindex // 196) | |
x1 = (xindex // 196) % 624 | |
tmp0 = tl.load(in_ptr0 + (x3), None).to(tl.float32) | |
tmp1 = tl.load(in_ptr1 + (x4), None).to(tl.float32) | |
tmp4 = tl.load(in_ptr2 + (x4), None).to(tl.float32) | |
tmp8 = tl.load(in_ptr3 + (x3), None).to(tl.float32) | |
tmp17 = tl.load(in_ptr4 + (x3), None).to(tl.float32) | |
tmp19 = tl.load(in_ptr5 + (x1), None) | |
tmp21 = tl.load(in_ptr6 + (x1), None) | |
tmp24 = tl.load(in_ptr7 + (x1), None) | |
tmp29 = tl.load(in_ptr8 + (x1), None) | |
tmp2 = tl.sigmoid(tmp1) | |
tmp3 = tmp0 * tmp2 | |
tmp5 = 196.0 | |
tmp6 = tmp4 / tmp5 | |
tmp7 = tmp3 + tmp6 | |
tmp9 = tl.sigmoid(tmp8) | |
tmp10 = 1.0 | |
tmp11 = tmp10 - tmp9 | |
tmp12 = tmp8 * tmp11 | |
tmp13 = tmp12 + tmp10 | |
tmp14 = tmp9 * tmp13 | |
tmp15 = tmp7 * tmp14 | |
tmp16 = tmp15.to(tl.float32) | |
tmp18 = tmp17.to(tl.float32) | |
tmp20 = tmp18 - tmp19 | |
tmp22 = 3.985969387755102e-05 | |
tmp23 = tmp21 * tmp22 | |
tmp25 = tmp24 * tmp24 | |
tmp26 = tmp23 * tmp25 | |
tmp27 = tmp20 * tmp26 | |
tmp28 = tmp16 - tmp27 | |
tmp30 = tmp29 * tmp22 | |
tmp31 = tmp28 - tmp30 | |
tl.store(out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp31, None) | |
def get_args(): | |
arg_0 = rand_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda:0', dtype=torch.float16) | |
arg_1 = rand_strided((128, 624, 1, 1), (624, 1, 1, 1), device='cuda:0', dtype=torch.float16) | |
arg_2 = rand_strided((128, 624, 1, 1), (624, 1, 1, 1), device='cuda:0', dtype=torch.float16) | |
arg_3 = rand_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda:0', dtype=torch.float16) | |
arg_4 = rand_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda:0', dtype=torch.float16) | |
arg_5 = rand_strided((1, 624, 1, 1), (624, 1, 1, 1), device='cuda:0', dtype=torch.float32) | |
arg_6 = rand_strided((624,), (1,), device='cuda:0', dtype=torch.float32) | |
arg_7 = rand_strided((624,), (1,), device='cuda:0', dtype=torch.float32) | |
arg_8 = rand_strided((624,), (1,), device='cuda:0', dtype=torch.float32) | |
arg_9 = rand_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda:0', dtype=torch.float32) | |
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7, arg_8, arg_9, | |
def call(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
stream0 = get_cuda_stream(0) | |
triton_poi_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_116.run(*args, 15654912, grid=grid(15654912), stream=stream0) | |
def benchmark_all_configs(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
return triton_poi_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_116.benchmark_all_configs(*args, 15654912, grid=grid(15654912)) | |
if __name__ == '__main__': | |
from torch._inductor.utils import get_num_bytes | |
from triton.testing import do_bench | |
args = get_args() | |
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9 | |
gb_per_s = num_gb / (ms / 1e3) | |
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
''') | |
# kernel path: /tmp/torchinductor_shunting/v7/cv7sl67sz2xaf4l6bv2yby2sj22h2574hqzu7lld6chl75bgzok7.py | |
# Original ATen: aten.convolution_backward | |
# aten.convolution_backward => convolution_backward_75 | |
triton_poi_fused_convolution_backward_117 = async_compile.triton(''' | |
import triton | |
import triton.language as tl | |
from torch._inductor.ir import ReductionHint | |
from torch._inductor.ir import TileHint | |
from torch._inductor.triton_heuristics import pointwise | |
from torch._inductor.utils import instance_descriptor | |
from torch._dynamo.testing import rand_strided | |
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
import torch | |
from torch._inductor.triton_heuristics import grid | |
@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp32', 2: '*fp32', 3: '*fp16', 4: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4), equal_to_1=())]}) | |
@triton.jit | |
def triton_poi_fused_convolution_backward_117(in_ptr0, in_ptr1, in_ptr2, out_ptr0, xnumel, XBLOCK : tl.constexpr): | |
xnumel = 3913728 | |
xoffset = tl.program_id(0) * XBLOCK | |
xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
xmask = xindex < xnumel | |
x2 = (xindex // 30576) | |
x3 = xindex % 30576 | |
x1 = (xindex // 196) % 156 | |
x4 = xindex | |
tmp0 = tl.load(in_ptr0 + (91728 + x3 + (122304*x2)), None) | |
tmp1 = tl.load(in_ptr1 + (468 + x1), None) | |
tmp2 = tl.load(in_ptr2 + (468 + x1), None) | |
tmp3 = tmp1 * tmp2 | |
tmp4 = tmp0 * tmp3 | |
tmp5 = tmp4.to(tl.float32) | |
tl.store(out_ptr0 + (x4 + tl.zeros([XBLOCK], tl.int32)), tmp5, None) | |
def get_args(): | |
arg_0 = rand_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda:0', dtype=torch.float32) | |
arg_1 = rand_strided((624,), (1,), device='cuda:0', dtype=torch.float32) | |
arg_2 = rand_strided((624,), (1,), device='cuda:0', dtype=torch.float32) | |
arg_3 = rand_strided((128, 156, 14, 14), (30576, 196, 14, 1), device='cuda:0', dtype=torch.float16) | |
return arg_0, arg_1, arg_2, arg_3, | |
def call(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
stream0 = get_cuda_stream(0) | |
triton_poi_fused_convolution_backward_117.run(*args, 3913728, grid=grid(3913728), stream=stream0) | |
def benchmark_all_configs(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
return triton_poi_fused_convolution_backward_117.benchmark_all_configs(*args, 3913728, grid=grid(3913728)) | |
if __name__ == '__main__': | |
from torch._inductor.utils import get_num_bytes | |
from triton.testing import do_bench | |
args = get_args() | |
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9 | |
gb_per_s = num_gb / (ms / 1e3) | |
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
''') | |
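# NOTE (hand-written sketch, not generated code): kernel 116 above stores the
# batch-norm input gradient in fp32 *without* the final invstd*weight scale;
# this kernel and kernel 119 below each peel a 156-channel slice out of that
# 624-channel buffer (read offset 91728 = 468*196 here, 61152 = 312*196 below)
# and fold the per-channel scale in while downcasting to fp16, one slice per
# grouped-convolution backward. Eager sketch, assuming in_ptr1/in_ptr2 are
# invstd and weight:
def _slice_and_scale_sketch(grad, invstd, weight, lo, hi):
    # grad: (128, 624, 14, 14) fp32; invstd, weight: (624,) fp32
    s = (invstd[lo:hi] * weight[lo:hi]).reshape(1, hi - lo, 1, 1)
    return (grad[:, lo:hi] * s).to(torch.float16)
# kernel 117 corresponds to lo, hi = 468, 624; kernel 119 to lo, hi = 312, 468.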
# kernel path: /tmp/torchinductor_shunting/sn/csnxk2aehntwbtuxvtyjfxxx5pcnnxto57kxv3m57bgn74rxyzdo.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_605
triton_poi_fused__to_copy_118 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[16384], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_118(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 12636
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
    arg_0 = rand_strided((156, 1, 9, 9), (81, 81, 9, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((156, 1, 9, 9), (81, 81, 9, 1), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_118.run(*args, 12636, grid=grid(12636), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_118.benchmark_all_configs(*args, 12636, grid=grid(12636))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
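# Editor's note: the kernel above simply upcasts a (156, 1, 9, 9) fp16 tensor (a 9x9
# depthwise-conv weight, by its shape) to fp32. The back-to-back .to(tl.float32) casts
# are redundant but harmless codegen artifacts.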
# kernel path: /tmp/torchinductor_shunting/b2/cb2zxjnunwnxsrlbgr5rnqihgpyuvgs44v74hreykirhvteg3imr.py
# Original ATen: aten.convolution_backward
# aten.convolution_backward => convolution_backward_76
triton_poi_fused_convolution_backward_119 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp32', 2: '*fp32', 3: '*fp16', 4: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4), equal_to_1=())]})
@triton.jit
def triton_poi_fused_convolution_backward_119(in_ptr0, in_ptr1, in_ptr2, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 3913728
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x2 = (xindex // 30576)
    x3 = xindex % 30576
    x1 = (xindex // 196) % 156
    x4 = xindex
    tmp0 = tl.load(in_ptr0 + (61152 + x3 + (122304*x2)), None)
    tmp1 = tl.load(in_ptr1 + (312 + x1), None)
    tmp2 = tl.load(in_ptr2 + (312 + x1), None)
    tmp3 = tmp1 * tmp2
    tmp4 = tmp0 * tmp3
    tmp5 = tmp4.to(tl.float32)
    tl.store(out_ptr0 + (x4 + tl.zeros([XBLOCK], tl.int32)), tmp5, None)
def get_args():
    arg_0 = rand_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((624,), (1,), device='cuda:0', dtype=torch.float32)
    arg_2 = rand_strided((624,), (1,), device='cuda:0', dtype=torch.float32)
    arg_3 = rand_strided((128, 156, 14, 14), (30576, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1, arg_2, arg_3,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused_convolution_backward_119.run(*args, 3913728, grid=grid(3913728), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused_convolution_backward_119.benchmark_all_configs(*args, 3913728, grid=grid(3913728))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
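# Editor's note: same slice-and-scale pattern as triton_poi_fused_convolution_backward_117
# above (see the _ref_convolution_backward_117 sketch), here for channels 312:468 of the
# 624-channel gradient (61152 = 312 * 196).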
# kernel path: /tmp/torchinductor_shunting/mq/cmqf6tn342yiwkdctvgld7rocxgxgv3azqztra5gqfjonbr5lfkj.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_606
triton_poi_fused__to_copy_120 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[8192], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_120(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 7644
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
    arg_0 = rand_strided((156, 1, 7, 7), (49, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((156, 1, 7, 7), (49, 49, 7, 1), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_120.run(*args, 7644, grid=grid(7644), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_120.benchmark_all_configs(*args, 7644, grid=grid(7644))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
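# Editor's note: fp16 -> fp32 upcast of a (156, 1, 7, 7) tensor, analogous to
# triton_poi_fused__to_copy_118 above.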
# kernel path: /tmp/torchinductor_shunting/c6/cc6x33tgwrw2tamqam7mfhx5ek6rlk7b3nnl7psxxxxvdr23nguf.py
# Original ATen: aten.convolution_backward
# aten.convolution_backward => convolution_backward_77
triton_poi_fused_convolution_backward_121 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp32', 2: '*fp32', 3: '*fp16', 4: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4), equal_to_1=())]})
@triton.jit
def triton_poi_fused_convolution_backward_121(in_ptr0, in_ptr1, in_ptr2, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 3913728
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x2 = (xindex // 30576)
    x3 = xindex % 30576
    x1 = (xindex // 196) % 156
    x4 = xindex
    tmp0 = tl.load(in_ptr0 + (30576 + x3 + (122304*x2)), None)
    tmp1 = tl.load(in_ptr1 + (156 + x1), None)
    tmp2 = tl.load(in_ptr2 + (156 + x1), None)
    tmp3 = tmp1 * tmp2
    tmp4 = tmp0 * tmp3
    tmp5 = tmp4.to(tl.float32)
    tl.store(out_ptr0 + (x4 + tl.zeros([XBLOCK], tl.int32)), tmp5, None)
def get_args():
    arg_0 = rand_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((624,), (1,), device='cuda:0', dtype=torch.float32)
    arg_2 = rand_strided((624,), (1,), device='cuda:0', dtype=torch.float32)
    arg_3 = rand_strided((128, 156, 14, 14), (30576, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1, arg_2, arg_3,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused_convolution_backward_121.run(*args, 3913728, grid=grid(3913728), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused_convolution_backward_121.benchmark_all_configs(*args, 3913728, grid=grid(3913728))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
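# Editor's note: the slice-and-scale pattern again, for channels 156:312
# (30576 = 156 * 196).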
# kernel path: /tmp/torchinductor_shunting/bo/cbod67uuatalyrpqcdu3oy7sbucan6ecl2ia4zk35wtdojhhxonj.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_607
triton_poi_fused__to_copy_122 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4096], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_122(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 3900
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
    arg_0 = rand_strided((156, 1, 5, 5), (25, 25, 5, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((156, 1, 5, 5), (25, 25, 5, 1), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_122.run(*args, 3900, grid=grid(3900), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_122.benchmark_all_configs(*args, 3900, grid=grid(3900))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
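# Editor's note: fp16 -> fp32 upcast of a (156, 1, 5, 5) tensor.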
# kernel path: /tmp/torchinductor_shunting/rc/crclip5jefet3ymndwzega4qujbberu5gvtz4mjx4deqjbqd4sg3.py
# Original ATen: aten.convolution_backward
# aten.convolution_backward => convolution_backward_78
triton_poi_fused_convolution_backward_123 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp32', 2: '*fp32', 3: '*fp16', 4: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4), equal_to_1=())]})
@triton.jit
def triton_poi_fused_convolution_backward_123(in_ptr0, in_ptr1, in_ptr2, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 3913728
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x2 = (xindex // 30576)
    x3 = xindex % 30576
    x1 = (xindex // 196) % 156
    x4 = xindex
    tmp0 = tl.load(in_ptr0 + (x3 + (122304*x2)), None)
    tmp1 = tl.load(in_ptr1 + (x1), None)
    tmp2 = tl.load(in_ptr2 + (x1), None)
    tmp3 = tmp1 * tmp2
    tmp4 = tmp0 * tmp3
    tmp5 = tmp4.to(tl.float32)
    tl.store(out_ptr0 + (x4 + tl.zeros([XBLOCK], tl.int32)), tmp5, None)
def get_args():
    arg_0 = rand_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((624,), (1,), device='cuda:0', dtype=torch.float32)
    arg_2 = rand_strided((624,), (1,), device='cuda:0', dtype=torch.float32)
    arg_3 = rand_strided((128, 156, 14, 14), (30576, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1, arg_2, arg_3,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused_convolution_backward_123.run(*args, 3913728, grid=grid(3913728), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused_convolution_backward_123.benchmark_all_configs(*args, 3913728, grid=grid(3913728))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
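# Editor's note: the final slice of the series, channels 0:156 (offset 0).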
# kernel path: /tmp/torchinductor_shunting/eu/ceukfvbcwohfoz2lavfjfb2ycp454er54mrt4pq2hnohxkqujrr3.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_608
triton_poi_fused__to_copy_124 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[2048], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_124(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 1404
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
    arg_0 = rand_strided((156, 1, 3, 3), (9, 9, 3, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((156, 1, 3, 3), (9, 9, 3, 1), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_124.run(*args, 1404, grid=grid(1404), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_124.benchmark_all_configs(*args, 1404, grid=grid(1404))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
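# Editor's note (speculative): fp16 -> fp32 upcast of a (156, 1, 3, 3) tensor. Taken
# together, kernels 117-124 split a 624-channel gradient into four 156-channel groups
# paired with 9x9, 7x7, 5x5 and 3x3 depthwise weights, which is consistent with the
# backward pass of a MixConv-style block (mixed depthwise kernel sizes), though the
# model is not named anywhere in this dump.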
# kernel path: /tmp/torchinductor_shunting/mj/cmjohiwxqtsa56w5lbbo3aubkq7zy3joom3fttzn7hijt3v2ims3.py
# Original ATen: aten.cat
# aten.cat => cat_58
triton_poi_fused_cat_125 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_cat_125(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 3913728
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x2 = xindex
    x0 = xindex % 30576
    x1 = (xindex // 30576)
    tmp0 = tl.load(in_ptr0 + (x2), None).to(tl.float32)
    tl.store(out_ptr0 + (x0 + (122304*x1) + tl.zeros([XBLOCK], tl.int32)), tmp0, None)
def get_args():
    arg_0 = rand_strided((128, 156, 14, 14), (30576, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 156, 14, 14), (122304, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused_cat_125.run(*args, 3913728, grid=grid(3913728), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused_cat_125.benchmark_all_configs(*args, 3913728, grid=grid(3913728))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
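# Editor's note (hedged): the output of the kernel above is declared as a
# (128, 156, 14, 14) fp16 tensor with batch stride 122304 = 624 * 196, i.e. a
# 156-channel view into a (128, 624, 14, 14) buffer, so this kernel materializes one
# branch of a channel-dimension torch.cat. A sketch with illustrative names:
def _ref_cat_125(chunk, out_buffer):
    # chunk: (128, 156, 14, 14) contiguous fp16; out_buffer: (128, 624, 14, 14) fp16.
    # This kernel writes the chunk at channel offset 0; sibling kernels (not all shown
    # here) presumably fill the remaining channel ranges.
    out_buffer[:, :156].copy_(chunk)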
# kernel path: /tmp/torchinductor_shunting/rn/crn2e5wpsvghyj2fewhvokcjgyf5y4tcls4p5a3zgbxt3jo63qp5.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_612
triton_poi_fused__to_copy_126 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[16384], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_126(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 16224
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
    arg_0 = rand_strided((312, 52, 1, 1), (52, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((312, 52, 1, 1), (52, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_126.run(*args, 16224, grid=grid(16224), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_126.benchmark_all_configs(*args, 16224, grid=grid(16224))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
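# Editor's note: fp16 -> fp32 upcast of a (312, 52, 1, 1) tensor, by its shape a 1x1
# (pointwise) convolution weight.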
# kernel path: /tmp/torchinductor_shunting/if/cifaxtccprphcf3f7mkgsgusdq7va54imzb4ozr5t5rqz26r4xz3.py
# Original ATen: aten.cat
# aten.cat => cat_59
triton_poi_fused_cat_127 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[2097152], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_cat_127(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 1304576
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x2 = xindex
    x0 = xindex % 10192
    x1 = (xindex // 10192)
    tmp0 = tl.load(in_ptr0 + (x2), None).to(tl.float32)
    tl.store(out_ptr0 + (x0 + (20384*x1) + tl.zeros([XBLOCK], tl.int32)), tmp0, None)
def get_args():
    arg_0 = rand_strided((128, 52, 14, 14), (10192, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 52, 14, 14), (20384, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused_cat_127.run(*args, 1304576, grid=grid(1304576), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused_cat_127.benchmark_all_configs(*args, 1304576, grid=grid(1304576))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
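# Editor's note: same cat pattern as triton_poi_fused_cat_125 above, writing a
# 52-channel chunk into a 104-channel buffer (10192 = 52 * 196, 20384 = 104 * 196).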
# kernel path: /tmp/torchinductor_shunting/qf/cqfc2p7t5thx2bnsyx6ihjsqv65lizzzkgfndnkunfne6ahlxrbx.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.add, aten.native_batch_norm_backward
# aten._native_batch_norm_legit_functional => convert_element_type_189
# aten.add => add_346
# aten.native_batch_norm_backward => convert_element_type_614, mul_839, sub_206, sum_85, sum_86
triton_red_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_128 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@reduction(
    size_hints=[512, 8192],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp32', 4: '*fp32', 5: '*fp32', 6: 'i32', 7: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_128(in_ptr0, in_ptr1, in_ptr2, in_ptr3, out_ptr0, out_ptr1, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
    xnumel = 416
    rnumel = 6272
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rbase = tl.arange(0, RBLOCK)[None, :]
    x0 = xindex % 104
    x1 = (xindex // 104)
    _tmp4 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
    x3 = xindex
    tmp7 = tl.load(in_ptr3 + (x0), xmask)
    _tmp10 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
    for roffset in range(0, rnumel, RBLOCK):
        rindex = roffset + rbase
        rmask = rindex < rnumel
        r2 = rindex
        tmp0 = tl.load(in_ptr0 + ((196*x0) + (20384*(r2 // 196)) + (652288*x1) + (r2 % 196)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp1 = tl.load(in_ptr1 + ((196*x0) + (20384*(r2 // 196)) + (652288*x1) + (r2 % 196)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp5 = tl.load(in_ptr2 + ((196*x0) + (20384*(r2 // 196)) + (652288*x1) + (r2 % 196)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp2 = tmp0 + tmp1
        tmp3 = tmp2.to(tl.float32)
        _tmp4 = tl.where(rmask & xmask, _tmp4 + tmp3, _tmp4)
        tmp6 = tmp5.to(tl.float32)
        tmp8 = tmp6 - tmp7
        tmp9 = tmp3 * tmp8
        _tmp10 = tl.where(rmask & xmask, _tmp10 + tmp9, _tmp10)
    tmp4 = tl.sum(_tmp4, 1)[:, None]
    tl.store(out_ptr0 + x3, tmp4, xmask)
    tmp10 = tl.sum(_tmp10, 1)[:, None]
    tl.store(out_ptr1 + x3, tmp10, xmask)
def get_args():
    arg_0 = rand_strided((128, 104, 14, 14), (20384, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 104, 14, 14), (20384, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_2 = rand_strided((128, 104, 14, 14), (20384, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_3 = rand_strided((1, 104, 1, 1), (104, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    arg_4 = rand_strided((104, 4), (1, 104), device='cuda:0', dtype=torch.float32)
    arg_5 = rand_strided((104, 4), (1, 104), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_red_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_128.run(*args, 416, 6272, grid=grid(416), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_red_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_128.benchmark_all_configs(*args, 416, 6272, grid=grid(416))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
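# Editor's note (hedged): the reduction above computes the two per-channel sums needed
# by batch-norm backward, with the batch axis split into 4 chunks of 32 images
# (xnumel = 416 = 104 channels * 4 splits, rnumel = 6272 = 32 * 196, 652288 = 32 * 20384).
# A PyTorch sketch; argument names are inferred from the arithmetic, not from the source.
def _ref_bn_backward_sums_128(g0, g1, x, mean):
    # g0, g1: (128, 104, 14, 14) fp16 gradient branches; x: same shape; mean: (1, 104, 1, 1).
    grad = (g0 + g1).float().view(4, 32, 104, 196)          # residual add, then split batch
    xc = (x.float() - mean.view(1, 104, 1, 1)).view(4, 32, 104, 196)
    sum1 = grad.sum(dim=(1, 3)).t().contiguous()            # (104, 4): partial sum(grad)
    sum2 = (grad * xc).sum(dim=(1, 3)).t().contiguous()     # (104, 4): partial sum(grad * (x - mean))
    return sum1, sum2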
# kernel path: /tmp/torchinductor_shunting/55/c55duaucwd5mqi2jcpis5psliz6udgc3zunl4l6gsysumyukuodb.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.add, aten.native_batch_norm_backward
# aten._native_batch_norm_legit_functional => convert_element_type_189
# aten.add => add_346
# aten.native_batch_norm_backward => convert_element_type_614, convert_element_type_616, mul_845, mul_846, sub_206, sub_208, sub_209
triton_poi_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_129 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp32', 4: '*fp32', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: '*fp16', 9: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9), equal_to_1=())]})
@triton.jit
def triton_poi_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_129(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, in_ptr6, in_ptr7, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 2609152
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x3 = xindex
    x1 = (xindex // 196) % 104
    tmp0 = tl.load(in_ptr0 + (x3), None).to(tl.float32)
    tmp1 = tl.load(in_ptr1 + (x3), None).to(tl.float32)
    tmp4 = tl.load(in_ptr2 + (x3), None).to(tl.float32)
    tmp6 = tl.load(in_ptr3 + (x1), None)
    tmp8 = tl.load(in_ptr4 + (x1), None)
    tmp11 = tl.load(in_ptr5 + (x1), None)
    tmp16 = tl.load(in_ptr6 + (x1), None)
    tmp19 = tl.load(in_ptr7 + (x1), None)
    tmp2 = tmp0 + tmp1
    tmp3 = tmp2.to(tl.float32)
    tmp5 = tmp4.to(tl.float32)
    tmp7 = tmp5 - tmp6
    tmp9 = 3.985969387755102e-05
    tmp10 = tmp8 * tmp9
    tmp12 = tmp11 * tmp11
    tmp13 = tmp10 * tmp12
    tmp14 = tmp7 * tmp13
    tmp15 = tmp3 - tmp14
    tmp17 = tmp16 * tmp9
    tmp18 = tmp15 - tmp17
    tmp20 = tmp11 * tmp19
    tmp21 = tmp18 * tmp20
    tmp22 = tmp21.to(tl.float32)
    tl.store(out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp22, None)
def get_args():
    arg_0 = rand_strided((128, 104, 14, 14), (20384, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 104, 14, 14), (20384, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_2 = rand_strided((128, 104, 14, 14), (20384, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_3 = rand_strided((1, 104, 1, 1), (104, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    arg_4 = rand_strided((104,), (1,), device='cuda:0', dtype=torch.float32)
    arg_5 = rand_strided((104,), (1,), device='cuda:0', dtype=torch.float32)
    arg_6 = rand_strided((104,), (1,), device='cuda:0', dtype=torch.float32)
    arg_7 = rand_strided((104,), (1,), device='cuda:0', dtype=torch.float32)
    arg_8 = rand_strided((128, 104, 14, 14), (20384, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7, arg_8,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_129.run(*args, 2609152, grid=grid(2609152), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_129.benchmark_all_configs(*args, 2609152, grid=grid(2609152))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
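# Editor's note (hedged): the pointwise kernel above is the apply step of batch-norm
# backward. The constant 3.985969387755102e-05 is 1/25088 = 1/(N*H*W) with N=128 and
# H=W=14. A sketch with names inferred from the arithmetic (sum1/sum2 are the totals
# from the reduction above, invstd the saved inverse std, weight the BN gamma):
def _ref_bn_backward_apply_129(g0, g1, x, mean, sum2, invstd, sum1, weight):
    inv_nhw = 1.0 / 25088
    grad = (g0 + g1).float()
    xc = x.float() - mean.view(1, 104, 1, 1)
    t = grad - xc * (sum2 * inv_nhw * invstd * invstd).view(1, 104, 1, 1)
    t = t - (sum1 * inv_nhw).view(1, 104, 1, 1)
    return (t * (invstd * weight).view(1, 104, 1, 1)).to(torch.float16)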
# kernel path: /tmp/torchinductor_shunting/gi/cgidg4l74j3mxppzfsjtpi42hq4ihs47f6epftki5i5ywvfqu6wv.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.add, aten.native_batch_norm_backward
# aten._native_batch_norm_legit_functional => convert_element_type_165
# aten.add => add_346, add_351
# aten.native_batch_norm_backward => convert_element_type_638, mul_879, sub_222, sum_94, sum_95
triton_red_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_130 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@reduction(
    size_hints=[512, 8192],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp16', 4: '*fp32', 5: '*fp32', 6: '*fp32', 7: 'i32', 8: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_130(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, out_ptr0, out_ptr1, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
    xnumel = 416
    rnumel = 6272
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rbase = tl.arange(0, RBLOCK)[None, :]
    x0 = xindex % 104
    x1 = (xindex // 104)
    _tmp6 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
    x3 = xindex
    tmp9 = tl.load(in_ptr4 + (x0), xmask)
    _tmp12 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
    for roffset in range(0, rnumel, RBLOCK):
        rindex = roffset + rbase
        rmask = rindex < rnumel
        r2 = rindex
        tmp0 = tl.load(in_ptr0 + ((196*x0) + (20384*(r2 // 196)) + (652288*x1) + (r2 % 196)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp1 = tl.load(in_ptr1 + ((196*x0) + (20384*(r2 // 196)) + (652288*x1) + (r2 % 196)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp3 = tl.load(in_ptr2 + ((196*x0) + (20384*(r2 // 196)) + (652288*x1) + (r2 % 196)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp7 = tl.load(in_ptr3 + ((196*x0) + (20384*(r2 // 196)) + (652288*x1) + (r2 % 196)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp2 = tmp0 + tmp1
        tmp4 = tmp2 + tmp3
        tmp5 = tmp4.to(tl.float32)
        _tmp6 = tl.where(rmask & xmask, _tmp6 + tmp5, _tmp6)
        tmp8 = tmp7.to(tl.float32)
        tmp10 = tmp8 - tmp9
        tmp11 = tmp5 * tmp10
        _tmp12 = tl.where(rmask & xmask, _tmp12 + tmp11, _tmp12)
    tmp6 = tl.sum(_tmp6, 1)[:, None]
    tl.store(out_ptr0 + x3, tmp6, xmask)
    tmp12 = tl.sum(_tmp12, 1)[:, None]
    tl.store(out_ptr1 + x3, tmp12, xmask)
def get_args():
    arg_0 = rand_strided((128, 104, 14, 14), (20384, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 104, 14, 14), (20384, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_2 = rand_strided((128, 104, 14, 14), (20384, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_3 = rand_strided((128, 104, 14, 14), (20384, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_4 = rand_strided((1, 104, 1, 1), (104, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    arg_5 = rand_strided((104, 4), (1, 104), device='cuda:0', dtype=torch.float32)
    arg_6 = rand_strided((104, 4), (1, 104), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_red_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_130.run(*args, 416, 6272, grid=grid(416), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_red_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_130.benchmark_all_configs(*args, 416, 6272, grid=grid(416))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
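# Editor's note: same two batch-norm-backward sums as in ..._128 above, except the
# incoming gradient is the sum of three branches (the add_346/add_351 residual adds).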
# kernel path: /tmp/torchinductor_shunting/nf/cnfybinrqgwpzwkognoekbtqzfldbno4q6jduhaywjxls5lwqbsd.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.add, aten.native_batch_norm_backward
# aten._native_batch_norm_legit_functional => convert_element_type_165
# aten.add => add_346, add_351
# aten.native_batch_norm_backward => convert_element_type_638, mul_885, mul_886, sub_222, sub_224, sub_225
triton_poi_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_131 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp16', 4: '*fp32', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: '*fp32', 9: '*fp32', 10: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10), equal_to_1=())]})
@triton.jit
def triton_poi_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_131(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, in_ptr6, in_ptr7, in_ptr8, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 2609152
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x3 = xindex
    x1 = (xindex // 196) % 104
    tmp0 = tl.load(in_ptr0 + (x3), None).to(tl.float32)
    tmp1 = tl.load(in_ptr1 + (x3), None).to(tl.float32)
    tmp3 = tl.load(in_ptr2 + (x3), None).to(tl.float32)
    tmp6 = tl.load(in_ptr3 + (x3), None).to(tl.float32)
    tmp8 = tl.load(in_ptr4 + (x1), None)
    tmp10 = tl.load(in_ptr5 + (x1), None)
    tmp13 = tl.load(in_ptr6 + (x1), None)
    tmp18 = tl.load(in_ptr7 + (x1), None)
    tmp21 = tl.load(in_ptr8 + (x1), None)
    tmp2 = tmp0 + tmp1
    tmp4 = tmp2 + tmp3
    tmp5 = tmp4.to(tl.float32)
    tmp7 = tmp6.to(tl.float32)
    tmp9 = tmp7 - tmp8
    tmp11 = 3.985969387755102e-05
    tmp12 = tmp10 * tmp11
    tmp14 = tmp13 * tmp13
    tmp15 = tmp12 * tmp14
    tmp16 = tmp9 * tmp15
    tmp17 = tmp5 - tmp16
    tmp19 = tmp18 * tmp11
    tmp20 = tmp17 - tmp19
    tmp22 = tmp13 * tmp21
    tmp23 = tmp20 * tmp22
    tl.store(out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp23, None)
def get_args():
    arg_0 = rand_strided((128, 104, 14, 14), (20384, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 104, 14, 14), (20384, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_2 = rand_strided((128, 104, 14, 14), (20384, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_3 = rand_strided((128, 104, 14, 14), (20384, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_4 = rand_strided((1, 104, 1, 1), (104, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    arg_5 = rand_strided((104,), (1,), device='cuda:0', dtype=torch.float32)
    arg_6 = rand_strided((104,), (1,), device='cuda:0', dtype=torch.float32)
    arg_7 = rand_strided((104,), (1,), device='cuda:0', dtype=torch.float32)
    arg_8 = rand_strided((104,), (1,), device='cuda:0', dtype=torch.float32)
    arg_9 = rand_strided((128, 104, 14, 14), (20384, 196, 14, 1), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7, arg_8, arg_9,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_131.run(*args, 2609152, grid=grid(2609152), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_131.benchmark_all_configs(*args, 2609152, grid=grid(2609152))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
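# Editor's note: the matching apply step for the three-branch gradient; unlike ..._129
# it leaves the result in fp32 (no trailing cast before the store).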
# kernel path: /tmp/torchinductor_shunting/e6/ce6moc5iqgt57e7eqjuf5zhxhh5tsl3aixs4uw2xn3bhmzxi4yko.py
# Original ATen: aten.convolution_backward
# aten.convolution_backward => convolution_backward_91
triton_poi_fused_convolution_backward_132 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[2097152], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_convolution_backward_132(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 1304576
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex % 10192
    x1 = (xindex // 10192)
    x2 = xindex
    tmp0 = tl.load(in_ptr0 + (10192 + x0 + (20384*x1)), None)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp1, None)
def get_args():
    arg_0 = rand_strided((128, 104, 14, 14), (20384, 196, 14, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((128, 52, 14, 14), (10192, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused_convolution_backward_132.run(*args, 1304576, grid=grid(1304576), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused_convolution_backward_132.benchmark_all_configs(*args, 1304576, grid=grid(1304576))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
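# Editor's note (hedged): a one-line sketch of the slice above, which feeds one half of
# a split convolution backward (10192 = 52 * 196); the name is illustrative.
def _ref_convolution_backward_132(grad):
    # grad: (128, 104, 14, 14) fp32 -> channels 52:104 as contiguous fp16.
    return grad[:, 52:104].contiguous().to(torch.float16)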
# kernel path: /tmp/torchinductor_shunting/dk/cdkheqxbj4lk4wbn4mfiw2m45ufmxv76zkupgnoqujmqz7cf3flk.py
# Original ATen: aten.convolution_backward
# aten.convolution_backward => convolution_backward_92
triton_poi_fused_convolution_backward_133 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[2097152], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_convolution_backward_133(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 1304576
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex % 10192
    x1 = (xindex // 10192)
    x2 = xindex
    tmp0 = tl.load(in_ptr0 + (x0 + (20384*x1)), None)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp1, None)
def get_args():
    arg_0 = rand_strided((128, 104, 14, 14), (20384, 196, 14, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((128, 52, 14, 14), (10192, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused_convolution_backward_133.run(*args, 1304576, grid=grid(1304576), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused_convolution_backward_133.benchmark_all_configs(*args, 1304576, grid=grid(1304576))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
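# Editor's note: the companion slice, channels 0:52 of the same gradient.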
# kernel path: /tmp/torchinductor_shunting/w3/cw3zbb3x7kiu4ifygfhy3eaz6tyephtwgkxcx7rkmsrsvtrbrcix.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.add, aten.native_batch_norm_backward
# aten._native_batch_norm_legit_functional => convert_element_type_141
# aten.add => add_346, add_351, add_356
# aten.native_batch_norm_backward => convert_element_type_662, mul_919, sub_238, sum_103, sum_104
triton_red_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_134 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@reduction(
    size_hints=[512, 8192],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp16', 4: '*fp16', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: 'i32', 9: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_134(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, out_ptr0, out_ptr1, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
    xnumel = 416
    rnumel = 6272
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rbase = tl.arange(0, RBLOCK)[None, :]
    x0 = xindex % 104
    x1 = (xindex // 104)
    _tmp8 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
    x3 = xindex
    tmp11 = tl.load(in_ptr5 + (x0), xmask)
    _tmp14 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
    for roffset in range(0, rnumel, RBLOCK):
        rindex = roffset + rbase
        rmask = rindex < rnumel
        r2 = rindex
        tmp0 = tl.load(in_ptr0 + ((196*x0) + (20384*(r2 // 196)) + (652288*x1) + (r2 % 196)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp1 = tl.load(in_ptr1 + ((196*x0) + (20384*(r2 // 196)) + (652288*x1) + (r2 % 196)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp3 = tl.load(in_ptr2 + ((196*x0) + (20384*(r2 // 196)) + (652288*x1) + (r2 % 196)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp5 = tl.load(in_ptr3 + ((196*x0) + (20384*(r2 // 196)) + (652288*x1) + (r2 % 196)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp9 = tl.load(in_ptr4 + ((196*x0) + (20384*(r2 // 196)) + (652288*x1) + (r2 % 196)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp2 = tmp0 + tmp1
        tmp4 = tmp2 + tmp3
        tmp6 = tmp4 + tmp5
        tmp7 = tmp6.to(tl.float32)
        _tmp8 = tl.where(rmask & xmask, _tmp8 + tmp7, _tmp8)
        tmp10 = tmp9.to(tl.float32)
        tmp12 = tmp10 - tmp11
        tmp13 = tmp7 * tmp12
        _tmp14 = tl.where(rmask & xmask, _tmp14 + tmp13, _tmp14)
    tmp8 = tl.sum(_tmp8, 1)[:, None]
    tl.store(out_ptr0 + x3, tmp8, xmask)
    tmp14 = tl.sum(_tmp14, 1)[:, None]
    tl.store(out_ptr1 + x3, tmp14, xmask)
def get_args():
    arg_0 = rand_strided((128, 104, 14, 14), (20384, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 104, 14, 14), (20384, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_2 = rand_strided((128, 104, 14, 14), (20384, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_3 = rand_strided((128, 104, 14, 14), (20384, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_4 = rand_strided((128, 104, 14, 14), (20384, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_5 = rand_strided((1, 104, 1, 1), (104, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    arg_6 = rand_strided((104, 4), (1, 104), device='cuda:0', dtype=torch.float32)
    arg_7 = rand_strided((104, 4), (1, 104), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_red_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_134.run(*args, 416, 6272, grid=grid(416), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_red_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_134.benchmark_all_configs(*args, 416, 6272, grid=grid(416))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
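# Editor's note: the batch-norm-backward reduction once more, now with a four-branch
# summed gradient (add_346/add_351/add_356).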
# kernel path: /tmp/torchinductor_shunting/ej/cejoyk2b6v7darhrxgzp23ovzotirkownmjt7wscn7uwlctg5oxk.py | |
# Original ATen: aten._native_batch_norm_legit_functional, aten.add, aten.convolution_backward, aten.native_batch_norm_backward | |
# aten._native_batch_norm_legit_functional => convert_element_type_141 | |
# aten.add => add_346, add_351, add_356 | |
# aten.convolution_backward => convolution_backward_101 | |
# aten.native_batch_norm_backward => convert_element_type_662, convert_element_type_664, mul_925, mul_926, sub_238, sub_240, sub_241 | |
triton_poi_fused__native_batch_norm_legit_functional_add_convolution_backward_native_batch_norm_backward_135 = async_compile.triton(''' | |
import triton | |
import triton.language as tl | |
from torch._inductor.ir import ReductionHint | |
from torch._inductor.ir import TileHint | |
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp16', 4: '*fp16', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: '*fp32', 9: '*fp32', 10: '*fp16', 11: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11), equal_to_1=())]})
@triton.jit
def triton_poi_fused__native_batch_norm_legit_functional_add_convolution_backward_native_batch_norm_backward_135(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, in_ptr6, in_ptr7, in_ptr8, in_ptr9, out_ptr1, xnumel, XBLOCK : tl.constexpr):
    xnumel = 2609152
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x3 = xindex
    x1 = (xindex // 196) % 104
    tmp0 = tl.load(in_ptr0 + (x3), None).to(tl.float32)
    tmp1 = tl.load(in_ptr1 + (x3), None).to(tl.float32)
    tmp3 = tl.load(in_ptr2 + (x3), None).to(tl.float32)
    tmp5 = tl.load(in_ptr3 + (x3), None).to(tl.float32)
    tmp8 = tl.load(in_ptr4 + (x3), None).to(tl.float32)
    tmp10 = tl.load(in_ptr5 + (x1), None)
    tmp12 = tl.load(in_ptr6 + (x1), None)
    tmp15 = tl.load(in_ptr7 + (x1), None)
    tmp20 = tl.load(in_ptr8 + (x1), None)
    tmp23 = tl.load(in_ptr9 + (x1), None)
    tmp2 = tmp0 + tmp1
    tmp4 = tmp2 + tmp3
    tmp6 = tmp4 + tmp5
    tmp7 = tmp6.to(tl.float32)
    tmp9 = tmp8.to(tl.float32)
    tmp11 = tmp9 - tmp10
    tmp13 = 3.985969387755102e-05
    tmp14 = tmp12 * tmp13
    tmp16 = tmp15 * tmp15
    tmp17 = tmp14 * tmp16
    tmp18 = tmp11 * tmp17
    tmp19 = tmp7 - tmp18
    tmp21 = tmp20 * tmp13
    tmp22 = tmp19 - tmp21
    tmp24 = tmp15 * tmp23
    tmp25 = tmp22 * tmp24
    tmp26 = tmp25.to(tl.float32)
    tl.store(out_ptr1 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp26, None)
def get_args():
    arg_0 = rand_strided((128, 104, 14, 14), (20384, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 104, 14, 14), (20384, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_2 = rand_strided((128, 104, 14, 14), (20384, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_3 = rand_strided((128, 104, 14, 14), (20384, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_4 = rand_strided((128, 104, 14, 14), (20384, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_5 = rand_strided((1, 104, 1, 1), (104, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    arg_6 = rand_strided((104,), (1,), device='cuda:0', dtype=torch.float32)
    arg_7 = rand_strided((104,), (1,), device='cuda:0', dtype=torch.float32)
    arg_8 = rand_strided((104,), (1,), device='cuda:0', dtype=torch.float32)
    arg_9 = rand_strided((104,), (1,), device='cuda:0', dtype=torch.float32)
    arg_10 = rand_strided((128, 104, 14, 14), (20384, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7, arg_8, arg_9, arg_10,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__native_batch_norm_legit_functional_add_convolution_backward_native_batch_norm_backward_135.run(*args, 2609152, grid=grid(2609152), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__native_batch_norm_legit_functional_add_convolution_backward_native_batch_norm_backward_135.benchmark_all_configs(*args, 2609152, grid=grid(2609152))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
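# Annotation (ours, not part of the Inductor output): kernel 135 above first
# sums four incoming gradient branches (the residual adds), then applies the
# standard batch-norm input-gradient formula
#     dx = (dy - sum_dy/n - (x - mean) * sum_dy_xmu/n * invstd^2) * invstd * gamma
# with n = 128*14*14 = 25088, so the literal 3.985969387755102e-05 is 1/n.
# A minimal eager-mode sketch under those assumptions; the helper name is ours
# and it relies on the module-level `import torch` of this file.
def _ref_bn_bwd_input_135(dy, x, mean, sum_dy_xmu, invstd, sum_dy, gamma):
    # dy: the already-summed upstream gradient, (N, C, H, W) fp16; x: (N, C, H, W)
    # fp16; the per-channel stats are fp32 of shape (C,) or (1, C, 1, 1).
    n = dy.shape[0] * dy.shape[2] * dy.shape[3]
    ch = lambda t: t.reshape(1, -1, 1, 1)  # broadcast per-channel stats over NCHW
    proj = (x.float() - ch(mean)) * ch(sum_dy_xmu) / n * ch(invstd) ** 2
    dx = (dy.float() - proj - ch(sum_dy) / n) * ch(invstd * gamma)
    return dx.half()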
# kernel path: /tmp/torchinductor_shunting/5h/c5hpdahtkfn4l3dhqtwm7jlr542h6muhvdgkqrxb44tec4smg4fm.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_665
triton_poi_fused__to_copy_136 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[65536], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_136(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 34944
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
    arg_0 = rand_strided((104, 336, 1, 1), (336, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((104, 336, 1, 1), (336, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_136.run(*args, 34944, grid=grid(34944), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_136.benchmark_all_configs(*args, 34944, grid=grid(34944))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
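# Annotation (ours): kernel 136, like its clones below (139, 142, 146, 148,
# 150 and 154), is a plain fp16 -> fp32 cast of a convolution weight, i.e.
# aten._to_copy with dtype=torch.float32. The whole kernel collapses to one
# eager op; a sketch for any contiguous tensor, helper name ours:
def _ref_to_copy_fp32(w_fp16):
    # Mirrors the load / cast / store sequence above.
    return w_fp16.to(torch.float32)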
# kernel path: /tmp/torchinductor_shunting/bc/cbch6pezjexqee4mvwdnar33u6mvs47umwhwas6d2blv7733qelg.py
# Original ATen: aten.mul, aten.sigmoid, aten.sigmoid_backward, aten.silu, aten.sum
# aten.mul => mul_928
# aten.sigmoid => sigmoid_19
# aten.sigmoid_backward => convert_element_type_666, convert_element_type_667, convert_element_type_668, mul_930, mul_931, sub_242
# aten.silu => convert_element_type_132, convert_element_type_133, mul_178, sigmoid_17
# aten.sum => sum_105
triton_per_fused_mul_sigmoid_sigmoid_backward_silu_sum_137 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import persistent_reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@persistent_reduction(
    size_hints=[65536, 256],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp16', 4: 'i32', 5: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4), equal_to_1=())]}
)
@triton.jit
def triton_per_fused_mul_sigmoid_sigmoid_backward_silu_sum_137(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, xnumel, rnumel, XBLOCK : tl.constexpr):
    xnumel = 43008
    rnumel = 196
    RBLOCK: tl.constexpr = 256
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rindex = tl.arange(0, RBLOCK)[None, :]
    rmask = rindex < rnumel
    r1 = rindex
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (r1 + (196*x0)), rmask, other=0).to(tl.float32)
    tmp1 = tl.load(in_ptr1 + (r1 + (196*x0)), rmask, other=0).to(tl.float32)
    tmp11 = tl.load(in_ptr2 + (x0), None).to(tl.float32)
    tmp2 = tmp1.to(tl.float32)
    tmp3 = tl.sigmoid(tmp2)
    tmp4 = tmp2 * tmp3
    tmp5 = tmp4.to(tl.float32)
    tmp6 = tmp0 * tmp5
    tmp8 = tl.where(rmask, tmp6, 0)
    tmp9 = tl.sum(tmp8, 1)[:, None]
    tmp10 = tmp9.to(tl.float32)
    tmp12 = tl.sigmoid(tmp11)
    tmp13 = tmp12.to(tl.float32)
    tmp14 = 1.0
    tmp15 = tmp14 - tmp13
    tmp16 = tmp13 * tmp15
    tmp17 = tmp10 * tmp16
    tmp18 = tmp17.to(tl.float32)
    tl.store(in_out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp18, None)
def get_args():
    arg_0 = rand_strided((128, 336, 1, 1), (336, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 336, 14, 14), (65856, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_2 = rand_strided((128, 336, 14, 14), (65856, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_3 = rand_strided((128, 336, 1, 1), (336, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1, arg_2, arg_3,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_per_fused_mul_sigmoid_sigmoid_backward_silu_sum_137.run(*args, 43008, 196, grid=grid(43008), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_per_fused_mul_sigmoid_sigmoid_backward_silu_sum_137.benchmark_all_configs(*args, 43008, 196, grid=grid(43008))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
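# Annotation (ours): kernel 137 is the squeeze-excite gate backward. With
# y = silu(x) * sigmoid(g), g broadcast per (n, c), the gradient w.r.t. g is
#     dL/dg = sum_hw(dy * silu(x)) * sigmoid(g) * (1 - sigmoid(g)),
# which is what the per-row reduction plus the sigmoid_backward tail compute
# (fp16/fp32 round-trips elided). A hedged eager sketch, names ours:
def _ref_se_gate_bwd_137(dy, x_pre, gate_logits):
    # dy, x_pre: (N, C, H, W) fp16; gate_logits: (N, C, 1, 1) fp16.
    g_out = (dy * (x_pre * torch.sigmoid(x_pre))).sum(dim=(2, 3), keepdim=True)
    s = torch.sigmoid(gate_logits)
    return g_out * s * (1.0 - s)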
# kernel path: /tmp/torchinductor_shunting/lw/clwggjp2glb6lt7ovt2lc3pzpsocchmesscnc7d3a6zsnanmt43m.py
# Original ATen: aten._to_copy, aten.convolution_backward
# aten._to_copy => convert_element_type_670
# aten.convolution_backward => sum_106
triton_per_fused__to_copy_convolution_backward_138 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import persistent_reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@persistent_reduction(
    size_hints=[512, 128],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32', 3: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3), equal_to_1=())]}
)
@triton.jit
def triton_per_fused__to_copy_convolution_backward_138(in_ptr0, out_ptr1, xnumel, rnumel, XBLOCK : tl.constexpr):
    xnumel = 336
    rnumel = 128
    RBLOCK: tl.constexpr = 128
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rindex = tl.arange(0, RBLOCK)[None, :]
    rmask = rindex < rnumel
    r1 = rindex
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0 + (336*r1)), rmask & xmask, other=0).to(tl.float32)
    tmp2 = tl.where(rmask & xmask, tmp0, 0)
    tmp3 = tl.sum(tmp2, 1)[:, None]
    tmp4 = tmp3.to(tl.float32)
    tl.store(out_ptr1 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp4, xmask)
def get_args():
    arg_0 = rand_strided((128, 336, 1, 1), (336, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_per_fused__to_copy_convolution_backward_138.run(*args, 336, 128, grid=grid(336), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_per_fused__to_copy_convolution_backward_138.benchmark_all_configs(*args, 336, 128, grid=grid(336))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
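# Annotation (ours): kernel 138 (and its 14-channel twin, kernel 141) is the
# convolution bias gradient: a per-channel sum over the batch of the
# (N, C, 1, 1) upstream gradient, accumulated and stored in fp32. Sketch:
def _ref_conv_bias_grad(dy):
    # dy: (N, C, 1, 1) fp16 -> (C,) fp32, matching sum_106 / sum_107.
    return dy.float().sum(dim=(0, 2, 3))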
# kernel path: /tmp/torchinductor_shunting/ny/cnyioznughbctzhfqmivqbazouomcdvadukxtk2rxq6q22lk25nc.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_669
triton_poi_fused__to_copy_139 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[8192], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_139(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 4704
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
    arg_0 = rand_strided((336, 14, 1, 1), (14, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((336, 14, 1, 1), (14, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_139.run(*args, 4704, grid=grid(4704), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_139.benchmark_all_configs(*args, 4704, grid=grid(4704))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/xy/cxy6onpt27zwbftqiiz3rmsdtwpb6ifqj55hnehr2kljkwmm2una.py
# Original ATen: aten.add, aten.clone, aten.fill, aten.mul, aten.sigmoid, aten.sub
# aten.add => add_357
# aten.clone => clone_14
# aten.fill => full_like_33
# aten.mul => mul_932, mul_933, mul_934
# aten.sigmoid => sigmoid_97
# aten.sub => sub_243
triton_poi_fused_add_clone_fill_mul_sigmoid_sub_140 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[2048], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_add_clone_fill_mul_sigmoid_sub_140(in_out_ptr0, in_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 1792
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_out_ptr0 + (x0), xmask).to(tl.float32)
    tmp1 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
    tmp2 = tl.sigmoid(tmp1)
    tmp3 = 1.0
    tmp4 = tmp3 - tmp2
    tmp5 = tmp1 * tmp4
    tmp6 = tmp5 + tmp3
    tmp7 = tmp2 * tmp6
    tmp8 = tmp0 * tmp7
    tl.store(in_out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp8, xmask)
def get_args():
    arg_0 = rand_strided((128, 14, 1, 1), (14, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 14, 1, 1), (14, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused_add_clone_fill_mul_sigmoid_sub_140.run(*args, 1792, grid=grid(1792), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused_add_clone_fill_mul_sigmoid_sub_140.benchmark_all_configs(*args, 1792, grid=grid(1792))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
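# Annotation (ours): kernel 140 applies the SiLU derivative in place for the
# squeeze-excite hidden activation:
#     silu'(x) = sigmoid(x) * (1 + x * (1 - sigmoid(x))),
# so the stored value is dy * silu'(x). Minimal eager sketch, name ours:
def _ref_silu_bwd_140(dy, x):
    s = torch.sigmoid(x)
    return dy * s * (1.0 + x * (1.0 - s))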
# kernel path: /tmp/torchinductor_shunting/af/cafrvpfoaadhdlxhldphg5leeyz2enwfb64ifixaudfchrcwkgns.py
# Original ATen: aten._to_copy, aten.convolution_backward
# aten._to_copy => convert_element_type_672
# aten.convolution_backward => sum_107
triton_per_fused__to_copy_convolution_backward_141 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import persistent_reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@persistent_reduction(
    size_hints=[16, 128],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32', 3: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 3), equal_to_1=())]}
)
@triton.jit
def triton_per_fused__to_copy_convolution_backward_141(in_ptr0, out_ptr1, xnumel, rnumel, XBLOCK : tl.constexpr):
    xnumel = 14
    rnumel = 128
    RBLOCK: tl.constexpr = 128
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rindex = tl.arange(0, RBLOCK)[None, :]
    rmask = rindex < rnumel
    r1 = rindex
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0 + (14*r1)), rmask & xmask, other=0).to(tl.float32)
    tmp2 = tl.where(rmask & xmask, tmp0, 0)
    tmp3 = tl.sum(tmp2, 1)[:, None]
    tmp4 = tmp3.to(tl.float32)
    tl.store(out_ptr1 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp4, xmask)
def get_args():
    arg_0 = rand_strided((128, 14, 1, 1), (14, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((14,), (1,), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_per_fused__to_copy_convolution_backward_141.run(*args, 14, 128, grid=grid(14), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_per_fused__to_copy_convolution_backward_141.benchmark_all_configs(*args, 14, 128, grid=grid(14))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/ub/cubt2wopxldea56edxrzorl54nnvgwjendwqgv23wj6it7xjbynx.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_671
triton_poi_fused__to_copy_142 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[8192], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_142(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 4704
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
    arg_0 = rand_strided((14, 336, 1, 1), (336, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((14, 336, 1, 1), (336, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_142.run(*args, 4704, grid=grid(4704), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_142.benchmark_all_configs(*args, 4704, grid=grid(4704))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/pw/cpwcwf6uhmujsg7jk3lvfx67kggnjin7oj2locjyrnecqe2jebap.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.add, aten.clone, aten.div, aten.fill, aten.mul, aten.native_batch_norm_backward, aten.sigmoid, aten.sub
# aten._native_batch_norm_legit_functional => convert_element_type_130
# aten.add => add_358, add_359
# aten.clone => clone_13
# aten.div => div_12
# aten.fill => full_like_34
# aten.mul => mul_929, mul_935, mul_936, mul_937
# aten.native_batch_norm_backward => convert_element_type_673, mul_938, mul_946, sub_245, sum_108, sum_109
# aten.sigmoid => sigmoid_19, sigmoid_98
# aten.sub => sub_244
triton_red_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_143 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@reduction(
    size_hints=[512, 32768],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp16', 4: '*fp16', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: '*fp32', 9: '*fp32', 10: 'i32', 11: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_143(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, in_ptr6, out_ptr0, out_ptr1, out_ptr2, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
    xnumel = 336
    rnumel = 25088
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rbase = tl.arange(0, RBLOCK)[None, :]
    x0 = xindex
    _tmp17 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
    tmp20 = tl.load(in_ptr5 + (x0), xmask)
    _tmp23 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
    for roffset in range(0, rnumel, RBLOCK):
        rindex = roffset + rbase
        rmask = rindex < rnumel
        r1 = rindex % 196
        r2 = (rindex // 196)
        tmp0 = tl.load(in_ptr0 + (r1 + (196*x0) + (65856*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp1 = tl.load(in_ptr1 + (x0 + (336*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp4 = tl.load(in_ptr2 + (x0 + (336*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp8 = tl.load(in_ptr3 + (r1 + (196*x0) + (65856*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp18 = tl.load(in_ptr4 + (r1 + (196*x0) + (65856*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp2 = tl.sigmoid(tmp1)
        tmp3 = tmp0 * tmp2
        tmp5 = 196.0
        tmp6 = tmp4 / tmp5
        tmp7 = tmp3 + tmp6
        tmp9 = tl.sigmoid(tmp8)
        tmp10 = 1.0
        tmp11 = tmp10 - tmp9
        tmp12 = tmp8 * tmp11
        tmp13 = tmp12 + tmp10
        tmp14 = tmp9 * tmp13
        tmp15 = tmp7 * tmp14
        tmp16 = tmp15.to(tl.float32)
        _tmp17 = tl.where(rmask & xmask, _tmp17 + tmp16, _tmp17)
        tmp19 = tmp18.to(tl.float32)
        tmp21 = tmp19 - tmp20
        tmp22 = tmp16 * tmp21
        _tmp23 = tl.where(rmask & xmask, _tmp23 + tmp22, _tmp23)
    tmp17 = tl.sum(_tmp17, 1)[:, None]
    tl.store(out_ptr0 + x0, tmp17, xmask)
    tmp23 = tl.sum(_tmp23, 1)[:, None]
    tl.store(out_ptr1 + x0, tmp23, xmask)
    tmp24 = tl.load(in_ptr6 + (x0), xmask)
    tmp25 = tmp23 * tmp24
    tl.store(out_ptr2 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp25, xmask)
def get_args():
    arg_0 = rand_strided((128, 336, 14, 14), (65856, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 336, 1, 1), (336, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    arg_2 = rand_strided((128, 336, 1, 1), (336, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    arg_3 = rand_strided((128, 336, 14, 14), (65856, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_4 = rand_strided((128, 336, 14, 14), (65856, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_5 = rand_strided((1, 336, 1, 1), (336, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    arg_6 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32)
    arg_7 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32)
    arg_8 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32)
    arg_9 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7, arg_8, arg_9,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_red_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_143.run(*args, 336, 25088, grid=grid(336), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_red_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_143.benchmark_all_configs(*args, 336, 25088, grid=grid(336))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
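# Annotation (ours): reduction kernel 143 routes the gradient back through the
# feature * sigmoid(gate) product and the /196 average-pool backward, then
# through the SiLU derivative, and reduces over n = 128*196 elements per
# channel to produce the two sums batch-norm backward needs (sum_dy and
# sum_dy_xmu), plus sum_dy_xmu * invstd. A sketch that keeps fp32 throughout,
# whereas the kernel rounds the elementwise gradient to fp16 before reducing:
def _ref_bn_bwd_sums_143(dy, gate, se_grad, x_pre, x_bn, mean, invstd):
    s = torch.sigmoid(gate.float())                  # (N, C, 1, 1) SE gate
    g = dy.float() * s + se_grad.float() / 196.0     # mul + avg-pool grads
    sp = torch.sigmoid(x_pre.float())
    g = g * sp * (1.0 + x_pre.float() * (1.0 - sp))  # through SiLU
    sum_dy = g.sum(dim=(0, 2, 3))
    xmu = x_bn.float() - mean.reshape(1, -1, 1, 1)
    sum_dy_xmu = (g * xmu).sum(dim=(0, 2, 3))
    return sum_dy, sum_dy_xmu, sum_dy_xmu * invstd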
# kernel path: /tmp/torchinductor_shunting/7p/c7psulr34da3bohr63ebglp7ja7h35fmmewyuanlwy56v67tyr3o.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.add, aten.clone, aten.div, aten.fill, aten.mul, aten.native_batch_norm_backward, aten.sigmoid, aten.sub
# aten._native_batch_norm_legit_functional => convert_element_type_130
# aten.add => add_358, add_359
# aten.clone => clone_13
# aten.div => div_12
# aten.fill => full_like_34
# aten.mul => mul_929, mul_935, mul_936, mul_937
# aten.native_batch_norm_backward => convert_element_type_673, mul_944, sub_245, sub_247, sub_248
# aten.sigmoid => sigmoid_19, sigmoid_98
# aten.sub => sub_244
triton_poi_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_144 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[16777216], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp16', 4: '*fp16', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: '*fp32', 9: '*fp32', 10: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10), equal_to_1=())]})
@triton.jit
def triton_poi_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_144(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, in_ptr6, in_ptr7, in_ptr8, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 8429568
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x3 = xindex
    x4 = (xindex // 196)
    x1 = (xindex // 196) % 336
    tmp0 = tl.load(in_ptr0 + (x3), None).to(tl.float32)
    tmp1 = tl.load(in_ptr1 + (x4), None).to(tl.float32)
    tmp4 = tl.load(in_ptr2 + (x4), None).to(tl.float32)
    tmp8 = tl.load(in_ptr3 + (x3), None).to(tl.float32)
    tmp17 = tl.load(in_ptr4 + (x3), None).to(tl.float32)
    tmp19 = tl.load(in_ptr5 + (x1), None)
    tmp21 = tl.load(in_ptr6 + (x1), None)
    tmp24 = tl.load(in_ptr7 + (x1), None)
    tmp29 = tl.load(in_ptr8 + (x1), None)
    tmp2 = tl.sigmoid(tmp1)
    tmp3 = tmp0 * tmp2
    tmp5 = 196.0
    tmp6 = tmp4 / tmp5
    tmp7 = tmp3 + tmp6
    tmp9 = tl.sigmoid(tmp8)
    tmp10 = 1.0
    tmp11 = tmp10 - tmp9
    tmp12 = tmp8 * tmp11
    tmp13 = tmp12 + tmp10
    tmp14 = tmp9 * tmp13
    tmp15 = tmp7 * tmp14
    tmp16 = tmp15.to(tl.float32)
    tmp18 = tmp17.to(tl.float32)
    tmp20 = tmp18 - tmp19
    tmp22 = 3.985969387755102e-05
    tmp23 = tmp21 * tmp22
    tmp25 = tmp24 * tmp24
    tmp26 = tmp23 * tmp25
    tmp27 = tmp20 * tmp26
    tmp28 = tmp16 - tmp27
    tmp30 = tmp29 * tmp22
    tmp31 = tmp28 - tmp30
    tl.store(out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp31, None)
def get_args():
    arg_0 = rand_strided((128, 336, 14, 14), (65856, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 336, 1, 1), (336, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    arg_2 = rand_strided((128, 336, 1, 1), (336, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    arg_3 = rand_strided((128, 336, 14, 14), (65856, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_4 = rand_strided((128, 336, 14, 14), (65856, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_5 = rand_strided((1, 336, 1, 1), (336, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    arg_6 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32)
    arg_7 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32)
    arg_8 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32)
    arg_9 = rand_strided((128, 336, 14, 14), (65856, 196, 14, 1), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7, arg_8, arg_9,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_144.run(*args, 8429568, grid=grid(8429568), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_144.benchmark_all_configs(*args, 8429568, grid=grid(8429568))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
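# Annotation (ours): pointwise kernel 144 recomputes the elementwise chain of
# kernel 143 and applies the centering part of batch-norm backward,
# dy - sum_dy/n - xhat-projection, with n = 128*14*14 = 25088 again; the final
# per-channel invstd * gamma scaling appears to be deferred to the slicing
# kernels 145/147/149 below. Sketch of the centering step, names ours:
def _ref_bn_bwd_centered_144(g, x_bn, mean, sum_dy, sum_dy_xmu, invstd):
    # g: elementwise gradient as produced by _ref_bn_bwd_sums_143's chain.
    n = g.shape[0] * g.shape[2] * g.shape[3]
    ch = lambda t: t.reshape(1, -1, 1, 1)
    proj = (x_bn.float() - ch(mean)) * ch(sum_dy_xmu) / n * ch(invstd) ** 2
    return g.float() - proj - ch(sum_dy) / n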
# kernel path: /tmp/torchinductor_shunting/id/cidvqsl7tausuxdym44zowgtyle4qgcqwkmvhutx6ld6q33q6idp.py
# Original ATen: aten.convolution_backward
# aten.convolution_backward => convolution_backward_104
triton_poi_fused_convolution_backward_145 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp32', 2: '*fp32', 3: '*fp16', 4: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4), equal_to_1=())]})
@triton.jit
def triton_poi_fused_convolution_backward_145(in_ptr0, in_ptr1, in_ptr2, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 2809856
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x2 = (xindex // 21952)
    x3 = xindex % 21952
    x1 = (xindex // 196) % 112
    x4 = xindex
    tmp0 = tl.load(in_ptr0 + (43904 + x3 + (65856*x2)), None)
    tmp1 = tl.load(in_ptr1 + (224 + x1), None)
    tmp2 = tl.load(in_ptr2 + (224 + x1), None)
    tmp3 = tmp1 * tmp2
    tmp4 = tmp0 * tmp3
    tmp5 = tmp4.to(tl.float32)
    tl.store(out_ptr0 + (x4 + tl.zeros([XBLOCK], tl.int32)), tmp5, None)
def get_args():
    arg_0 = rand_strided((128, 336, 14, 14), (65856, 196, 14, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32)
    arg_2 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32)
    arg_3 = rand_strided((128, 112, 14, 14), (21952, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1, arg_2, arg_3,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused_convolution_backward_145.run(*args, 2809856, grid=grid(2809856), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused_convolution_backward_145.benchmark_all_configs(*args, 2809856, grid=grid(2809856))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
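# Annotation (ours): kernels 145/147/149 multiply the centered gradient by
# what is, by all appearances, the per-channel invstd * gamma, and slice the
# 336 channels into three 112-channel groups (element offsets 43904 = 224*196,
# 21952 = 112*196 and 0). The groups feed the backward of a mixed depthwise
# convolution whose 7x7 / 5x5 / 3x3 weights are cast by kernels 146/148/150.
# Sketch of one slice, names ours:
def _ref_bn_scale_and_slice(g_centered, invstd, gamma, c0, c1):
    # g_centered: (N, 336, 14, 14) fp32 output of kernel 144.
    scale = (invstd * gamma)[c0:c1].reshape(1, -1, 1, 1)
    return (g_centered[:, c0:c1] * scale).half()
# e.g. _ref_bn_scale_and_slice(g, invstd, gamma, 224, 336) mirrors kernel 145.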
# kernel path: /tmp/torchinductor_shunting/fd/cfdopkndut6mz6wq6bvbwuuulhnrmhdrv5rzyqs6re2kufkzwbyi.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_676
triton_poi_fused__to_copy_146 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[8192], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_146(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 5488
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
    arg_0 = rand_strided((112, 1, 7, 7), (49, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((112, 1, 7, 7), (49, 49, 7, 1), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_146.run(*args, 5488, grid=grid(5488), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_146.benchmark_all_configs(*args, 5488, grid=grid(5488))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/th/cthuea2iprfudqgjdcjb4l6mtcfp43ogq3yml625vwzkim7u4s2l.py
# Original ATen: aten.convolution_backward
# aten.convolution_backward => convolution_backward_105
triton_poi_fused_convolution_backward_147 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp32', 2: '*fp32', 3: '*fp16', 4: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4), equal_to_1=())]})
@triton.jit
def triton_poi_fused_convolution_backward_147(in_ptr0, in_ptr1, in_ptr2, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 2809856
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x2 = (xindex // 21952)
    x3 = xindex % 21952
    x1 = (xindex // 196) % 112
    x4 = xindex
    tmp0 = tl.load(in_ptr0 + (21952 + x3 + (65856*x2)), None)
    tmp1 = tl.load(in_ptr1 + (112 + x1), None)
    tmp2 = tl.load(in_ptr2 + (112 + x1), None)
    tmp3 = tmp1 * tmp2
    tmp4 = tmp0 * tmp3
    tmp5 = tmp4.to(tl.float32)
    tl.store(out_ptr0 + (x4 + tl.zeros([XBLOCK], tl.int32)), tmp5, None)
def get_args():
    arg_0 = rand_strided((128, 336, 14, 14), (65856, 196, 14, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32)
    arg_2 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32)
    arg_3 = rand_strided((128, 112, 14, 14), (21952, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1, arg_2, arg_3,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused_convolution_backward_147.run(*args, 2809856, grid=grid(2809856), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused_convolution_backward_147.benchmark_all_configs(*args, 2809856, grid=grid(2809856))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/3x/c3xs7plvafsbnfj7oqnbllamz4ftcowca37hxufdgln6b4kvegly.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_677
triton_poi_fused__to_copy_148 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4096], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_148(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 2800
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
    arg_0 = rand_strided((112, 1, 5, 5), (25, 25, 5, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((112, 1, 5, 5), (25, 25, 5, 1), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_148.run(*args, 2800, grid=grid(2800), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_148.benchmark_all_configs(*args, 2800, grid=grid(2800))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/zr/czrwimvnigaum3atz3dnqh3vpqefjj27el4vvkm52xud7pku2qdu.py
# Original ATen: aten.convolution_backward
# aten.convolution_backward => convolution_backward_106
triton_poi_fused_convolution_backward_149 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp32', 2: '*fp32', 3: '*fp16', 4: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4), equal_to_1=())]})
@triton.jit
def triton_poi_fused_convolution_backward_149(in_ptr0, in_ptr1, in_ptr2, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 2809856
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x2 = (xindex // 21952)
    x3 = xindex % 21952
    x1 = (xindex // 196) % 112
    x4 = xindex
    tmp0 = tl.load(in_ptr0 + (x3 + (65856*x2)), None)
    tmp1 = tl.load(in_ptr1 + (x1), None)
    tmp2 = tl.load(in_ptr2 + (x1), None)
    tmp3 = tmp1 * tmp2
    tmp4 = tmp0 * tmp3
    tmp5 = tmp4.to(tl.float32)
    tl.store(out_ptr0 + (x4 + tl.zeros([XBLOCK], tl.int32)), tmp5, None)
def get_args():
    arg_0 = rand_strided((128, 336, 14, 14), (65856, 196, 14, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32)
    arg_2 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32)
    arg_3 = rand_strided((128, 112, 14, 14), (21952, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1, arg_2, arg_3,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused_convolution_backward_149.run(*args, 2809856, grid=grid(2809856), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused_convolution_backward_149.benchmark_all_configs(*args, 2809856, grid=grid(2809856))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/2v/c2volwa6uwcetttnruu5jjo66mjrobvup6mcg6k7n4aknrx3iggt.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_678
triton_poi_fused__to_copy_150 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[1024], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_150(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 1008
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
    arg_0 = rand_strided((112, 1, 3, 3), (9, 9, 3, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((112, 1, 3, 3), (9, 9, 3, 1), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_150.run(*args, 1008, grid=grid(1008), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_150.benchmark_all_configs(*args, 1008, grid=grid(1008))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/we/cwefssxf5bvvy2y5bt7gv2qzizpcib7seqbqn2voklitwa4hshmd.py
# Original ATen: aten.cat
# aten.cat => cat_66
triton_poi_fused_cat_151 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[16777216], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_cat_151(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 11239424
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x2 = xindex
    x0 = xindex % 87808
    x1 = (xindex // 87808)
    tmp0 = tl.load(in_ptr0 + (x2), None).to(tl.float32)
    tl.store(out_ptr0 + (x0 + (263424*x1) + tl.zeros([XBLOCK], tl.int32)), tmp0, None)
def get_args():
    arg_0 = rand_strided((128, 112, 28, 28), (87808, 784, 28, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 112, 28, 28), (263424, 784, 28, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused_cat_151.run(*args, 11239424, grid=grid(11239424), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused_cat_151.benchmark_all_configs(*args, 11239424, grid=grid(11239424))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
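# Annotation (ours): kernel 151 is one branch of aten.cat: it copies a
# contiguous (128, 112, 28, 28) tensor into the first 112 channels of a
# (128, 336, 28, 28) buffer (output batch stride 263424 = 336*784; 87808 =
# 112*784 elements copied per sample). Eager sketch, name ours:
def _ref_cat_branch_151(src, out):
    # out: (N, 336, 28, 28) fp16; src: (N, 112, 28, 28) fp16.
    out[:, :112].copy_(src)
    return out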
# kernel path: /tmp/torchinductor_shunting/qf/cqfo4ck6nf3trupktl4ixfnhezf4tcvnlhrfn63fekgvbdyrr6da.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.mul, aten.native_batch_norm_backward
# aten._native_batch_norm_legit_functional => convert_element_type_123
# aten.mul => mul_949
# aten.native_batch_norm_backward => convert_element_type_679, mul_950, mul_958, sub_250, sum_110, sum_111
triton_red_fused__native_batch_norm_legit_functional_mul_native_batch_norm_backward_152 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@reduction(
    size_hints=[512, 131072],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp32', 4: '*fp32', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: 'i32', 9: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_mul_native_batch_norm_backward_152(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, out_ptr0, out_ptr1, out_ptr2, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
    xnumel = 336
    rnumel = 100352
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rbase = tl.arange(0, RBLOCK)[None, :]
    x0 = xindex
    _tmp4 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
    tmp7 = tl.load(in_ptr3 + (x0), xmask)
    _tmp10 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
    for roffset in range(0, rnumel, RBLOCK):
        rindex = roffset + rbase
        rmask = rindex < rnumel
        r1 = rindex % 784
        r2 = (rindex // 784)
        tmp0 = tl.load(in_ptr0 + (r1 + (784*x0) + (263424*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp1 = tl.load(in_ptr1 + (r1 + (784*x0) + (263424*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp5 = tl.load(in_ptr2 + (r1 + (784*x0) + (263424*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp2 = tmp0 * tmp1
        tmp3 = tmp2.to(tl.float32)
        _tmp4 = tl.where(rmask & xmask, _tmp4 + tmp3, _tmp4)
        tmp6 = tmp5.to(tl.float32)
        tmp8 = tmp6 - tmp7
        tmp9 = tmp3 * tmp8
        _tmp10 = tl.where(rmask & xmask, _tmp10 + tmp9, _tmp10)
    tmp4 = tl.sum(_tmp4, 1)[:, None]
    tl.store(out_ptr0 + x0, tmp4, xmask)
    tmp10 = tl.sum(_tmp10, 1)[:, None]
    tl.store(out_ptr1 + x0, tmp10, xmask)
    tmp11 = tl.load(in_ptr4 + (x0), xmask)
    tmp12 = tmp10 * tmp11
    tl.store(out_ptr2 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp12, xmask)
def get_args():
    arg_0 = rand_strided((128, 336, 28, 28), (263424, 784, 28, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 336, 28, 28), (263424, 784, 28, 1), device='cuda:0', dtype=torch.float16)
    arg_2 = rand_strided((128, 336, 28, 28), (263424, 784, 28, 1), device='cuda:0', dtype=torch.float16)
    arg_3 = rand_strided((1, 336, 1, 1), (336, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    arg_4 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32)
    arg_5 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32)
    arg_6 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32)
    arg_7 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_red_fused__native_batch_norm_legit_functional_mul_native_batch_norm_backward_152.run(*args, 336, 100352, grid=grid(336), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_red_fused__native_batch_norm_legit_functional_mul_native_batch_norm_backward_152.benchmark_all_configs(*args, 336, 100352, grid=grid(336))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
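# Annotation (ours): reduction kernel 152 follows the same pattern as kernel
# 143, but its elementwise gradient is just dy * other (mul_949, the backward
# of an elementwise product) with no gate/SiLU chain, reduced over
# n = 128*784 elements per channel. Sketch, names ours:
def _ref_bn_bwd_sums_152(dy, other, x_bn, mean, invstd):
    g = dy.float() * other.float()
    sum_dy = g.sum(dim=(0, 2, 3))
    xmu = x_bn.float() - mean.reshape(1, -1, 1, 1)
    sum_dy_xmu = (g * xmu).sum(dim=(0, 2, 3))
    return sum_dy, sum_dy_xmu, sum_dy_xmu * invstd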
# kernel path: /tmp/torchinductor_shunting/qc/cqcwwcxya4t45mgfdz46w2lkm3l3dt6ws7mawrdyrpvgbmqxgblv.py | |
# Original ATen: aten._native_batch_norm_legit_functional, aten.convolution_backward, aten.mul, aten.native_batch_norm_backward | |
# aten._native_batch_norm_legit_functional => convert_element_type_123 | |
# aten.convolution_backward => convolution_backward_107 | |
# aten.mul => mul_949 | |
# aten.native_batch_norm_backward => convert_element_type_679, convert_element_type_681, mul_956, mul_957, sub_250, sub_252, sub_253 | |
triton_poi_fused__native_batch_norm_legit_functional_convolution_backward_mul_native_batch_norm_backward_153 = async_compile.triton(''' | |
import triton | |
import triton.language as tl | |
from torch._inductor.ir import ReductionHint | |
from torch._inductor.ir import TileHint | |
from torch._inductor.triton_heuristics import pointwise | |
from torch._inductor.utils import instance_descriptor | |
from torch._dynamo.testing import rand_strided | |
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
import torch | |
from torch._inductor.triton_heuristics import grid | |
@pointwise(size_hints=[67108864], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp32', 4: '*fp32', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8), equal_to_1=())]}) | |
@triton.jit | |
def triton_poi_fused__native_batch_norm_legit_functional_convolution_backward_mul_native_batch_norm_backward_153(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, in_ptr6, xnumel, XBLOCK : tl.constexpr): | |
xnumel = 33718272 | |
xoffset = tl.program_id(0) * XBLOCK | |
xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
xmask = xindex < xnumel | |
x3 = xindex | |
x1 = (xindex // 784) % 336 | |
tmp0 = tl.load(in_out_ptr0 + (x3), None).to(tl.float32) | |
tmp1 = tl.load(in_ptr0 + (x3), None).to(tl.float32) | |
tmp4 = tl.load(in_ptr1 + (x3), None).to(tl.float32) | |
tmp6 = tl.load(in_ptr2 + (x1), None) | |
tmp8 = tl.load(in_ptr3 + (x1), None) | |
tmp11 = tl.load(in_ptr4 + (x1), None) | |
tmp16 = tl.load(in_ptr5 + (x1), None) | |
tmp19 = tl.load(in_ptr6 + (x1), None) | |
tmp2 = tmp0 * tmp1 | |
tmp3 = tmp2.to(tl.float32) | |
tmp5 = tmp4.to(tl.float32) | |
tmp7 = tmp5 - tmp6 | |
tmp9 = 9.964923469387754e-06 | |
tmp10 = tmp8 * tmp9 | |
tmp12 = tmp11 * tmp11 | |
tmp13 = tmp10 * tmp12 | |
tmp14 = tmp7 * tmp13 | |
tmp15 = tmp3 - tmp14 | |
tmp17 = tmp16 * tmp9 | |
tmp18 = tmp15 - tmp17 | |
tmp20 = tmp11 * tmp19 | |
tmp21 = tmp18 * tmp20 | |
tmp22 = tmp21.to(tl.float32) | |
tl.store(in_out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp22, None) | |
def get_args(): | |
arg_0 = rand_strided((128, 336, 28, 28), (263424, 784, 28, 1), device='cuda:0', dtype=torch.float16) | |
arg_1 = rand_strided((128, 336, 28, 28), (263424, 784, 28, 1), device='cuda:0', dtype=torch.float16) | |
arg_2 = rand_strided((128, 336, 28, 28), (263424, 784, 28, 1), device='cuda:0', dtype=torch.float16) | |
arg_3 = rand_strided((1, 336, 1, 1), (336, 1, 1, 1), device='cuda:0', dtype=torch.float32) | |
arg_4 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32) | |
arg_5 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32) | |
arg_6 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32) | |
arg_7 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32) | |
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7, | |
def call(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
stream0 = get_cuda_stream(0) | |
triton_poi_fused__native_batch_norm_legit_functional_convolution_backward_mul_native_batch_norm_backward_153.run(*args, 33718272, grid=grid(33718272), stream=stream0) | |
def benchmark_all_configs(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
return triton_poi_fused__native_batch_norm_legit_functional_convolution_backward_mul_native_batch_norm_backward_153.benchmark_all_configs(*args, 33718272, grid=grid(33718272)) | |
if __name__ == '__main__': | |
from torch._inductor.utils import get_num_bytes | |
from triton.testing import do_bench | |
args = get_args() | |
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9 | |
gb_per_s = num_gb / (ms / 1e3) | |
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
''') | |
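# The constant 9.964923469387754e-06 baked into kernel 153 above is
# 1 / 100352 = 1 / (128 * 28 * 28), the reciprocal of the per-channel
# reduction count N*H*W that batch-norm backward divides by. The kernel
# first forms g = in_out_ptr0 * in_ptr0 (the fused aten.mul), then applies
# the standard batch-norm input-gradient formula in place. A reference
# sketch (not part of the generated graph), assuming the per-channel
# operands are the saved mean and inverse standard deviation, the two
# upstream reduction results sum_g and sum_gx, and the weight gamma, all
# broadcastable against g:
def _bn_backward_input_sketch(g, x, mean, invstd, gamma, sum_g, sum_gx, n):
    # grad_x = (g - (x - mean) * invstd**2 * sum_gx / n - sum_g / n) * invstd * gamma
    proj = (x - mean) * (invstd * invstd) * (sum_gx / n)
    return (g - proj - sum_g / n) * (invstd * gamma)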
# kernel path: /tmp/torchinductor_shunting/xp/cxpyb2d7tgj3ajjcfk5ruyhc5u2uszkeh76gef4njgyf5kz5ukth.py | |
# Original ATen: aten._to_copy | |
# aten._to_copy => convert_element_type_682 | |
triton_poi_fused__to_copy_154 = async_compile.triton(''' | |
import triton | |
import triton.language as tl | |
from torch._inductor.ir import ReductionHint | |
from torch._inductor.ir import TileHint | |
from torch._inductor.triton_heuristics import pointwise | |
from torch._inductor.utils import instance_descriptor | |
from torch._dynamo.testing import rand_strided | |
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
import torch | |
from torch._inductor.triton_heuristics import grid | |
@pointwise(size_hints=[32768], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]}) | |
@triton.jit | |
def triton_poi_fused__to_copy_154(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr): | |
xnumel = 18816 | |
xoffset = tl.program_id(0) * XBLOCK | |
xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
xmask = xindex < xnumel | |
x0 = xindex | |
tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32) | |
tmp1 = tmp0.to(tl.float32) | |
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask) | |
def get_args(): | |
arg_0 = rand_strided((336, 56, 1, 1), (56, 1, 1, 1), device='cuda:0', dtype=torch.float16) | |
arg_1 = rand_strided((336, 56, 1, 1), (56, 1, 1, 1), device='cuda:0', dtype=torch.float32) | |
return arg_0, arg_1, | |
def call(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
stream0 = get_cuda_stream(0) | |
triton_poi_fused__to_copy_154.run(*args, 18816, grid=grid(18816), stream=stream0) | |
def benchmark_all_configs(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
return triton_poi_fused__to_copy_154.benchmark_all_configs(*args, 18816, grid=grid(18816)) | |
if __name__ == '__main__': | |
from torch._inductor.utils import get_num_bytes | |
from triton.testing import do_bench | |
args = get_args() | |
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9 | |
gb_per_s = num_gb / (ms / 1e3) | |
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
''') | |
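# Kernels named triton_poi_fused__to_copy_* in this section (154 above and
# 159, 162, 165, 169, 171 below) are plain dtype casts: the fp16 copy of a
# weight feeds the forward/convolution kernels, and an fp32 copy is
# materialized here, presumably for the gradient path. Eager sketch:
def _to_copy_sketch(w_fp16):
    return w_fp16.float()  # elementwise fp16 -> fp32, same shape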
# kernel path: /tmp/torchinductor_shunting/ku/cku7ek7gtwfrqcc34nfk6i7wvwqtzc57tln743w3jpx2qsiyexs7.py | |
# Original ATen: aten._native_batch_norm_legit_functional, aten.native_batch_norm_backward | |
# aten._native_batch_norm_legit_functional => convert_element_type_120 | |
# aten.native_batch_norm_backward => convert_element_type_683, mul_959, sub_254, sum_112, sum_113 | |
triton_red_fused__native_batch_norm_legit_functional_native_batch_norm_backward_155 = async_compile.triton(''' | |
import triton | |
import triton.language as tl | |
from torch._inductor.ir import ReductionHint | |
from torch._inductor.ir import TileHint | |
from torch._inductor.triton_heuristics import reduction | |
from torch._inductor.utils import instance_descriptor | |
from torch._dynamo.testing import rand_strided | |
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
import torch | |
from torch._inductor.triton_heuristics import grid | |
@reduction( | |
size_hints=[1024, 8192], | |
reduction_hint=ReductionHint.INNER, | |
filename=__file__, | |
meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: 'i32', 6: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4), equal_to_1=())]} | |
) | |
@triton.jit | |
def triton_red_fused__native_batch_norm_legit_functional_native_batch_norm_backward_155(in_ptr0, in_ptr1, in_ptr2, out_ptr0, out_ptr1, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr): | |
xnumel = 728 | |
rnumel = 7720 | |
xoffset = tl.program_id(0) * XBLOCK | |
xindex = xoffset + tl.arange(0, XBLOCK)[:, None] | |
xmask = xindex < xnumel | |
rbase = tl.arange(0, RBLOCK)[None, :] | |
x1 = (xindex // 56) | |
x0 = xindex % 56 | |
_tmp6 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0 | |
x3 = xindex | |
_tmp13 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0 | |
for roffset in range(0, rnumel, RBLOCK): | |
rindex = roffset + rbase | |
rmask = rindex < rnumel | |
r2 = rindex | |
tmp0 = r2 + (7720*x1) | |
tmp1 = 100352 | |
tmp2 = tmp0 < tmp1 | |
tmp3 = tl.load(in_ptr0 + ((784*x0) + (43904*(((r2 + (7720*x1)) // 784) % 128)) + ((r2 + (7720*x1)) % 784) + tl.zeros([XBLOCK, RBLOCK], tl.int32)), rmask & tmp2 & xmask, eviction_policy='evict_last', other=0).to(tl.float32) | |
tmp4 = tmp3.to(tl.float32) | |
tmp5 = tl.where(tmp2, tmp4, 0) | |
_tmp6 = tl.where(rmask & xmask, _tmp6 + tmp5, _tmp6) | |
tmp7 = tl.load(in_ptr1 + ((784*x0) + (43904*(((r2 + (7720*x1)) // 784) % 128)) + ((r2 + (7720*x1)) % 784) + tl.zeros([XBLOCK, RBLOCK], tl.int32)), rmask & tmp2 & xmask, eviction_policy='evict_last', other=0).to(tl.float32) | |
tmp8 = tmp7.to(tl.float32) | |
tmp9 = tl.load(in_ptr2 + (x0 + tl.zeros([XBLOCK, RBLOCK], tl.int32)), rmask & tmp2 & xmask, eviction_policy='evict_last', other=0) | |
tmp10 = tmp8 - tmp9 | |
tmp11 = tmp4 * tmp10 | |
tmp12 = tl.where(tmp2, tmp11, 0) | |
_tmp13 = tl.where(rmask & xmask, _tmp13 + tmp12, _tmp13) | |
tmp6 = tl.sum(_tmp6, 1)[:, None] | |
tl.store(out_ptr0 + x3, tmp6, xmask) | |
tmp13 = tl.sum(_tmp13, 1)[:, None] | |
tl.store(out_ptr1 + x3, tmp13, xmask) | |
def get_args(): | |
arg_0 = rand_strided((128, 56, 28, 28), (43904, 784, 28, 1), device='cuda:0', dtype=torch.float16) | |
arg_1 = rand_strided((128, 56, 28, 28), (43904, 784, 28, 1), device='cuda:0', dtype=torch.float16) | |
arg_2 = rand_strided((1, 56, 1, 1), (56, 1, 1, 1), device='cuda:0', dtype=torch.float32) | |
arg_3 = rand_strided((56, 13), (1, 56), device='cuda:0', dtype=torch.float32) | |
arg_4 = rand_strided((56, 13), (1, 56), device='cuda:0', dtype=torch.float32) | |
return arg_0, arg_1, arg_2, arg_3, arg_4, | |
def call(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
stream0 = get_cuda_stream(0) | |
triton_red_fused__native_batch_norm_legit_functional_native_batch_norm_backward_155.run(*args, 728, 7720, grid=grid(728), stream=stream0) | |
def benchmark_all_configs(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
return triton_red_fused__native_batch_norm_legit_functional_native_batch_norm_backward_155.benchmark_all_configs(*args, 728, 7720, grid=grid(728)) | |
if __name__ == '__main__': | |
from torch._inductor.utils import get_num_bytes | |
from triton.testing import do_bench | |
args = get_args() | |
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9 | |
gb_per_s = num_gb / (ms / 1e3) | |
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
''') | |
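# Kernel 155 above is the first stage of a split reduction: the
# 128*28*28 = 100352 elements per channel are reduced in 13 chunks of 7720
# (13 * 7720 = 100360 >= 100352, hence the `tmp0 < 100352` bounds check),
# producing (56, 13) partial sums that kernels 156/157 below finish. The
# quantities reduced are the two standard batch-norm backward sums; an
# eager sketch, with `mean` shaped (1, C, 1, 1):
import torch

def _bn_backward_sums_sketch(g, x, mean):
    sum_g = g.float().sum(dim=(0, 2, 3))                          # out_ptr0 path
    sum_gx = (g.float() * (x.float() - mean)).sum(dim=(0, 2, 3))  # out_ptr1 path
    return sum_g, sum_gx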
# kernel path: /tmp/torchinductor_shunting/uy/cuyabiwct4xwna22y54u6ef6hgcysgudgvw53xgcxhoetvhdud3c.py | |
# Original ATen: aten.native_batch_norm_backward | |
# aten.native_batch_norm_backward => convert_element_type_683, sum_112 | |
triton_per_fused_native_batch_norm_backward_156 = async_compile.triton(''' | |
import triton | |
import triton.language as tl | |
from torch._inductor.ir import ReductionHint | |
from torch._inductor.ir import TileHint | |
from torch._inductor.triton_heuristics import persistent_reduction | |
from torch._inductor.utils import instance_descriptor | |
from torch._dynamo.testing import rand_strided | |
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
import torch | |
from torch._inductor.triton_heuristics import grid | |
@persistent_reduction( | |
size_hints=[64, 16], | |
reduction_hint=ReductionHint.INNER, | |
filename=__file__, | |
meta={'signature': {0: '*fp32', 1: '*fp32', 2: 'i32', 3: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1), equal_to_1=())]} | |
) | |
@triton.jit | |
def triton_per_fused_native_batch_norm_backward_156(in_ptr0, out_ptr0, xnumel, rnumel, XBLOCK : tl.constexpr): | |
xnumel = 56 | |
rnumel = 13 | |
RBLOCK: tl.constexpr = 16 | |
xoffset = tl.program_id(0) * XBLOCK | |
xindex = xoffset + tl.arange(0, XBLOCK)[:, None] | |
xmask = xindex < xnumel | |
rindex = tl.arange(0, RBLOCK)[None, :] | |
rmask = rindex < rnumel | |
r1 = rindex | |
x0 = xindex | |
tmp0 = tl.load(in_ptr0 + (x0 + (56*r1)), rmask & xmask, other=0) | |
tmp2 = tl.where(rmask & xmask, tmp0, 0) | |
tmp3 = tl.sum(tmp2, 1)[:, None] | |
tl.store(out_ptr0 + x0, tmp3, xmask) | |
def get_args(): | |
arg_0 = rand_strided((56, 13), (1, 56), device='cuda:0', dtype=torch.float32) | |
arg_1 = rand_strided((56,), (1,), device='cuda:0', dtype=torch.float32) | |
return arg_0, arg_1, | |
def call(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
stream0 = get_cuda_stream(0) | |
triton_per_fused_native_batch_norm_backward_156.run(*args, 56, 13, grid=grid(56), stream=stream0) | |
def benchmark_all_configs(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
return triton_per_fused_native_batch_norm_backward_156.benchmark_all_configs(*args, 56, 13, grid=grid(56)) | |
if __name__ == '__main__': | |
from torch._inductor.utils import get_num_bytes | |
from triton.testing import do_bench | |
args = get_args() | |
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9 | |
gb_per_s = num_gb / (ms / 1e3) | |
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
''') | |
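# Second stage of the split reduction: collapse the (56, 13) partials from
# kernel 155 into the final per-channel sum. Eager sketch:
def _finish_split_reduction_sketch(partials):  # partials: (56, 13) fp32
    return partials.sum(dim=1)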
# kernel path: /tmp/torchinductor_shunting/on/convc5p74ddudqk46e52onvmx3rxjfktebgzjuhxneits4eo7nyc.py | |
# Original ATen: aten._native_batch_norm_legit_functional, aten.native_batch_norm_backward | |
# aten._native_batch_norm_legit_functional => convert_element_type_120 | |
# aten.native_batch_norm_backward => convert_element_type_683, mul_959, mul_967, sub_254, sum_113 | |
triton_per_fused__native_batch_norm_legit_functional_native_batch_norm_backward_157 = async_compile.triton(''' | |
import triton | |
import triton.language as tl | |
from torch._inductor.ir import ReductionHint | |
from torch._inductor.ir import TileHint | |
from torch._inductor.triton_heuristics import persistent_reduction | |
from torch._inductor.utils import instance_descriptor | |
from torch._dynamo.testing import rand_strided | |
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
import torch | |
from torch._inductor.triton_heuristics import grid | |
@persistent_reduction( | |
size_hints=[64, 16], | |
reduction_hint=ReductionHint.INNER, | |
filename=__file__, | |
meta={'signature': {0: '*fp32', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: 'i32', 5: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3), equal_to_1=())]} | |
) | |
@triton.jit | |
def triton_per_fused__native_batch_norm_legit_functional_native_batch_norm_backward_157(in_ptr0, in_ptr1, out_ptr0, out_ptr1, xnumel, rnumel, XBLOCK : tl.constexpr): | |
xnumel = 56 | |
rnumel = 13 | |
RBLOCK: tl.constexpr = 16 | |
xoffset = tl.program_id(0) * XBLOCK | |
xindex = xoffset + tl.arange(0, XBLOCK)[:, None] | |
xmask = xindex < xnumel | |
rindex = tl.arange(0, RBLOCK)[None, :] | |
rmask = rindex < rnumel | |
r1 = rindex | |
x0 = xindex | |
tmp0 = tl.load(in_ptr0 + (x0 + (56*r1)), rmask & xmask, other=0) | |
tmp4 = tl.load(in_ptr1 + (x0), xmask) | |
tmp2 = tl.where(rmask & xmask, tmp0, 0) | |
tmp3 = tl.sum(tmp2, 1)[:, None] | |
tmp5 = tmp3 * tmp4 | |
tl.store(out_ptr1 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp5, xmask) | |
tl.store(out_ptr0 + x0, tmp3, xmask) | |
def get_args(): | |
arg_0 = rand_strided((56, 13), (1, 56), device='cuda:0', dtype=torch.float32) | |
arg_1 = rand_strided((56,), (1,), device='cuda:0', dtype=torch.float32) | |
arg_2 = rand_strided((56,), (1,), device='cuda:0', dtype=torch.float32) | |
arg_3 = rand_strided((56,), (1,), device='cuda:0', dtype=torch.float32) | |
return arg_0, arg_1, arg_2, arg_3, | |
def call(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
stream0 = get_cuda_stream(0) | |
triton_per_fused__native_batch_norm_legit_functional_native_batch_norm_backward_157.run(*args, 56, 13, grid=grid(56), stream=stream0) | |
def benchmark_all_configs(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
return triton_per_fused__native_batch_norm_legit_functional_native_batch_norm_backward_157.benchmark_all_configs(*args, 56, 13, grid=grid(56)) | |
if __name__ == '__main__': | |
from torch._inductor.utils import get_num_bytes | |
from triton.testing import do_bench | |
args = get_args() | |
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9 | |
gb_per_s = num_gb / (ms / 1e3) | |
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
''') | |
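# Same finishing reduction as kernel 156, fused with one extra multiply:
# out_ptr1 = sum_gx * in_ptr1, which (assuming in_ptr1 holds the saved
# inverse standard deviation) is exactly the batch-norm grad_weight. Sketch:
def _finish_with_grad_weight_sketch(partials_gx, invstd):
    sum_gx = partials_gx.sum(dim=1)
    return sum_gx, sum_gx * invstd  # (sum for reuse, grad_weight)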
# kernel path: /tmp/torchinductor_shunting/bj/cbjvyothjvn62eovj2qvbnhqdnxtnyarsgyu3rshwon5wrxfm6me.py | |
# Original ATen: aten._native_batch_norm_legit_functional, aten.native_batch_norm_backward | |
# aten._native_batch_norm_legit_functional => convert_element_type_120 | |
# aten.native_batch_norm_backward => convert_element_type_683, convert_element_type_685, mul_965, mul_966, sub_254, sub_256, sub_257 | |
triton_poi_fused__native_batch_norm_legit_functional_native_batch_norm_backward_158 = async_compile.triton(''' | |
import triton | |
import triton.language as tl | |
from torch._inductor.ir import ReductionHint | |
from torch._inductor.ir import TileHint | |
from torch._inductor.triton_heuristics import pointwise | |
from torch._inductor.utils import instance_descriptor | |
from torch._dynamo.testing import rand_strided | |
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
import torch | |
from torch._inductor.triton_heuristics import grid | |
@pointwise(size_hints=[8388608], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp32', 6: '*fp32', 7: '*fp16', 8: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8), equal_to_1=())]}) | |
@triton.jit | |
def triton_poi_fused__native_batch_norm_legit_functional_native_batch_norm_backward_158(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, in_ptr6, out_ptr0, xnumel, XBLOCK : tl.constexpr): | |
xnumel = 5619712 | |
xoffset = tl.program_id(0) * XBLOCK | |
xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
xmask = xindex < xnumel | |
x3 = xindex | |
x1 = (xindex // 784) % 56 | |
tmp0 = tl.load(in_ptr0 + (x3), None).to(tl.float32) | |
tmp2 = tl.load(in_ptr1 + (x3), None).to(tl.float32) | |
tmp4 = tl.load(in_ptr2 + (x1), None) | |
tmp6 = tl.load(in_ptr3 + (x1), None) | |
tmp9 = tl.load(in_ptr4 + (x1), None) | |
tmp14 = tl.load(in_ptr5 + (x1), None) | |
tmp17 = tl.load(in_ptr6 + (x1), None) | |
tmp1 = tmp0.to(tl.float32) | |
tmp3 = tmp2.to(tl.float32) | |
tmp5 = tmp3 - tmp4 | |
tmp7 = 9.964923469387754e-06 | |
tmp8 = tmp6 * tmp7 | |
tmp10 = tmp9 * tmp9 | |
tmp11 = tmp8 * tmp10 | |
tmp12 = tmp5 * tmp11 | |
tmp13 = tmp1 - tmp12 | |
tmp15 = tmp14 * tmp7 | |
tmp16 = tmp13 - tmp15 | |
tmp18 = tmp9 * tmp17 | |
tmp19 = tmp16 * tmp18 | |
tmp20 = tmp19.to(tl.float32) | |
tl.store(out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp20, None) | |
def get_args(): | |
arg_0 = rand_strided((128, 56, 28, 28), (43904, 784, 28, 1), device='cuda:0', dtype=torch.float16) | |
arg_1 = rand_strided((128, 56, 28, 28), (43904, 784, 28, 1), device='cuda:0', dtype=torch.float16) | |
arg_2 = rand_strided((1, 56, 1, 1), (56, 1, 1, 1), device='cuda:0', dtype=torch.float32) | |
arg_3 = rand_strided((56,), (1,), device='cuda:0', dtype=torch.float32) | |
arg_4 = rand_strided((56,), (1,), device='cuda:0', dtype=torch.float32) | |
arg_5 = rand_strided((56,), (1,), device='cuda:0', dtype=torch.float32) | |
arg_6 = rand_strided((56,), (1,), device='cuda:0', dtype=torch.float32) | |
arg_7 = rand_strided((128, 56, 28, 28), (43904, 784, 28, 1), device='cuda:0', dtype=torch.float16) | |
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7, | |
def call(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
stream0 = get_cuda_stream(0) | |
triton_poi_fused__native_batch_norm_legit_functional_native_batch_norm_backward_158.run(*args, 5619712, grid=grid(5619712), stream=stream0) | |
def benchmark_all_configs(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
return triton_poi_fused__native_batch_norm_legit_functional_native_batch_norm_backward_158.benchmark_all_configs(*args, 5619712, grid=grid(5619712)) | |
if __name__ == '__main__': | |
from torch._inductor.utils import get_num_bytes | |
from triton.testing import do_bench | |
args = get_args() | |
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9 | |
gb_per_s = num_gb / (ms / 1e3) | |
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
''') | |
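# Kernel 158 above applies the same per-channel batch-norm input-gradient
# formula as kernel 153 (see _bn_backward_input_sketch), just without the
# fused gating multiply and writing to a fresh output instead of in place;
# 9.964923469387754e-06 is again 1/(128*28*28).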
# kernel path: /tmp/torchinductor_shunting/ip/cipdkjyk2onxl3m3jynzjqksmm63qjqd3qbbbq47kgdhlig5ktev.py | |
# Original ATen: aten._to_copy | |
# aten._to_copy => convert_element_type_686 | |
triton_poi_fused__to_copy_159 = async_compile.triton(''' | |
import triton | |
import triton.language as tl | |
from torch._inductor.ir import ReductionHint | |
from torch._inductor.ir import TileHint | |
from torch._inductor.triton_heuristics import pointwise | |
from torch._inductor.utils import instance_descriptor | |
from torch._dynamo.testing import rand_strided | |
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
import torch | |
from torch._inductor.triton_heuristics import grid | |
@pointwise(size_hints=[8192], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]}) | |
@triton.jit | |
def triton_poi_fused__to_copy_159(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr): | |
xnumel = 4704 | |
xoffset = tl.program_id(0) * XBLOCK | |
xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
xmask = xindex < xnumel | |
x0 = xindex | |
tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32) | |
tmp1 = tmp0.to(tl.float32) | |
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask) | |
def get_args(): | |
arg_0 = rand_strided((28, 168, 1, 1), (168, 1, 1, 1), device='cuda:0', dtype=torch.float16) | |
arg_1 = rand_strided((28, 168, 1, 1), (168, 1, 1, 1), device='cuda:0', dtype=torch.float32) | |
return arg_0, arg_1, | |
def call(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
stream0 = get_cuda_stream(0) | |
triton_poi_fused__to_copy_159.run(*args, 4704, grid=grid(4704), stream=stream0) | |
def benchmark_all_configs(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
return triton_poi_fused__to_copy_159.benchmark_all_configs(*args, 4704, grid=grid(4704)) | |
if __name__ == '__main__': | |
from torch._inductor.utils import get_num_bytes | |
from triton.testing import do_bench | |
args = get_args() | |
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9 | |
gb_per_s = num_gb / (ms / 1e3) | |
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
''') | |
# kernel path: /tmp/torchinductor_shunting/hk/chko7h4vzts7m5h2r3fcixp646xti7wwu6pppgto4mg3xqc3ieno.py | |
# Original ATen: aten.cat | |
# aten.cat => cat_67 | |
triton_poi_fused_cat_160 = async_compile.triton(''' | |
import triton | |
import triton.language as tl | |
from torch._inductor.ir import ReductionHint | |
from torch._inductor.ir import TileHint | |
from torch._inductor.triton_heuristics import pointwise | |
from torch._inductor.utils import instance_descriptor | |
from torch._dynamo.testing import rand_strided | |
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
import torch | |
from torch._inductor.triton_heuristics import grid | |
@pointwise(size_hints=[33554432], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]}) | |
@triton.jit | |
def triton_poi_fused_cat_160(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr): | |
xnumel = 16859136 | |
xoffset = tl.program_id(0) * XBLOCK | |
xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
xmask = xindex < xnumel | |
x2 = xindex | |
x0 = xindex % 131712 | |
x1 = (xindex // 131712) | |
tmp0 = tl.load(in_ptr0 + (x2), None).to(tl.float32) | |
tl.store(out_ptr0 + (x0 + (263424*x1) + tl.zeros([XBLOCK], tl.int32)), tmp0, None) | |
def get_args(): | |
arg_0 = rand_strided((128, 168, 28, 28), (131712, 784, 28, 1), device='cuda:0', dtype=torch.float16) | |
arg_1 = rand_strided((128, 168, 28, 28), (263424, 784, 28, 1), device='cuda:0', dtype=torch.float16) | |
return arg_0, arg_1, | |
def call(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
stream0 = get_cuda_stream(0) | |
triton_poi_fused_cat_160.run(*args, 16859136, grid=grid(16859136), stream=stream0) | |
def benchmark_all_configs(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
return triton_poi_fused_cat_160.benchmark_all_configs(*args, 16859136, grid=grid(16859136)) | |
if __name__ == '__main__': | |
from torch._inductor.utils import get_num_bytes | |
from triton.testing import do_bench | |
args = get_args() | |
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9 | |
gb_per_s = num_gb / (ms / 1e3) | |
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
''') | |
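# cat_67: this copy kernel writes a contiguous (128, 168, 28, 28) gradient
# into channels [0, 168) of a (128, 336, 28, 28) buffer (note the output
# batch stride 263424 = 336*784 vs. the 131712 input batch stride); a
# matching kernel fills channels [168, 336), so the pair amounts to an
# eager torch.cat along dim=1 (a sketch):
import torch

def _cat_sketch(first_half, second_half):  # each (128, 168, 28, 28)
    return torch.cat([first_half, second_half], dim=1)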
# kernel path: /tmp/torchinductor_shunting/wf/cwfwcfiqllapezhy4wsh7mb3i7a2codkip2cnbeie26v347spvpq.py | |
# Original ATen: aten.mul, aten.sigmoid, aten.sigmoid_backward, aten.silu, aten.sum | |
# aten.mul => mul_968 | |
# aten.sigmoid => sigmoid_15 | |
# aten.sigmoid_backward => convert_element_type_688, convert_element_type_689, convert_element_type_690, mul_970, mul_971, sub_258 | |
# aten.silu => convert_element_type_110, convert_element_type_111, mul_153, sigmoid_13 | |
# aten.sum => sum_114 | |
triton_per_fused_mul_sigmoid_sigmoid_backward_silu_sum_161 = async_compile.triton(''' | |
import triton | |
import triton.language as tl | |
from torch._inductor.ir import ReductionHint | |
from torch._inductor.ir import TileHint | |
from torch._inductor.triton_heuristics import persistent_reduction | |
from torch._inductor.utils import instance_descriptor | |
from torch._dynamo.testing import rand_strided | |
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
import torch | |
from torch._inductor.triton_heuristics import grid | |
@persistent_reduction( | |
size_hints=[65536, 1024], | |
reduction_hint=ReductionHint.INNER, | |
filename=__file__, | |
meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp16', 4: 'i32', 5: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5), equal_to_1=())]} | |
) | |
@triton.jit | |
def triton_per_fused_mul_sigmoid_sigmoid_backward_silu_sum_161(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, xnumel, rnumel, XBLOCK : tl.constexpr): | |
xnumel = 43008 | |
rnumel = 784 | |
RBLOCK: tl.constexpr = 1024 | |
xoffset = tl.program_id(0) * XBLOCK | |
xindex = xoffset + tl.arange(0, XBLOCK)[:, None] | |
xmask = xindex < xnumel | |
rindex = tl.arange(0, RBLOCK)[None, :] | |
rmask = rindex < rnumel | |
r1 = rindex | |
x0 = xindex | |
tmp0 = tl.load(in_ptr0 + (r1 + (784*x0)), rmask, other=0).to(tl.float32) | |
tmp1 = tl.load(in_ptr1 + (r1 + (784*x0)), rmask, other=0).to(tl.float32) | |
tmp11 = tl.load(in_ptr2 + (x0), None).to(tl.float32) | |
tmp2 = tmp1.to(tl.float32) | |
tmp3 = tl.sigmoid(tmp2) | |
tmp4 = tmp2 * tmp3 | |
tmp5 = tmp4.to(tl.float32) | |
tmp6 = tmp0 * tmp5 | |
tmp8 = tl.where(rmask, tmp6, 0) | |
tmp9 = tl.sum(tmp8, 1)[:, None] | |
tmp10 = tmp9.to(tl.float32) | |
tmp12 = tl.sigmoid(tmp11) | |
tmp13 = tmp12.to(tl.float32) | |
tmp14 = 1.0 | |
tmp15 = tmp14 - tmp13 | |
tmp16 = tmp13 * tmp15 | |
tmp17 = tmp10 * tmp16 | |
tmp18 = tmp17.to(tl.float32) | |
tl.store(in_out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp18, None) | |
def get_args(): | |
arg_0 = rand_strided((128, 336, 1, 1), (336, 1, 1, 1), device='cuda:0', dtype=torch.float16) | |
arg_1 = rand_strided((128, 336, 28, 28), (263424, 784, 28, 1), device='cuda:0', dtype=torch.float16) | |
arg_2 = rand_strided((128, 336, 28, 28), (263424, 784, 28, 1), device='cuda:0', dtype=torch.float16) | |
arg_3 = rand_strided((128, 336, 1, 1), (336, 1, 1, 1), device='cuda:0', dtype=torch.float16) | |
return arg_0, arg_1, arg_2, arg_3, | |
def call(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
stream0 = get_cuda_stream(0) | |
triton_per_fused_mul_sigmoid_sigmoid_backward_silu_sum_161.run(*args, 43008, 784, grid=grid(43008), stream=stream0) | |
def benchmark_all_configs(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
return triton_per_fused_mul_sigmoid_sigmoid_backward_silu_sum_161.benchmark_all_configs(*args, 43008, 784, grid=grid(43008)) | |
if __name__ == '__main__': | |
from torch._inductor.utils import get_num_bytes | |
from triton.testing import do_bench | |
args = get_args() | |
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9 | |
gb_per_s = num_gb / (ms / 1e3) | |
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
''') | |
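# Fused squeeze-and-excite gate backward: per (n, c) the kernel reduces
# grad_out * silu(x) over the 28*28 spatial positions, then multiplies by
# the sigmoid derivative at the gate's pre-activation (in_ptr2). Eager
# sketch of the same computation:
import torch

def _se_gate_backward_sketch(grad_out, x, gate_logits):
    s = (grad_out * torch.nn.functional.silu(x)).sum(dim=(2, 3), keepdim=True)
    sig = torch.sigmoid(gate_logits)  # gate_logits: (N, C, 1, 1)
    return s * sig * (1.0 - sig)      # written back into in_out_ptr0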
# kernel path: /tmp/torchinductor_shunting/h6/ch66mav6cpspgg34bc2ckyogaizco43c7rwo2n4rkshr5666vpid.py | |
# Original ATen: aten._to_copy | |
# aten._to_copy => convert_element_type_691 | |
triton_poi_fused__to_copy_162 = async_compile.triton(''' | |
import triton | |
import triton.language as tl | |
from torch._inductor.ir import ReductionHint | |
from torch._inductor.ir import TileHint | |
from torch._inductor.triton_heuristics import pointwise | |
from torch._inductor.utils import instance_descriptor | |
from torch._dynamo.testing import rand_strided | |
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
import torch | |
from torch._inductor.triton_heuristics import grid | |
@pointwise(size_hints=[16384], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]}) | |
@triton.jit | |
def triton_poi_fused__to_copy_162(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr): | |
xnumel = 9408 | |
xoffset = tl.program_id(0) * XBLOCK | |
xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
xmask = xindex < xnumel | |
x0 = xindex | |
tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32) | |
tmp1 = tmp0.to(tl.float32) | |
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask) | |
def get_args(): | |
arg_0 = rand_strided((336, 28, 1, 1), (28, 1, 1, 1), device='cuda:0', dtype=torch.float16) | |
arg_1 = rand_strided((336, 28, 1, 1), (28, 1, 1, 1), device='cuda:0', dtype=torch.float32) | |
return arg_0, arg_1, | |
def call(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
stream0 = get_cuda_stream(0) | |
triton_poi_fused__to_copy_162.run(*args, 9408, grid=grid(9408), stream=stream0) | |
def benchmark_all_configs(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
return triton_poi_fused__to_copy_162.benchmark_all_configs(*args, 9408, grid=grid(9408)) | |
if __name__ == '__main__': | |
from torch._inductor.utils import get_num_bytes | |
from triton.testing import do_bench | |
args = get_args() | |
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9 | |
gb_per_s = num_gb / (ms / 1e3) | |
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
''') | |
# kernel path: /tmp/torchinductor_shunting/fi/cfibawhd6jznqxl7q2atcugp7zx43kt5jtkntbghy42ljr2ytlun.py | |
# Original ATen: aten.add, aten.clone, aten.fill, aten.mul, aten.sigmoid, aten.sub | |
# aten.add => add_361 | |
# aten.clone => clone_11 | |
# aten.fill => full_like_36 | |
# aten.mul => mul_972, mul_973, mul_974 | |
# aten.sigmoid => sigmoid_100 | |
# aten.sub => sub_259 | |
triton_poi_fused_add_clone_fill_mul_sigmoid_sub_163 = async_compile.triton(''' | |
import triton | |
import triton.language as tl | |
from torch._inductor.ir import ReductionHint | |
from torch._inductor.ir import TileHint | |
from torch._inductor.triton_heuristics import pointwise | |
from torch._inductor.utils import instance_descriptor | |
from torch._dynamo.testing import rand_strided | |
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
import torch | |
from torch._inductor.triton_heuristics import grid | |
@pointwise(size_hints=[4096], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]}) | |
@triton.jit | |
def triton_poi_fused_add_clone_fill_mul_sigmoid_sub_163(in_out_ptr0, in_ptr0, xnumel, XBLOCK : tl.constexpr): | |
xnumel = 3584 | |
xoffset = tl.program_id(0) * XBLOCK | |
xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
xmask = xindex < xnumel | |
x0 = xindex | |
tmp0 = tl.load(in_out_ptr0 + (x0), xmask).to(tl.float32) | |
tmp1 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32) | |
tmp2 = tl.sigmoid(tmp1) | |
tmp3 = 1.0 | |
tmp4 = tmp3 - tmp2 | |
tmp5 = tmp1 * tmp4 | |
tmp6 = tmp5 + tmp3 | |
tmp7 = tmp2 * tmp6 | |
tmp8 = tmp0 * tmp7 | |
tl.store(in_out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp8, xmask) | |
def get_args(): | |
arg_0 = rand_strided((128, 28, 1, 1), (28, 1, 1, 1), device='cuda:0', dtype=torch.float16) | |
arg_1 = rand_strided((128, 28, 1, 1), (28, 1, 1, 1), device='cuda:0', dtype=torch.float16) | |
return arg_0, arg_1, | |
def call(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
stream0 = get_cuda_stream(0) | |
triton_poi_fused_add_clone_fill_mul_sigmoid_sub_163.run(*args, 3584, grid=grid(3584), stream=stream0) | |
def benchmark_all_configs(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
return triton_poi_fused_add_clone_fill_mul_sigmoid_sub_163.benchmark_all_configs(*args, 3584, grid=grid(3584)) | |
if __name__ == '__main__': | |
from torch._inductor.utils import get_num_bytes | |
from triton.testing import do_bench | |
args = get_args() | |
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9 | |
gb_per_s = num_gb / (ms / 1e3) | |
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
''') | |
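# In-place SiLU/Swish backward on the 28-channel squeeze activations:
# d/dx [x * sigmoid(x)] = sigmoid(x) * (1 + x * (1 - sigmoid(x))),
# which is what tmp2..tmp7 compute above. Eager sketch:
import torch

def _silu_backward_sketch(grad, x):
    sig = torch.sigmoid(x)
    return grad * sig * (1.0 + x * (1.0 - sig))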
# kernel path: /tmp/torchinductor_shunting/yv/cyvint5v6fejjcxz54v5axwxdzytfpjmq7eoqsppyu45zsqim3l2.py | |
# Original ATen: aten._to_copy, aten.convolution_backward | |
# aten._to_copy => convert_element_type_694 | |
# aten.convolution_backward => sum_116 | |
triton_per_fused__to_copy_convolution_backward_164 = async_compile.triton(''' | |
import triton | |
import triton.language as tl | |
from torch._inductor.ir import ReductionHint | |
from torch._inductor.ir import TileHint | |
from torch._inductor.triton_heuristics import persistent_reduction | |
from torch._inductor.utils import instance_descriptor | |
from torch._dynamo.testing import rand_strided | |
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
import torch | |
from torch._inductor.triton_heuristics import grid | |
@persistent_reduction( | |
size_hints=[32, 128], | |
reduction_hint=ReductionHint.INNER, | |
filename=__file__, | |
meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32', 3: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 3), equal_to_1=())]} | |
) | |
@triton.jit | |
def triton_per_fused__to_copy_convolution_backward_164(in_ptr0, out_ptr1, xnumel, rnumel, XBLOCK : tl.constexpr): | |
xnumel = 28 | |
rnumel = 128 | |
RBLOCK: tl.constexpr = 128 | |
xoffset = tl.program_id(0) * XBLOCK | |
xindex = xoffset + tl.arange(0, XBLOCK)[:, None] | |
xmask = xindex < xnumel | |
rindex = tl.arange(0, RBLOCK)[None, :] | |
rmask = rindex < rnumel | |
r1 = rindex | |
x0 = xindex | |
tmp0 = tl.load(in_ptr0 + (x0 + (28*r1)), rmask & xmask, other=0).to(tl.float32) | |
tmp2 = tl.where(rmask & xmask, tmp0, 0) | |
tmp3 = tl.sum(tmp2, 1)[:, None] | |
tmp4 = tmp3.to(tl.float32) | |
tl.store(out_ptr1 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp4, xmask) | |
def get_args(): | |
arg_0 = rand_strided((128, 28, 1, 1), (28, 1, 1, 1), device='cuda:0', dtype=torch.float16) | |
arg_1 = rand_strided((28,), (1,), device='cuda:0', dtype=torch.float32) | |
return arg_0, arg_1, | |
def call(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
stream0 = get_cuda_stream(0) | |
triton_per_fused__to_copy_convolution_backward_164.run(*args, 28, 128, grid=grid(28), stream=stream0) | |
def benchmark_all_configs(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
return triton_per_fused__to_copy_convolution_backward_164.benchmark_all_configs(*args, 28, 128, grid=grid(28)) | |
if __name__ == '__main__': | |
from torch._inductor.utils import get_num_bytes | |
from triton.testing import do_bench | |
args = get_args() | |
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9 | |
gb_per_s = num_gb / (ms / 1e3) | |
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
''') | |
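# Convolution-backward bias term for the SE reduce conv: the (128, 28, 1, 1)
# gradient is summed over the batch dimension in fp32 (spatial dims are
# already 1x1 here). Sketch:
def _conv_bias_grad_sketch(grad):  # (128, 28, 1, 1) fp16
    return grad.float().sum(dim=(0, 2, 3))  # -> (28,) fp32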
# kernel path: /tmp/torchinductor_shunting/pk/cpkfpj7moawggcqkv5ok3cwxbz2nv6myxthuduj53xnw6qw3dwig.py | |
# Original ATen: aten._to_copy | |
# aten._to_copy => convert_element_type_693 | |
triton_poi_fused__to_copy_165 = async_compile.triton(''' | |
import triton | |
import triton.language as tl | |
from torch._inductor.ir import ReductionHint | |
from torch._inductor.ir import TileHint | |
from torch._inductor.triton_heuristics import pointwise | |
from torch._inductor.utils import instance_descriptor | |
from torch._dynamo.testing import rand_strided | |
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
import torch | |
from torch._inductor.triton_heuristics import grid | |
@pointwise(size_hints=[16384], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]}) | |
@triton.jit | |
def triton_poi_fused__to_copy_165(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr): | |
xnumel = 9408 | |
xoffset = tl.program_id(0) * XBLOCK | |
xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
xmask = xindex < xnumel | |
x0 = xindex | |
tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32) | |
tmp1 = tmp0.to(tl.float32) | |
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask) | |
def get_args(): | |
arg_0 = rand_strided((28, 336, 1, 1), (336, 1, 1, 1), device='cuda:0', dtype=torch.float16) | |
arg_1 = rand_strided((28, 336, 1, 1), (336, 1, 1, 1), device='cuda:0', dtype=torch.float32) | |
return arg_0, arg_1, | |
def call(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
stream0 = get_cuda_stream(0) | |
triton_poi_fused__to_copy_165.run(*args, 9408, grid=grid(9408), stream=stream0) | |
def benchmark_all_configs(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
return triton_poi_fused__to_copy_165.benchmark_all_configs(*args, 9408, grid=grid(9408)) | |
if __name__ == '__main__': | |
from torch._inductor.utils import get_num_bytes | |
from triton.testing import do_bench | |
args = get_args() | |
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9 | |
gb_per_s = num_gb / (ms / 1e3) | |
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
''') | |
# kernel path: /tmp/torchinductor_shunting/q7/cq7zl54kvzo64w2eq2dfuf5wrsv6grvd6fo3lgq77a5kcc7cs6ek.py | |
# Original ATen: aten._native_batch_norm_legit_functional, aten.add, aten.clone, aten.div, aten.fill, aten.mul, aten.native_batch_norm_backward, aten.sigmoid, aten.sub | |
# aten._native_batch_norm_legit_functional => convert_element_type_108 | |
# aten.add => add_362, add_363 | |
# aten.clone => clone_10 | |
# aten.div => div_13 | |
# aten.fill => full_like_37 | |
# aten.mul => mul_969, mul_975, mul_976, mul_977 | |
# aten.native_batch_norm_backward => convert_element_type_695, mul_978, mul_986, sub_261, sum_117, sum_118 | |
# aten.sigmoid => sigmoid_101, sigmoid_15 | |
# aten.sub => sub_260 | |
triton_red_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_166 = async_compile.triton(''' | |
import triton | |
import triton.language as tl | |
from torch._inductor.ir import ReductionHint | |
from torch._inductor.ir import TileHint | |
from torch._inductor.triton_heuristics import reduction | |
from torch._inductor.utils import instance_descriptor | |
from torch._dynamo.testing import rand_strided | |
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
import torch | |
from torch._inductor.triton_heuristics import grid | |
@reduction( | |
size_hints=[512, 131072], | |
reduction_hint=ReductionHint.INNER, | |
filename=__file__, | |
meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp16', 4: '*fp16', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: '*fp32', 9: '*fp32', 10: 'i32', 11: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11), equal_to_1=())]} | |
) | |
@triton.jit | |
def triton_red_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_166(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, in_ptr6, out_ptr0, out_ptr1, out_ptr2, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr): | |
xnumel = 336 | |
rnumel = 100352 | |
xoffset = tl.program_id(0) * XBLOCK | |
xindex = xoffset + tl.arange(0, XBLOCK)[:, None] | |
xmask = xindex < xnumel | |
rbase = tl.arange(0, RBLOCK)[None, :] | |
x0 = xindex | |
_tmp17 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0 | |
tmp20 = tl.load(in_ptr5 + (x0), xmask) | |
_tmp23 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0 | |
for roffset in range(0, rnumel, RBLOCK): | |
rindex = roffset + rbase | |
rmask = rindex < rnumel | |
r1 = rindex % 784 | |
r2 = (rindex // 784) | |
tmp0 = tl.load(in_ptr0 + (r1 + (784*x0) + (263424*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32) | |
tmp1 = tl.load(in_ptr1 + (x0 + (336*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32) | |
tmp4 = tl.load(in_ptr2 + (x0 + (336*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32) | |
tmp8 = tl.load(in_ptr3 + (r1 + (784*x0) + (263424*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32) | |
tmp18 = tl.load(in_ptr4 + (r1 + (784*x0) + (263424*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32) | |
tmp2 = tl.sigmoid(tmp1) | |
tmp3 = tmp0 * tmp2 | |
tmp5 = 784.0 | |
tmp6 = tmp4 / tmp5 | |
tmp7 = tmp3 + tmp6 | |
tmp9 = tl.sigmoid(tmp8) | |
tmp10 = 1.0 | |
tmp11 = tmp10 - tmp9 | |
tmp12 = tmp8 * tmp11 | |
tmp13 = tmp12 + tmp10 | |
tmp14 = tmp9 * tmp13 | |
tmp15 = tmp7 * tmp14 | |
tmp16 = tmp15.to(tl.float32) | |
_tmp17 = tl.where(rmask & xmask, _tmp17 + tmp16, _tmp17) | |
tmp19 = tmp18.to(tl.float32) | |
tmp21 = tmp19 - tmp20 | |
tmp22 = tmp16 * tmp21 | |
_tmp23 = tl.where(rmask & xmask, _tmp23 + tmp22, _tmp23) | |
tmp17 = tl.sum(_tmp17, 1)[:, None] | |
tl.store(out_ptr0 + x0, tmp17, xmask) | |
tmp23 = tl.sum(_tmp23, 1)[:, None] | |
tl.store(out_ptr1 + x0, tmp23, xmask) | |
tmp24 = tl.load(in_ptr6 + (x0), xmask) | |
tmp25 = tmp23 * tmp24 | |
tl.store(out_ptr2 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp25, xmask) | |
def get_args(): | |
arg_0 = rand_strided((128, 336, 28, 28), (263424, 784, 28, 1), device='cuda:0', dtype=torch.float16) | |
arg_1 = rand_strided((128, 336, 1, 1), (336, 1, 1, 1), device='cuda:0', dtype=torch.float16) | |
arg_2 = rand_strided((128, 336, 1, 1), (336, 1, 1, 1), device='cuda:0', dtype=torch.float16) | |
arg_3 = rand_strided((128, 336, 28, 28), (263424, 784, 28, 1), device='cuda:0', dtype=torch.float16) | |
arg_4 = rand_strided((128, 336, 28, 28), (263424, 784, 28, 1), device='cuda:0', dtype=torch.float16) | |
arg_5 = rand_strided((1, 336, 1, 1), (336, 1, 1, 1), device='cuda:0', dtype=torch.float32) | |
arg_6 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32) | |
arg_7 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32) | |
arg_8 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32) | |
arg_9 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32) | |
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7, arg_8, arg_9, | |
def call(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
stream0 = get_cuda_stream(0) | |
triton_red_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_166.run(*args, 336, 100352, grid=grid(336), stream=stream0) | |
def benchmark_all_configs(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
return triton_red_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_166.benchmark_all_configs(*args, 336, 100352, grid=grid(336)) | |
if __name__ == '__main__': | |
from torch._inductor.utils import get_num_bytes | |
from triton.testing import do_bench | |
args = get_args() | |
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9 | |
gb_per_s = num_gb / (ms / 1e3) | |
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
''') | |
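# The widest fusion in this stretch: in one reduction pass kernel 166
# (1) rebuilds the gradient at the SiLU input -- the main path scaled by
#     the SE gate, sigmoid(in_ptr1), plus the average-pool branch
#     in_ptr2 / 784,
# (2) applies the SiLU derivative at in_ptr3, and
# (3) accumulates the two batch-norm backward sums against in_ptr4/in_ptr5,
#     with out_ptr2 = sum_gx * in_ptr6 (presumably grad_weight via invstd).
# Eager sketch of steps (1)-(3); gate_logits/grad_pool are (N, C, 1, 1),
# mean is (1, C, 1, 1), invstd is (C,):
import torch

def _fused_se_silu_bn_sums_sketch(grad_out, gate_logits, grad_pool,
                                  silu_in, bn_in, mean, invstd):
    g = grad_out * torch.sigmoid(gate_logits) + grad_pool / 784.0
    sig = torch.sigmoid(silu_in)
    g = g * sig * (1.0 + silu_in * (1.0 - sig))
    sum_g = g.float().sum(dim=(0, 2, 3))
    sum_gx = (g.float() * (bn_in.float() - mean)).sum(dim=(0, 2, 3))
    return sum_g, sum_gx, sum_gx * invstd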
# kernel path: /tmp/torchinductor_shunting/rc/crcc3il2qglpofbslbuwlevrp5dihafxcr4wxlvjfxaasqpi56hu.py | |
# Original ATen: aten._native_batch_norm_legit_functional, aten.add, aten.clone, aten.div, aten.fill, aten.mul, aten.native_batch_norm_backward, aten.sigmoid, aten.sub | |
# aten._native_batch_norm_legit_functional => convert_element_type_108 | |
# aten.add => add_362, add_363 | |
# aten.clone => clone_10 | |
# aten.div => div_13 | |
# aten.fill => full_like_37 | |
# aten.mul => mul_969, mul_975, mul_976, mul_977 | |
# aten.native_batch_norm_backward => convert_element_type_695, mul_984, sub_261, sub_263, sub_264 | |
# aten.sigmoid => sigmoid_101, sigmoid_15 | |
# aten.sub => sub_260 | |
triton_poi_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_167 = async_compile.triton(''' | |
import triton | |
import triton.language as tl | |
from torch._inductor.ir import ReductionHint | |
from torch._inductor.ir import TileHint | |
from torch._inductor.triton_heuristics import pointwise | |
from torch._inductor.utils import instance_descriptor | |
from torch._dynamo.testing import rand_strided | |
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
import torch | |
from torch._inductor.triton_heuristics import grid | |
@pointwise(size_hints=[67108864], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp16', 4: '*fp16', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: '*fp32', 9: '*fp32', 10: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10), equal_to_1=())]}) | |
@triton.jit | |
def triton_poi_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_167(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, in_ptr6, in_ptr7, in_ptr8, out_ptr0, xnumel, XBLOCK : tl.constexpr): | |
xnumel = 33718272 | |
xoffset = tl.program_id(0) * XBLOCK | |
xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
xmask = xindex < xnumel | |
x3 = xindex | |
x4 = (xindex // 784) | |
x1 = (xindex // 784) % 336 | |
tmp0 = tl.load(in_ptr0 + (x3), None).to(tl.float32) | |
tmp1 = tl.load(in_ptr1 + (x4), None).to(tl.float32) | |
tmp4 = tl.load(in_ptr2 + (x4), None).to(tl.float32) | |
tmp8 = tl.load(in_ptr3 + (x3), None).to(tl.float32) | |
tmp17 = tl.load(in_ptr4 + (x3), None).to(tl.float32) | |
tmp19 = tl.load(in_ptr5 + (x1), None) | |
tmp21 = tl.load(in_ptr6 + (x1), None) | |
tmp24 = tl.load(in_ptr7 + (x1), None) | |
tmp29 = tl.load(in_ptr8 + (x1), None) | |
tmp2 = tl.sigmoid(tmp1) | |
tmp3 = tmp0 * tmp2 | |
tmp5 = 784.0 | |
tmp6 = tmp4 / tmp5 | |
tmp7 = tmp3 + tmp6 | |
tmp9 = tl.sigmoid(tmp8) | |
tmp10 = 1.0 | |
tmp11 = tmp10 - tmp9 | |
tmp12 = tmp8 * tmp11 | |
tmp13 = tmp12 + tmp10 | |
tmp14 = tmp9 * tmp13 | |
tmp15 = tmp7 * tmp14 | |
tmp16 = tmp15.to(tl.float32) | |
tmp18 = tmp17.to(tl.float32) | |
tmp20 = tmp18 - tmp19 | |
tmp22 = 9.964923469387754e-06 | |
tmp23 = tmp21 * tmp22 | |
tmp25 = tmp24 * tmp24 | |
tmp26 = tmp23 * tmp25 | |
tmp27 = tmp20 * tmp26 | |
tmp28 = tmp16 - tmp27 | |
tmp30 = tmp29 * tmp22 | |
tmp31 = tmp28 - tmp30 | |
tl.store(out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp31, None) | |
def get_args(): | |
arg_0 = rand_strided((128, 336, 28, 28), (263424, 784, 28, 1), device='cuda:0', dtype=torch.float16) | |
arg_1 = rand_strided((128, 336, 1, 1), (336, 1, 1, 1), device='cuda:0', dtype=torch.float16) | |
arg_2 = rand_strided((128, 336, 1, 1), (336, 1, 1, 1), device='cuda:0', dtype=torch.float16) | |
arg_3 = rand_strided((128, 336, 28, 28), (263424, 784, 28, 1), device='cuda:0', dtype=torch.float16) | |
arg_4 = rand_strided((128, 336, 28, 28), (263424, 784, 28, 1), device='cuda:0', dtype=torch.float16) | |
arg_5 = rand_strided((1, 336, 1, 1), (336, 1, 1, 1), device='cuda:0', dtype=torch.float32) | |
arg_6 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32) | |
arg_7 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32) | |
arg_8 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32) | |
arg_9 = rand_strided((128, 336, 28, 28), (263424, 784, 28, 1), device='cuda:0', dtype=torch.float32) | |
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7, arg_8, arg_9, | |
def call(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
stream0 = get_cuda_stream(0) | |
triton_poi_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_167.run(*args, 33718272, grid=grid(33718272), stream=stream0) | |
def benchmark_all_configs(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
return triton_poi_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_167.benchmark_all_configs(*args, 33718272, grid=grid(33718272)) | |
if __name__ == '__main__': | |
from torch._inductor.utils import get_num_bytes | |
from triton.testing import do_bench | |
args = get_args() | |
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9 | |
gb_per_s = num_gb / (ms / 1e3) | |
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
''') | |
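# Unlike kernels 153/158, this pointwise stage stops one multiply short of
# the full batch-norm input gradient: it writes
#   g - (x - mean) * invstd**2 * sum_gx / n - sum_g / n
# in fp32 (n = 100352 again) and leaves the final per-channel
# (invstd * gamma) scale to the channel-splitting kernels 168/170 below.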
# kernel path: /tmp/torchinductor_shunting/7y/c7y3vo54x6g4hqnvyjz4k3ngokc66wmd3u7zsbjrmhorsy53o4sb.py | |
# Original ATen: aten.convolution_backward | |
# aten.convolution_backward => convolution_backward_112 | |
triton_poi_fused_convolution_backward_168 = async_compile.triton(''' | |
import triton | |
import triton.language as tl | |
from torch._inductor.ir import ReductionHint | |
from torch._inductor.ir import TileHint | |
from torch._inductor.triton_heuristics import pointwise | |
from torch._inductor.utils import instance_descriptor | |
from torch._dynamo.testing import rand_strided | |
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
import torch | |
from torch._inductor.triton_heuristics import grid | |
@pointwise(size_hints=[33554432], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp32', 2: '*fp32', 3: '*fp16', 4: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4), equal_to_1=())]}) | |
@triton.jit | |
def triton_poi_fused_convolution_backward_168(in_ptr0, in_ptr1, in_ptr2, out_ptr0, xnumel, XBLOCK : tl.constexpr): | |
xnumel = 16859136 | |
xoffset = tl.program_id(0) * XBLOCK | |
xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
xmask = xindex < xnumel | |
x2 = (xindex // 131712) | |
x3 = xindex % 131712 | |
x1 = (xindex // 784) % 168 | |
x4 = xindex | |
tmp0 = tl.load(in_ptr0 + (131712 + x3 + (263424*x2)), None) | |
tmp1 = tl.load(in_ptr1 + (168 + x1), None) | |
tmp2 = tl.load(in_ptr2 + (168 + x1), None) | |
tmp3 = tmp1 * tmp2 | |
tmp4 = tmp0 * tmp3 | |
tmp5 = tmp4.to(tl.float32) | |
tl.store(out_ptr0 + (x4 + tl.zeros([XBLOCK], tl.int32)), tmp5, None) | |
def get_args(): | |
arg_0 = rand_strided((128, 336, 28, 28), (263424, 784, 28, 1), device='cuda:0', dtype=torch.float32) | |
arg_1 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32) | |
arg_2 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32) | |
arg_3 = rand_strided((128, 168, 28, 28), (131712, 784, 28, 1), device='cuda:0', dtype=torch.float16) | |
return arg_0, arg_1, arg_2, arg_3, | |
def call(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
stream0 = get_cuda_stream(0) | |
triton_poi_fused_convolution_backward_168.run(*args, 16859136, grid=grid(16859136), stream=stream0) | |
def benchmark_all_configs(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
return triton_poi_fused_convolution_backward_168.benchmark_all_configs(*args, 16859136, grid=grid(16859136)) | |
if __name__ == '__main__': | |
from torch._inductor.utils import get_num_bytes | |
from triton.testing import do_bench | |
args = get_args() | |
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9 | |
gb_per_s = num_gb / (ms / 1e3) | |
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
''') | |
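# (Kernel 168 handles channels [168, 336) -- note the `168 + x1` and
# `131712 + x3` offsets; its twin, kernel 170 below, handles channels
# [0, 168). See the sketch after kernel 170.)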
# kernel path: /tmp/torchinductor_shunting/6n/c6nredjp54hkor7tbo72xhpqqat3vpewekmkr4kkyypw6cgqm6ai.py | |
# Original ATen: aten._to_copy | |
# aten._to_copy => convert_element_type_698 | |
triton_poi_fused__to_copy_169 = async_compile.triton(''' | |
import triton | |
import triton.language as tl | |
from torch._inductor.ir import ReductionHint | |
from torch._inductor.ir import TileHint | |
from torch._inductor.triton_heuristics import pointwise | |
from torch._inductor.utils import instance_descriptor | |
from torch._dynamo.testing import rand_strided | |
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
import torch | |
from torch._inductor.triton_heuristics import grid | |
@pointwise(size_hints=[8192], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1), equal_to_1=())]}) | |
@triton.jit | |
def triton_poi_fused__to_copy_169(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr): | |
xnumel = 4200 | |
xoffset = tl.program_id(0) * XBLOCK | |
xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
xmask = xindex < xnumel | |
x0 = xindex | |
tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32) | |
tmp1 = tmp0.to(tl.float32) | |
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask) | |
def get_args(): | |
arg_0 = rand_strided((168, 1, 5, 5), (25, 25, 5, 1), device='cuda:0', dtype=torch.float16) | |
arg_1 = rand_strided((168, 1, 5, 5), (25, 25, 5, 1), device='cuda:0', dtype=torch.float32) | |
return arg_0, arg_1, | |
def call(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
stream0 = get_cuda_stream(0) | |
triton_poi_fused__to_copy_169.run(*args, 4200, grid=grid(4200), stream=stream0) | |
def benchmark_all_configs(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
return triton_poi_fused__to_copy_169.benchmark_all_configs(*args, 4200, grid=grid(4200)) | |
if __name__ == '__main__': | |
from torch._inductor.utils import get_num_bytes | |
from triton.testing import do_bench | |
args = get_args() | |
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9 | |
gb_per_s = num_gb / (ms / 1e3) | |
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
''') | |
# kernel path: /tmp/torchinductor_shunting/ia/ciabi2inw7dvbjd5nathz3xfqaodfqjylvjwgsdeimxatolwwqks.py | |
# Original ATen: aten.convolution_backward | |
# aten.convolution_backward => convolution_backward_113 | |
triton_poi_fused_convolution_backward_170 = async_compile.triton(''' | |
import triton | |
import triton.language as tl | |
from torch._inductor.ir import ReductionHint | |
from torch._inductor.ir import TileHint | |
from torch._inductor.triton_heuristics import pointwise | |
from torch._inductor.utils import instance_descriptor | |
from torch._dynamo.testing import rand_strided | |
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
import torch | |
from torch._inductor.triton_heuristics import grid | |
@pointwise(size_hints=[33554432], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp32', 2: '*fp32', 3: '*fp16', 4: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4), equal_to_1=())]}) | |
@triton.jit | |
def triton_poi_fused_convolution_backward_170(in_ptr0, in_ptr1, in_ptr2, out_ptr0, xnumel, XBLOCK : tl.constexpr): | |
xnumel = 16859136 | |
xoffset = tl.program_id(0) * XBLOCK | |
xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
xmask = xindex < xnumel | |
x2 = (xindex // 131712) | |
x3 = xindex % 131712 | |
x1 = (xindex // 784) % 168 | |
x4 = xindex | |
tmp0 = tl.load(in_ptr0 + (x3 + (263424*x2)), None) | |
tmp1 = tl.load(in_ptr1 + (x1), None) | |
tmp2 = tl.load(in_ptr2 + (x1), None) | |
tmp3 = tmp1 * tmp2 | |
tmp4 = tmp0 * tmp3 | |
tmp5 = tmp4.to(tl.float32) | |
tl.store(out_ptr0 + (x4 + tl.zeros([XBLOCK], tl.int32)), tmp5, None) | |
def get_args(): | |
arg_0 = rand_strided((128, 336, 28, 28), (263424, 784, 28, 1), device='cuda:0', dtype=torch.float32) | |
arg_1 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32) | |
arg_2 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32) | |
arg_3 = rand_strided((128, 168, 28, 28), (131712, 784, 28, 1), device='cuda:0', dtype=torch.float16) | |
return arg_0, arg_1, arg_2, arg_3, | |
def call(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
stream0 = get_cuda_stream(0) | |
triton_poi_fused_convolution_backward_170.run(*args, 16859136, grid=grid(16859136), stream=stream0) | |
def benchmark_all_configs(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
return triton_poi_fused_convolution_backward_170.benchmark_all_configs(*args, 16859136, grid=grid(16859136)) | |
if __name__ == '__main__': | |
from torch._inductor.utils import get_num_bytes | |
from triton.testing import do_bench | |
args = get_args() | |
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9 | |
gb_per_s = num_gb / (ms / 1e3) | |
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
''') | |
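# Reference sketch (hand-written): kernel 170 above prepares the grad_output
# for convolution_backward_113. It slices the first 168 of 336 gradient
# channels, scales each channel by the product of the two per-channel fp32
# vectors (only elements 0..167 of which are read), and casts to fp16.
def conv_bwd_prologue_reference(grad, s1, s2):
    # grad: (128, 336, 28, 28) fp32; s1, s2: (336,) fp32, as in get_args()
    scale = (s1[:168] * s2[:168]).view(1, 168, 1, 1)
    return (grad[:, :168] * scale).to(torch.float16)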
# kernel path: /tmp/torchinductor_shunting/c4/cc4n2v6wqwhmmrx6e37v63wrmsqntez34qcciaujxy6fxsa2gk3o.py | |
# Original ATen: aten._to_copy | |
# aten._to_copy => convert_element_type_699 | |
triton_poi_fused__to_copy_171 = async_compile.triton(''' | |
import triton | |
import triton.language as tl | |
from torch._inductor.ir import ReductionHint | |
from torch._inductor.ir import TileHint | |
from torch._inductor.triton_heuristics import pointwise | |
from torch._inductor.utils import instance_descriptor | |
from torch._dynamo.testing import rand_strided | |
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
import torch | |
from torch._inductor.triton_heuristics import grid | |
@pointwise(size_hints=[2048], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1), equal_to_1=())]}) | |
@triton.jit | |
def triton_poi_fused__to_copy_171(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr): | |
xnumel = 1512 | |
xoffset = tl.program_id(0) * XBLOCK | |
xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
xmask = xindex < xnumel | |
x0 = xindex | |
tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32) | |
tmp1 = tmp0.to(tl.float32) | |
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask) | |
def get_args(): | |
arg_0 = rand_strided((168, 1, 3, 3), (9, 9, 3, 1), device='cuda:0', dtype=torch.float16) | |
arg_1 = rand_strided((168, 1, 3, 3), (9, 9, 3, 1), device='cuda:0', dtype=torch.float32) | |
return arg_0, arg_1, | |
def call(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
stream0 = get_cuda_stream(0) | |
triton_poi_fused__to_copy_171.run(*args, 1512, grid=grid(1512), stream=stream0) | |
def benchmark_all_configs(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
return triton_poi_fused__to_copy_171.benchmark_all_configs(*args, 1512, grid=grid(1512)) | |
if __name__ == '__main__': | |
from torch._inductor.utils import get_num_bytes | |
from triton.testing import do_bench | |
args = get_args() | |
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9 | |
gb_per_s = num_gb / (ms / 1e3) | |
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
''') | |
# kernel path: /tmp/torchinductor_shunting/ak/cakbaw2uk6hhpcroelbhhfo57dmqlxwcgkogm2m6ds3sv57t2vqz.py | |
# Original ATen: aten._to_copy | |
# aten._to_copy => convert_element_type_703 | |
triton_poi_fused__to_copy_172 = async_compile.triton(''' | |
import triton | |
import triton.language as tl | |
from torch._inductor.ir import ReductionHint | |
from torch._inductor.ir import TileHint | |
from torch._inductor.triton_heuristics import pointwise | |
from torch._inductor.utils import instance_descriptor | |
from torch._dynamo.testing import rand_strided | |
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
import torch | |
from torch._inductor.triton_heuristics import grid | |
@pointwise(size_hints=[8192], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]}) | |
@triton.jit | |
def triton_poi_fused__to_copy_172(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr): | |
xnumel = 4704 | |
xoffset = tl.program_id(0) * XBLOCK | |
xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
xmask = xindex < xnumel | |
x0 = xindex | |
tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32) | |
tmp1 = tmp0.to(tl.float32) | |
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask) | |
def get_args(): | |
arg_0 = rand_strided((168, 28, 1, 1), (28, 1, 1, 1), device='cuda:0', dtype=torch.float16) | |
arg_1 = rand_strided((168, 28, 1, 1), (28, 1, 1, 1), device='cuda:0', dtype=torch.float32) | |
return arg_0, arg_1, | |
def call(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
stream0 = get_cuda_stream(0) | |
triton_poi_fused__to_copy_172.run(*args, 4704, grid=grid(4704), stream=stream0) | |
def benchmark_all_configs(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
return triton_poi_fused__to_copy_172.benchmark_all_configs(*args, 4704, grid=grid(4704)) | |
if __name__ == '__main__': | |
from torch._inductor.utils import get_num_bytes | |
from triton.testing import do_bench | |
args = get_args() | |
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9 | |
gb_per_s = num_gb / (ms / 1e3) | |
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
''') | |
# kernel path: /tmp/torchinductor_shunting/ow/cowu2kdmjh4mnuy3o2jt5hyxyjhq6tnzxq4b3gah2vdhgpih4okk.py | |
# Original ATen: aten.cat | |
# aten.cat => cat_69 | |
triton_poi_fused_cat_173 = async_compile.triton(''' | |
import triton | |
import triton.language as tl | |
from torch._inductor.ir import ReductionHint | |
from torch._inductor.ir import TileHint | |
from torch._inductor.triton_heuristics import pointwise | |
from torch._inductor.utils import instance_descriptor | |
from torch._dynamo.testing import rand_strided | |
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
import torch | |
from torch._inductor.triton_heuristics import grid | |
@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]}) | |
@triton.jit | |
def triton_poi_fused_cat_173(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr): | |
xnumel = 2809856 | |
xoffset = tl.program_id(0) * XBLOCK | |
xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
xmask = xindex < xnumel | |
x2 = xindex | |
x0 = xindex % 21952 | |
x1 = (xindex // 21952) | |
tmp0 = tl.load(in_ptr0 + (x2), None).to(tl.float32) | |
tl.store(out_ptr0 + (x0 + (43904*x1) + tl.zeros([XBLOCK], tl.int32)), tmp0, None) | |
def get_args(): | |
arg_0 = rand_strided((128, 28, 28, 28), (21952, 784, 28, 1), device='cuda:0', dtype=torch.float16) | |
arg_1 = rand_strided((128, 28, 28, 28), (43904, 784, 28, 1), device='cuda:0', dtype=torch.float16) | |
return arg_0, arg_1, | |
def call(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
stream0 = get_cuda_stream(0) | |
triton_poi_fused_cat_173.run(*args, 2809856, grid=grid(2809856), stream=stream0) | |
def benchmark_all_configs(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
return triton_poi_fused_cat_173.benchmark_all_configs(*args, 2809856, grid=grid(2809856)) | |
if __name__ == '__main__': | |
from torch._inductor.utils import get_num_bytes | |
from triton.testing import do_bench | |
args = get_args() | |
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9 | |
gb_per_s = num_gb / (ms / 1e3) | |
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
''') | |
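# Reference sketch (hand-written): kernel 173 above is one operand of a
# channel-dim aten.cat. It copies a contiguous (128, 28, 28, 28) fp16 tensor
# into the first 28 channels of a (128, 56, 28, 28) fp16 buffer (per-batch
# stride 43904 = 56*28*28); a sibling kernel fills the other 28 channels.
def cat_operand_reference(src, out_buf):
    # src: (128, 28, 28, 28) fp16; out_buf: (128, 56, 28, 28) fp16
    out_buf[:, :28].copy_(src)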
# kernel path: /tmp/torchinductor_shunting/tu/ctuwkh7mmjcoxpqhlozf53j6kx7nu6pcnpqqmwor4d57vrqdsosn.py | |
# Original ATen: aten._native_batch_norm_legit_functional, aten.add, aten.native_batch_norm_backward | |
# aten._native_batch_norm_legit_functional => convert_element_type_98 | |
# aten.add => add_365 | |
# aten.native_batch_norm_backward => convert_element_type_705, mul_999, sub_270, sum_121, sum_122 | |
triton_red_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_174 = async_compile.triton(''' | |
import triton | |
import triton.language as tl | |
from torch._inductor.ir import ReductionHint | |
from torch._inductor.ir import TileHint | |
from torch._inductor.triton_heuristics import reduction | |
from torch._inductor.utils import instance_descriptor | |
from torch._dynamo.testing import rand_strided | |
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
import torch | |
from torch._inductor.triton_heuristics import grid | |
@reduction( | |
size_hints=[1024, 8192], | |
reduction_hint=ReductionHint.INNER, | |
filename=__file__, | |
meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp32', 4: '*fp32', 5: '*fp32', 6: 'i32', 7: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5), equal_to_1=())]} | |
) | |
@triton.jit | |
def triton_red_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_174(in_ptr0, in_ptr1, in_ptr2, in_ptr3, out_ptr0, out_ptr1, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr): | |
xnumel = 728 | |
rnumel = 7720 | |
xoffset = tl.program_id(0) * XBLOCK | |
xindex = xoffset + tl.arange(0, XBLOCK)[:, None] | |
xmask = xindex < xnumel | |
rbase = tl.arange(0, RBLOCK)[None, :] | |
x1 = (xindex // 56) | |
x0 = xindex % 56 | |
_tmp8 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0 | |
x3 = xindex | |
_tmp15 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0 | |
for roffset in range(0, rnumel, RBLOCK): | |
rindex = roffset + rbase | |
rmask = rindex < rnumel | |
r2 = rindex | |
tmp0 = r2 + (7720*x1) | |
tmp1 = 100352 | |
tmp2 = tmp0 < tmp1 | |
tmp3 = tl.load(in_ptr0 + ((784*x0) + (43904*(((r2 + (7720*x1)) // 784) % 128)) + ((r2 + (7720*x1)) % 784) + tl.zeros([XBLOCK, RBLOCK], tl.int32)), rmask & tmp2 & xmask, eviction_policy='evict_last', other=0).to(tl.float32) | |
tmp4 = tl.load(in_ptr1 + ((784*x0) + (43904*(((r2 + (7720*x1)) // 784) % 128)) + ((r2 + (7720*x1)) % 784) + tl.zeros([XBLOCK, RBLOCK], tl.int32)), rmask & tmp2 & xmask, eviction_policy='evict_last', other=0).to(tl.float32) | |
tmp5 = tmp3 + tmp4 | |
tmp6 = tmp5.to(tl.float32) | |
tmp7 = tl.where(tmp2, tmp6, 0) | |
_tmp8 = tl.where(rmask & xmask, _tmp8 + tmp7, _tmp8) | |
tmp9 = tl.load(in_ptr2 + ((784*x0) + (43904*(((r2 + (7720*x1)) // 784) % 128)) + ((r2 + (7720*x1)) % 784) + tl.zeros([XBLOCK, RBLOCK], tl.int32)), rmask & tmp2 & xmask, eviction_policy='evict_last', other=0).to(tl.float32) | |
tmp10 = tmp9.to(tl.float32) | |
tmp11 = tl.load(in_ptr3 + (x0 + tl.zeros([XBLOCK, RBLOCK], tl.int32)), rmask & tmp2 & xmask, eviction_policy='evict_last', other=0) | |
tmp12 = tmp10 - tmp11 | |
tmp13 = tmp6 * tmp12 | |
tmp14 = tl.where(tmp2, tmp13, 0) | |
_tmp15 = tl.where(rmask & xmask, _tmp15 + tmp14, _tmp15) | |
tmp8 = tl.sum(_tmp8, 1)[:, None] | |
tl.store(out_ptr0 + x3, tmp8, xmask) | |
tmp15 = tl.sum(_tmp15, 1)[:, None] | |
tl.store(out_ptr1 + x3, tmp15, xmask) | |
def get_args(): | |
arg_0 = rand_strided((128, 56, 28, 28), (43904, 784, 28, 1), device='cuda:0', dtype=torch.float16) | |
arg_1 = rand_strided((128, 56, 28, 28), (43904, 784, 28, 1), device='cuda:0', dtype=torch.float16) | |
arg_2 = rand_strided((128, 56, 28, 28), (43904, 784, 28, 1), device='cuda:0', dtype=torch.float16) | |
arg_3 = rand_strided((1, 56, 1, 1), (56, 1, 1, 1), device='cuda:0', dtype=torch.float32) | |
arg_4 = rand_strided((56, 13), (1, 56), device='cuda:0', dtype=torch.float32) | |
arg_5 = rand_strided((56, 13), (1, 56), device='cuda:0', dtype=torch.float32) | |
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, | |
def call(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
stream0 = get_cuda_stream(0) | |
triton_red_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_174.run(*args, 728, 7720, grid=grid(728), stream=stream0) | |
def benchmark_all_configs(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
return triton_red_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_174.benchmark_all_configs(*args, 728, 7720, grid=grid(728)) | |
if __name__ == '__main__': | |
from torch._inductor.utils import get_num_bytes | |
from triton.testing import do_bench | |
args = get_args() | |
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9 | |
gb_per_s = num_gb / (ms / 1e3) | |
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
''') | |
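# Reference sketch (hand-written): kernel 174 above is stage one of a split
# batch-norm backward reduction. For each of the 56 channels it accumulates
# sum(g) and sum(g * (x - mean)) with g = grad1 + grad2 (the fused aten.add),
# splitting the 128*28*28 = 100352 reduction domain into 13 chunks of 7720
# (hence the `tmp0 < 100352` bounds check); a later kernel folds the
# resulting (56, 13) partials into per-channel totals.
def bn_bwd_partial_sums_reference(g1, g2, x, mean):
    # g1, g2, x: (128, 56, 28, 28) fp16; mean: (1, 56, 1, 1) fp32
    g = (g1 + g2).float()
    sum_g = g.sum(dim=(0, 2, 3))                            # -> (56,)
    sum_gxmu = (g * (x.float() - mean)).sum(dim=(0, 2, 3))  # -> (56,)
    return sum_g, sum_gxmu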
# kernel path: /tmp/torchinductor_shunting/3q/c3qu2jno53qez6kspx5ta57fq4vavwnpqfdqbctsz2h5gfllev2o.py | |
# Original ATen: aten._native_batch_norm_legit_functional, aten.add, aten.native_batch_norm_backward | |
# aten._native_batch_norm_legit_functional => convert_element_type_98 | |
# aten.add => add_365 | |
# aten.native_batch_norm_backward => convert_element_type_705, convert_element_type_707, mul_1005, mul_1006, sub_270, sub_272, sub_273 | |
triton_poi_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_175 = async_compile.triton(''' | |
import triton | |
import triton.language as tl | |
from torch._inductor.ir import ReductionHint | |
from torch._inductor.ir import TileHint | |
from torch._inductor.triton_heuristics import pointwise | |
from torch._inductor.utils import instance_descriptor | |
from torch._dynamo.testing import rand_strided | |
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
import torch | |
from torch._inductor.triton_heuristics import grid | |
@pointwise(size_hints=[8388608], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp32', 4: '*fp32', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: '*fp16', 9: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9), equal_to_1=())]}) | |
@triton.jit | |
def triton_poi_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_175(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, in_ptr6, in_ptr7, out_ptr0, xnumel, XBLOCK : tl.constexpr): | |
xnumel = 5619712 | |
xoffset = tl.program_id(0) * XBLOCK | |
xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
xmask = xindex < xnumel | |
x3 = xindex | |
x1 = (xindex // 784) % 56 | |
tmp0 = tl.load(in_ptr0 + (x3), None).to(tl.float32) | |
tmp1 = tl.load(in_ptr1 + (x3), None).to(tl.float32) | |
tmp4 = tl.load(in_ptr2 + (x3), None).to(tl.float32) | |
tmp6 = tl.load(in_ptr3 + (x1), None) | |
tmp8 = tl.load(in_ptr4 + (x1), None) | |
tmp11 = tl.load(in_ptr5 + (x1), None) | |
tmp16 = tl.load(in_ptr6 + (x1), None) | |
tmp19 = tl.load(in_ptr7 + (x1), None) | |
tmp2 = tmp0 + tmp1 | |
tmp3 = tmp2.to(tl.float32) | |
tmp5 = tmp4.to(tl.float32) | |
tmp7 = tmp5 - tmp6 | |
tmp9 = 9.964923469387754e-06 | |
tmp10 = tmp8 * tmp9 | |
tmp12 = tmp11 * tmp11 | |
tmp13 = tmp10 * tmp12 | |
tmp14 = tmp7 * tmp13 | |
tmp15 = tmp3 - tmp14 | |
tmp17 = tmp16 * tmp9 | |
tmp18 = tmp15 - tmp17 | |
tmp20 = tmp11 * tmp19 | |
tmp21 = tmp18 * tmp20 | |
tmp22 = tmp21.to(tl.float32) | |
tl.store(out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp22, None) | |
def get_args(): | |
arg_0 = rand_strided((128, 56, 28, 28), (43904, 784, 28, 1), device='cuda:0', dtype=torch.float16) | |
arg_1 = rand_strided((128, 56, 28, 28), (43904, 784, 28, 1), device='cuda:0', dtype=torch.float16) | |
arg_2 = rand_strided((128, 56, 28, 28), (43904, 784, 28, 1), device='cuda:0', dtype=torch.float16) | |
arg_3 = rand_strided((1, 56, 1, 1), (56, 1, 1, 1), device='cuda:0', dtype=torch.float32) | |
arg_4 = rand_strided((56,), (1,), device='cuda:0', dtype=torch.float32) | |
arg_5 = rand_strided((56,), (1,), device='cuda:0', dtype=torch.float32) | |
arg_6 = rand_strided((56,), (1,), device='cuda:0', dtype=torch.float32) | |
arg_7 = rand_strided((56,), (1,), device='cuda:0', dtype=torch.float32) | |
arg_8 = rand_strided((128, 56, 28, 28), (43904, 784, 28, 1), device='cuda:0', dtype=torch.float16) | |
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7, arg_8, | |
def call(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
stream0 = get_cuda_stream(0) | |
triton_poi_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_175.run(*args, 5619712, grid=grid(5619712), stream=stream0) | |
def benchmark_all_configs(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
return triton_poi_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_175.benchmark_all_configs(*args, 5619712, grid=grid(5619712)) | |
if __name__ == '__main__': | |
from torch._inductor.utils import get_num_bytes | |
from triton.testing import do_bench | |
args = get_args() | |
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9 | |
gb_per_s = num_gb / (ms / 1e3) | |
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
''') | |
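# Reference sketch (hand-written): kernel 175 above applies the standard
# batch-norm input-gradient formula. The inlined constant
# 9.964923469387754e-06 is 1/100352 = 1/(128*28*28), the reciprocal of the
# per-channel reduction count. The stat names below (sum_gxmu, rstd, sum_g,
# weight for in_ptr4..in_ptr7) are my reading of the arithmetic, not labels
# from the generated code.
def bn_bwd_grad_input_reference(g1, g2, x, mean, sum_gxmu, rstd, sum_g, weight):
    # g1, g2, x: (128, 56, 28, 28) fp16; mean: (1, 56, 1, 1) fp32; rest: (56,)
    N = 128 * 28 * 28
    per_ch = lambda t: t.view(1, -1, 1, 1)
    go = (g1 + g2).float()
    xmu = x.float() - mean
    gi = (go - xmu * per_ch(rstd) ** 2 * per_ch(sum_gxmu) / N
          - per_ch(sum_g) / N) * per_ch(rstd * weight)
    return gi.to(torch.float16)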
# kernel path: /tmp/torchinductor_shunting/he/cheuhrmgcxe3wl2kq52bt7crzek5t4ji6b56jrybsdffzrqao7el.py | |
# Original ATen: aten._native_batch_norm_legit_functional, aten.add, aten.native_batch_norm_backward | |
# aten._native_batch_norm_legit_functional => convert_element_type_76 | |
# aten.add => add_365, add_370 | |
# aten.native_batch_norm_backward => convert_element_type_727, mul_1039, sub_286, sum_130, sum_131 | |
triton_red_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_176 = async_compile.triton(''' | |
import triton | |
import triton.language as tl | |
from torch._inductor.ir import ReductionHint | |
from torch._inductor.ir import TileHint | |
from torch._inductor.triton_heuristics import reduction | |
from torch._inductor.utils import instance_descriptor | |
from torch._dynamo.testing import rand_strided | |
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
import torch | |
from torch._inductor.triton_heuristics import grid | |
@reduction( | |
size_hints=[1024, 8192], | |
reduction_hint=ReductionHint.INNER, | |
filename=__file__, | |
meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp16', 4: '*fp32', 5: '*fp32', 6: '*fp32', 7: 'i32', 8: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6), equal_to_1=())]} | |
) | |
@triton.jit | |
def triton_red_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_176(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, out_ptr0, out_ptr1, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr): | |
xnumel = 728 | |
rnumel = 7720 | |
xoffset = tl.program_id(0) * XBLOCK | |
xindex = xoffset + tl.arange(0, XBLOCK)[:, None] | |
xmask = xindex < xnumel | |
rbase = tl.arange(0, RBLOCK)[None, :] | |
x1 = (xindex // 56) | |
x0 = xindex % 56 | |
_tmp10 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0 | |
x3 = xindex | |
_tmp17 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0 | |
for roffset in range(0, rnumel, RBLOCK): | |
rindex = roffset + rbase | |
rmask = rindex < rnumel | |
r2 = rindex | |
tmp0 = r2 + (7720*x1) | |
tmp1 = 100352 | |
tmp2 = tmp0 < tmp1 | |
tmp3 = tl.load(in_ptr0 + ((784*x0) + (43904*(((r2 + (7720*x1)) // 784) % 128)) + ((r2 + (7720*x1)) % 784) + tl.zeros([XBLOCK, RBLOCK], tl.int32)), rmask & tmp2 & xmask, eviction_policy='evict_last', other=0).to(tl.float32) | |
tmp4 = tl.load(in_ptr1 + ((784*x0) + (43904*(((r2 + (7720*x1)) // 784) % 128)) + ((r2 + (7720*x1)) % 784) + tl.zeros([XBLOCK, RBLOCK], tl.int32)), rmask & tmp2 & xmask, eviction_policy='evict_last', other=0).to(tl.float32) | |
tmp5 = tmp3 + tmp4 | |
tmp6 = tl.load(in_ptr2 + ((784*x0) + (43904*(((r2 + (7720*x1)) // 784) % 128)) + ((r2 + (7720*x1)) % 784) + tl.zeros([XBLOCK, RBLOCK], tl.int32)), rmask & tmp2 & xmask, eviction_policy='evict_last', other=0).to(tl.float32) | |
tmp7 = tmp5 + tmp6 | |
tmp8 = tmp7.to(tl.float32) | |
tmp9 = tl.where(tmp2, tmp8, 0) | |
_tmp10 = tl.where(rmask & xmask, _tmp10 + tmp9, _tmp10) | |
tmp11 = tl.load(in_ptr3 + ((784*x0) + (43904*(((r2 + (7720*x1)) // 784) % 128)) + ((r2 + (7720*x1)) % 784) + tl.zeros([XBLOCK, RBLOCK], tl.int32)), rmask & tmp2 & xmask, eviction_policy='evict_last', other=0).to(tl.float32) | |
tmp12 = tmp11.to(tl.float32) | |
tmp13 = tl.load(in_ptr4 + (x0 + tl.zeros([XBLOCK, RBLOCK], tl.int32)), rmask & tmp2 & xmask, eviction_policy='evict_last', other=0) | |
tmp14 = tmp12 - tmp13 | |
tmp15 = tmp8 * tmp14 | |
tmp16 = tl.where(tmp2, tmp15, 0) | |
_tmp17 = tl.where(rmask & xmask, _tmp17 + tmp16, _tmp17) | |
tmp10 = tl.sum(_tmp10, 1)[:, None] | |
tl.store(out_ptr0 + x3, tmp10, xmask) | |
tmp17 = tl.sum(_tmp17, 1)[:, None] | |
tl.store(out_ptr1 + x3, tmp17, xmask) | |
def get_args(): | |
arg_0 = rand_strided((128, 56, 28, 28), (43904, 784, 28, 1), device='cuda:0', dtype=torch.float16) | |
arg_1 = rand_strided((128, 56, 28, 28), (43904, 784, 28, 1), device='cuda:0', dtype=torch.float16) | |
arg_2 = rand_strided((128, 56, 28, 28), (43904, 784, 28, 1), device='cuda:0', dtype=torch.float16) | |
arg_3 = rand_strided((128, 56, 28, 28), (43904, 784, 28, 1), device='cuda:0', dtype=torch.float16) | |
arg_4 = rand_strided((1, 56, 1, 1), (56, 1, 1, 1), device='cuda:0', dtype=torch.float32) | |
arg_5 = rand_strided((56, 13), (1, 56), device='cuda:0', dtype=torch.float32) | |
arg_6 = rand_strided((56, 13), (1, 56), device='cuda:0', dtype=torch.float32) | |
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, | |
def call(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
stream0 = get_cuda_stream(0) | |
triton_red_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_176.run(*args, 728, 7720, grid=grid(728), stream=stream0) | |
def benchmark_all_configs(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
return triton_red_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_176.benchmark_all_configs(*args, 728, 7720, grid=grid(728)) | |
if __name__ == '__main__': | |
from torch._inductor.utils import get_num_bytes | |
from triton.testing import do_bench | |
args = get_args() | |
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9 | |
gb_per_s = num_gb / (ms / 1e3) | |
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
''') | |
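# Reference sketch (hand-written): kernel 176 above repeats the split
# reduction of kernel 174 with one more fused residual add in the gradient
# path: g = g1 + g2 + g3 before the same two per-channel partial sums.
def bn_bwd_partial_sums_3add_reference(g1, g2, g3, x, mean):
    # g1..g3, x: (128, 56, 28, 28) fp16; mean: (1, 56, 1, 1) fp32
    g = (g1 + g2 + g3).float()
    xmu = x.float() - mean
    return g.sum(dim=(0, 2, 3)), (g * xmu).sum(dim=(0, 2, 3))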
# kernel path: /tmp/torchinductor_shunting/6w/c6wk3ggkhqc4dpm4svd2nw2x7bsbl6xzqqaoct6gtlyubuviyxj6.py | |
# Original ATen: aten._native_batch_norm_legit_functional, aten.add, aten.native_batch_norm_backward | |
# aten._native_batch_norm_legit_functional => convert_element_type_76 | |
# aten.add => add_365, add_370 | |
# aten.native_batch_norm_backward => convert_element_type_727, mul_1045, mul_1046, sub_286, sub_288, sub_289 | |
triton_poi_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_177 = async_compile.triton(''' | |
import triton | |
import triton.language as tl | |
from torch._inductor.ir import ReductionHint | |
from torch._inductor.ir import TileHint | |
from torch._inductor.triton_heuristics import pointwise | |
from torch._inductor.utils import instance_descriptor | |
from torch._dynamo.testing import rand_strided | |
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
import torch | |
from torch._inductor.triton_heuristics import grid | |
@pointwise(size_hints=[8388608], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp16', 4: '*fp32', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: '*fp32', 9: '*fp32', 10: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10), equal_to_1=())]}) | |
@triton.jit | |
def triton_poi_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_177(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, in_ptr6, in_ptr7, in_ptr8, out_ptr0, xnumel, XBLOCK : tl.constexpr): | |
xnumel = 5619712 | |
xoffset = tl.program_id(0) * XBLOCK | |
xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
xmask = xindex < xnumel | |
x3 = xindex | |
x1 = (xindex // 784) % 56 | |
tmp0 = tl.load(in_ptr0 + (x3), None).to(tl.float32) | |
tmp1 = tl.load(in_ptr1 + (x3), None).to(tl.float32) | |
tmp3 = tl.load(in_ptr2 + (x3), None).to(tl.float32) | |
tmp6 = tl.load(in_ptr3 + (x3), None).to(tl.float32) | |
tmp8 = tl.load(in_ptr4 + (x1), None) | |
tmp10 = tl.load(in_ptr5 + (x1), None) | |
tmp13 = tl.load(in_ptr6 + (x1), None) | |
tmp18 = tl.load(in_ptr7 + (x1), None) | |
tmp21 = tl.load(in_ptr8 + (x1), None) | |
tmp2 = tmp0 + tmp1 | |
tmp4 = tmp2 + tmp3 | |
tmp5 = tmp4.to(tl.float32) | |
tmp7 = tmp6.to(tl.float32) | |
tmp9 = tmp7 - tmp8 | |
tmp11 = 9.964923469387754e-06 | |
tmp12 = tmp10 * tmp11 | |
tmp14 = tmp13 * tmp13 | |
tmp15 = tmp12 * tmp14 | |
tmp16 = tmp9 * tmp15 | |
tmp17 = tmp5 - tmp16 | |
tmp19 = tmp18 * tmp11 | |
tmp20 = tmp17 - tmp19 | |
tmp22 = tmp13 * tmp21 | |
tmp23 = tmp20 * tmp22 | |
tl.store(out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp23, None) | |
def get_args(): | |
arg_0 = rand_strided((128, 56, 28, 28), (43904, 784, 28, 1), device='cuda:0', dtype=torch.float16) | |
arg_1 = rand_strided((128, 56, 28, 28), (43904, 784, 28, 1), device='cuda:0', dtype=torch.float16) | |
arg_2 = rand_strided((128, 56, 28, 28), (43904, 784, 28, 1), device='cuda:0', dtype=torch.float16) | |
arg_3 = rand_strided((128, 56, 28, 28), (43904, 784, 28, 1), device='cuda:0', dtype=torch.float16) | |
arg_4 = rand_strided((1, 56, 1, 1), (56, 1, 1, 1), device='cuda:0', dtype=torch.float32) | |
arg_5 = rand_strided((56,), (1,), device='cuda:0', dtype=torch.float32) | |
arg_6 = rand_strided((56,), (1,), device='cuda:0', dtype=torch.float32) | |
arg_7 = rand_strided((56,), (1,), device='cuda:0', dtype=torch.float32) | |
arg_8 = rand_strided((56,), (1,), device='cuda:0', dtype=torch.float32) | |
arg_9 = rand_strided((128, 56, 28, 28), (43904, 784, 28, 1), device='cuda:0', dtype=torch.float32) | |
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7, arg_8, arg_9, | |
def call(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
stream0 = get_cuda_stream(0) | |
triton_poi_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_177.run(*args, 5619712, grid=grid(5619712), stream=stream0) | |
def benchmark_all_configs(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
return triton_poi_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_177.benchmark_all_configs(*args, 5619712, grid=grid(5619712)) | |
if __name__ == '__main__': | |
from torch._inductor.utils import get_num_bytes | |
from triton.testing import do_bench | |
args = get_args() | |
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9 | |
gb_per_s = num_gb / (ms / 1e3) | |
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
''') | |
# kernel path: /tmp/torchinductor_shunting/za/czavvoeeqv63efdv32prg5yimwonjxhjdak5qwnkzrdkukuwv7ls.py | |
# Original ATen: aten.convolution_backward | |
# aten.convolution_backward => convolution_backward_124 | |
triton_poi_fused_convolution_backward_178 = async_compile.triton(''' | |
import triton | |
import triton.language as tl | |
from torch._inductor.ir import ReductionHint | |
from torch._inductor.ir import TileHint | |
from torch._inductor.triton_heuristics import pointwise | |
from torch._inductor.utils import instance_descriptor | |
from torch._dynamo.testing import rand_strided | |
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
import torch | |
from torch._inductor.triton_heuristics import grid | |
@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]}) | |
@triton.jit | |
def triton_poi_fused_convolution_backward_178(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr): | |
xnumel = 2809856 | |
xoffset = tl.program_id(0) * XBLOCK | |
xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
xmask = xindex < xnumel | |
x0 = xindex % 21952 | |
x1 = (xindex // 21952) | |
x2 = xindex | |
tmp0 = tl.load(in_ptr0 + (21952 + x0 + (43904*x1)), None) | |
tmp1 = tmp0.to(tl.float32) | |
tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp1, None) | |
def get_args(): | |
arg_0 = rand_strided((128, 56, 28, 28), (43904, 784, 28, 1), device='cuda:0', dtype=torch.float32) | |
arg_1 = rand_strided((128, 28, 28, 28), (21952, 784, 28, 1), device='cuda:0', dtype=torch.float16) | |
return arg_0, arg_1, | |
def call(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
stream0 = get_cuda_stream(0) | |
triton_poi_fused_convolution_backward_178.run(*args, 2809856, grid=grid(2809856), stream=stream0) | |
def benchmark_all_configs(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
return triton_poi_fused_convolution_backward_178.benchmark_all_configs(*args, 2809856, grid=grid(2809856)) | |
if __name__ == '__main__': | |
from torch._inductor.utils import get_num_bytes | |
from triton.testing import do_bench | |
args = get_args() | |
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9 | |
gb_per_s = num_gb / (ms / 1e3) | |
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
''') | |
# kernel path: /tmp/torchinductor_shunting/4j/c4jmqpkqr4fnrmxwsxtkllpzeruqpmftklalesh626aid62ltfs5.py | |
# Original ATen: aten.convolution_backward | |
# aten.convolution_backward => convolution_backward_125 | |
triton_poi_fused_convolution_backward_179 = async_compile.triton(''' | |
import triton | |
import triton.language as tl | |
from torch._inductor.ir import ReductionHint | |
from torch._inductor.ir import TileHint | |
from torch._inductor.triton_heuristics import pointwise | |
from torch._inductor.utils import instance_descriptor | |
from torch._dynamo.testing import rand_strided | |
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
import torch | |
from torch._inductor.triton_heuristics import grid | |
@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]}) | |
@triton.jit | |
def triton_poi_fused_convolution_backward_179(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr): | |
xnumel = 2809856 | |
xoffset = tl.program_id(0) * XBLOCK | |
xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
xmask = xindex < xnumel | |
x0 = xindex % 21952 | |
x1 = (xindex // 21952) | |
x2 = xindex | |
tmp0 = tl.load(in_ptr0 + (x0 + (43904*x1)), None) | |
tmp1 = tmp0.to(tl.float32) | |
tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp1, None) | |
def get_args(): | |
arg_0 = rand_strided((128, 56, 28, 28), (43904, 784, 28, 1), device='cuda:0', dtype=torch.float32) | |
arg_1 = rand_strided((128, 28, 28, 28), (21952, 784, 28, 1), device='cuda:0', dtype=torch.float16) | |
return arg_0, arg_1, | |
def call(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
stream0 = get_cuda_stream(0) | |
triton_poi_fused_convolution_backward_179.run(*args, 2809856, grid=grid(2809856), stream=stream0) | |
def benchmark_all_configs(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
return triton_poi_fused_convolution_backward_179.benchmark_all_configs(*args, 2809856, grid=grid(2809856)) | |
if __name__ == '__main__': | |
from torch._inductor.utils import get_num_bytes | |
from triton.testing import do_bench | |
args = get_args() | |
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9 | |
gb_per_s = num_gb / (ms / 1e3) | |
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
''') | |
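# Reference sketch (hand-written): kernels 178 and 179 above undo the earlier
# channel concat for the two grouped convolution_backward branches, splitting
# a (128, 56, 28, 28) fp32 gradient into fp16 halves.
def split_grad_channels_reference(grad):
    # grad: (128, 56, 28, 28) fp32
    hi = grad[:, 28:].to(torch.float16).contiguous()  # kernel 178 (offset 21952)
    lo = grad[:, :28].to(torch.float16).contiguous()  # kernel 179 (offset 0)
    return lo, hi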
# kernel path: /tmp/torchinductor_shunting/sd/csdu62ubjpfueiorjsdrofpul6ahwt73a2qyxlds2id4dzng2ykk.py | |
# Original ATen: aten._native_batch_norm_legit_functional, aten.add, aten.native_batch_norm_backward | |
# aten._native_batch_norm_legit_functional => convert_element_type_54 | |
# aten.add => add_365, add_370, add_375 | |
# aten.native_batch_norm_backward => convert_element_type_749, mul_1079, sub_302, sum_139, sum_140 | |
triton_red_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_180 = async_compile.triton(''' | |
import triton | |
import triton.language as tl | |
from torch._inductor.ir import ReductionHint | |
from torch._inductor.ir import TileHint | |
from torch._inductor.triton_heuristics import reduction | |
from torch._inductor.utils import instance_descriptor | |
from torch._dynamo.testing import rand_strided | |
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
import torch | |
from torch._inductor.triton_heuristics import grid | |
@reduction( | |
size_hints=[1024, 8192], | |
reduction_hint=ReductionHint.INNER, | |
filename=__file__, | |
meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp16', 4: '*fp16', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: 'i32', 9: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7), equal_to_1=())]} | |
) | |
@triton.jit | |
def triton_red_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_180(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, out_ptr0, out_ptr1, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr): | |
xnumel = 728 | |
rnumel = 7720 | |
xoffset = tl.program_id(0) * XBLOCK | |
xindex = xoffset + tl.arange(0, XBLOCK)[:, None] | |
xmask = xindex < xnumel | |
rbase = tl.arange(0, RBLOCK)[None, :] | |
x1 = (xindex // 56) | |
x0 = xindex % 56 | |
_tmp12 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0 | |
x3 = xindex | |
_tmp19 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0 | |
for roffset in range(0, rnumel, RBLOCK): | |
rindex = roffset + rbase | |
rmask = rindex < rnumel | |
r2 = rindex | |
tmp0 = r2 + (7720*x1) | |
tmp1 = 100352 | |
tmp2 = tmp0 < tmp1 | |
tmp3 = tl.load(in_ptr0 + ((784*x0) + (43904*(((r2 + (7720*x1)) // 784) % 128)) + ((r2 + (7720*x1)) % 784) + tl.zeros([XBLOCK, RBLOCK], tl.int32)), rmask & tmp2 & xmask, eviction_policy='evict_last', other=0).to(tl.float32) | |
tmp4 = tl.load(in_ptr1 + ((784*x0) + (43904*(((r2 + (7720*x1)) // 784) % 128)) + ((r2 + (7720*x1)) % 784) + tl.zeros([XBLOCK, RBLOCK], tl.int32)), rmask & tmp2 & xmask, eviction_policy='evict_last', other=0).to(tl.float32) | |
tmp5 = tmp3 + tmp4 | |
tmp6 = tl.load(in_ptr2 + ((784*x0) + (43904*(((r2 + (7720*x1)) // 784) % 128)) + ((r2 + (7720*x1)) % 784) + tl.zeros([XBLOCK, RBLOCK], tl.int32)), rmask & tmp2 & xmask, eviction_policy='evict_last', other=0).to(tl.float32) | |
tmp7 = tmp5 + tmp6 | |
tmp8 = tl.load(in_ptr3 + ((784*x0) + (43904*(((r2 + (7720*x1)) // 784) % 128)) + ((r2 + (7720*x1)) % 784) + tl.zeros([XBLOCK, RBLOCK], tl.int32)), rmask & tmp2 & xmask, eviction_policy='evict_last', other=0).to(tl.float32) | |
tmp9 = tmp7 + tmp8 | |
tmp10 = tmp9.to(tl.float32) | |
tmp11 = tl.where(tmp2, tmp10, 0) | |
_tmp12 = tl.where(rmask & xmask, _tmp12 + tmp11, _tmp12) | |
tmp13 = tl.load(in_ptr4 + ((784*x0) + (43904*(((r2 + (7720*x1)) // 784) % 128)) + ((r2 + (7720*x1)) % 784) + tl.zeros([XBLOCK, RBLOCK], tl.int32)), rmask & tmp2 & xmask, eviction_policy='evict_last', other=0).to(tl.float32) | |
tmp14 = tmp13.to(tl.float32) | |
tmp15 = tl.load(in_ptr5 + (x0 + tl.zeros([XBLOCK, RBLOCK], tl.int32)), rmask & tmp2 & xmask, eviction_policy='evict_last', other=0) | |
tmp16 = tmp14 - tmp15 | |
tmp17 = tmp10 * tmp16 | |
tmp18 = tl.where(tmp2, tmp17, 0) | |
_tmp19 = tl.where(rmask & xmask, _tmp19 + tmp18, _tmp19) | |
tmp12 = tl.sum(_tmp12, 1)[:, None] | |
tl.store(out_ptr0 + x3, tmp12, xmask) | |
tmp19 = tl.sum(_tmp19, 1)[:, None] | |
tl.store(out_ptr1 + x3, tmp19, xmask) | |
def get_args(): | |
arg_0 = rand_strided((128, 56, 28, 28), (43904, 784, 28, 1), device='cuda:0', dtype=torch.float16) | |
arg_1 = rand_strided((128, 56, 28, 28), (43904, 784, 28, 1), device='cuda:0', dtype=torch.float16) | |
arg_2 = rand_strided((128, 56, 28, 28), (43904, 784, 28, 1), device='cuda:0', dtype=torch.float16) | |
arg_3 = rand_strided((128, 56, 28, 28), (43904, 784, 28, 1), device='cuda:0', dtype=torch.float16) | |
arg_4 = rand_strided((128, 56, 28, 28), (43904, 784, 28, 1), device='cuda:0', dtype=torch.float16) | |
arg_5 = rand_strided((1, 56, 1, 1), (56, 1, 1, 1), device='cuda:0', dtype=torch.float32) | |
arg_6 = rand_strided((56, 13), (1, 56), device='cuda:0', dtype=torch.float32) | |
arg_7 = rand_strided((56, 13), (1, 56), device='cuda:0', dtype=torch.float32) | |
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7, | |
def call(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
stream0 = get_cuda_stream(0) | |
triton_red_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_180.run(*args, 728, 7720, grid=grid(728), stream=stream0) | |
def benchmark_all_configs(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
return triton_red_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_180.benchmark_all_configs(*args, 728, 7720, grid=grid(728)) | |
if __name__ == '__main__': | |
from torch._inductor.utils import get_num_bytes | |
from triton.testing import do_bench | |
args = get_args() | |
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9 | |
gb_per_s = num_gb / (ms / 1e3) | |
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
''') | |
# kernel path: /tmp/torchinductor_shunting/tj/ctjcu5336nwmvr63gcfk75rymyzm6mnak553tgmn3erorw7u2qrh.py | |
# Original ATen: aten._native_batch_norm_legit_functional, aten.add, aten.convolution_backward, aten.native_batch_norm_backward | |
# aten._native_batch_norm_legit_functional => convert_element_type_54 | |
# aten.add => add_365, add_370, add_375 | |
# aten.convolution_backward => convolution_backward_132 | |
# aten.native_batch_norm_backward => convert_element_type_749, convert_element_type_751, mul_1085, mul_1086, sub_302, sub_304, sub_305 | |
triton_poi_fused__native_batch_norm_legit_functional_add_convolution_backward_native_batch_norm_backward_181 = async_compile.triton(''' | |
import triton | |
import triton.language as tl | |
from torch._inductor.ir import ReductionHint | |
from torch._inductor.ir import TileHint | |
from torch._inductor.triton_heuristics import pointwise | |
from torch._inductor.utils import instance_descriptor | |
from torch._dynamo.testing import rand_strided | |
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
import torch | |
from torch._inductor.triton_heuristics import grid | |
@pointwise(size_hints=[8388608], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp16', 4: '*fp16', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: '*fp32', 9: '*fp32', 10: '*fp16', 11: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11), equal_to_1=())]}) | |
@triton.jit | |
def triton_poi_fused__native_batch_norm_legit_functional_add_convolution_backward_native_batch_norm_backward_181(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, in_ptr6, in_ptr7, in_ptr8, in_ptr9, out_ptr1, xnumel, XBLOCK : tl.constexpr): | |
xnumel = 5619712 | |
xoffset = tl.program_id(0) * XBLOCK | |
xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
xmask = xindex < xnumel | |
x3 = xindex | |
x1 = (xindex // 784) % 56 | |
tmp0 = tl.load(in_ptr0 + (x3), None).to(tl.float32) | |
tmp1 = tl.load(in_ptr1 + (x3), None).to(tl.float32) | |
tmp3 = tl.load(in_ptr2 + (x3), None).to(tl.float32) | |
tmp5 = tl.load(in_ptr3 + (x3), None).to(tl.float32) | |
tmp8 = tl.load(in_ptr4 + (x3), None).to(tl.float32) | |
tmp10 = tl.load(in_ptr5 + (x1), None) | |
tmp12 = tl.load(in_ptr6 + (x1), None) | |
tmp15 = tl.load(in_ptr7 + (x1), None) | |
tmp20 = tl.load(in_ptr8 + (x1), None) | |
tmp23 = tl.load(in_ptr9 + (x1), None) | |
tmp2 = tmp0 + tmp1 | |
tmp4 = tmp2 + tmp3 | |
tmp6 = tmp4 + tmp5 | |
tmp7 = tmp6.to(tl.float32) | |
tmp9 = tmp8.to(tl.float32) | |
tmp11 = tmp9 - tmp10 | |
tmp13 = 9.964923469387754e-06 | |
tmp14 = tmp12 * tmp13 | |
tmp16 = tmp15 * tmp15 | |
tmp17 = tmp14 * tmp16 | |
tmp18 = tmp11 * tmp17 | |
tmp19 = tmp7 - tmp18 | |
tmp21 = tmp20 * tmp13 | |
tmp22 = tmp19 - tmp21 | |
tmp24 = tmp15 * tmp23 | |
tmp25 = tmp22 * tmp24 | |
tmp26 = tmp25.to(tl.float32) | |
tl.store(out_ptr1 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp26, None) | |
def get_args(): | |
arg_0 = rand_strided((128, 56, 28, 28), (43904, 784, 28, 1), device='cuda:0', dtype=torch.float16) | |
arg_1 = rand_strided((128, 56, 28, 28), (43904, 784, 28, 1), device='cuda:0', dtype=torch.float16) | |
arg_2 = rand_strided((128, 56, 28, 28), (43904, 784, 28, 1), device='cuda:0', dtype=torch.float16) | |
arg_3 = rand_strided((128, 56, 28, 28), (43904, 784, 28, 1), device='cuda:0', dtype=torch.float16) | |
arg_4 = rand_strided((128, 56, 28, 28), (43904, 784, 28, 1), device='cuda:0', dtype=torch.float16) | |
arg_5 = rand_strided((1, 56, 1, 1), (56, 1, 1, 1), device='cuda:0', dtype=torch.float32) | |
arg_6 = rand_strided((56,), (1,), device='cuda:0', dtype=torch.float32) | |
arg_7 = rand_strided((56,), (1,), device='cuda:0', dtype=torch.float32) | |
arg_8 = rand_strided((56,), (1,), device='cuda:0', dtype=torch.float32) | |
arg_9 = rand_strided((56,), (1,), device='cuda:0', dtype=torch.float32) | |
arg_10 = rand_strided((128, 56, 28, 28), (43904, 784, 28, 1), device='cuda:0', dtype=torch.float16) | |
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7, arg_8, arg_9, arg_10, | |
def call(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
stream0 = get_cuda_stream(0) | |
triton_poi_fused__native_batch_norm_legit_functional_add_convolution_backward_native_batch_norm_backward_181.run(*args, 5619712, grid=grid(5619712), stream=stream0) | |
def benchmark_all_configs(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
return triton_poi_fused__native_batch_norm_legit_functional_add_convolution_backward_native_batch_norm_backward_181.benchmark_all_configs(*args, 5619712, grid=grid(5619712)) | |
if __name__ == '__main__': | |
from torch._inductor.utils import get_num_bytes | |
from triton.testing import do_bench | |
args = get_args() | |
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9 | |
gb_per_s = num_gb / (ms / 1e3) | |
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
''') | |
# kernel path: /tmp/torchinductor_shunting/ss/cssgcqqf6ombucovh6ytzd2ykd72qsuomzscm44okrt6cxuozbc5.py | |
# Original ATen: aten._to_copy | |
# aten._to_copy => convert_element_type_752 | |
triton_poi_fused__to_copy_182 = async_compile.triton(''' | |
import triton | |
import triton.language as tl | |
from torch._inductor.ir import ReductionHint | |
from torch._inductor.ir import TileHint | |
from torch._inductor.triton_heuristics import pointwise | |
from torch._inductor.utils import instance_descriptor | |
from torch._dynamo.testing import rand_strided | |
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
import torch | |
from torch._inductor.triton_heuristics import grid | |
@pointwise(size_hints=[16384], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]}) | |
@triton.jit | |
def triton_poi_fused__to_copy_182(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr): | |
xnumel = 13440 | |
xoffset = tl.program_id(0) * XBLOCK | |
xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
xmask = xindex < xnumel | |
x0 = xindex | |
tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32) | |
tmp1 = tmp0.to(tl.float32) | |
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask) | |
def get_args(): | |
arg_0 = rand_strided((56, 240, 1, 1), (240, 1, 1, 1), device='cuda:0', dtype=torch.float16) | |
arg_1 = rand_strided((56, 240, 1, 1), (240, 1, 1, 1), device='cuda:0', dtype=torch.float32) | |
return arg_0, arg_1, | |
def call(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
stream0 = get_cuda_stream(0) | |
triton_poi_fused__to_copy_182.run(*args, 13440, grid=grid(13440), stream=stream0) | |
def benchmark_all_configs(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
return triton_poi_fused__to_copy_182.benchmark_all_configs(*args, 13440, grid=grid(13440)) | |
if __name__ == '__main__': | |
from torch._inductor.utils import get_num_bytes | |
from triton.testing import do_bench | |
args = get_args() | |
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9 | |
gb_per_s = num_gb / (ms / 1e3) | |
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
''') | |
# kernel path: /tmp/torchinductor_shunting/of/cofgkmtacwz7qwl7fcbig7f7isuaew6fzcwmo2bioxxyxqipb2vy.py | |
# Original ATen: aten.mul, aten.sigmoid, aten.sigmoid_backward, aten.silu, aten.sum | |
# aten.mul => mul_1088 | |
# aten.sigmoid => sigmoid_3 | |
# aten.sigmoid_backward => convert_element_type_753, convert_element_type_754, convert_element_type_755, mul_1090, mul_1091, sub_306 | |
# aten.silu => convert_element_type_45, convert_element_type_46, mul_78, sigmoid_1 | |
# aten.sum => sum_141 | |
triton_per_fused_mul_sigmoid_sigmoid_backward_silu_sum_183 = async_compile.triton(''' | |
import triton | |
import triton.language as tl | |
from torch._inductor.ir import ReductionHint | |
from torch._inductor.ir import TileHint | |
from torch._inductor.triton_heuristics import persistent_reduction | |
from torch._inductor.utils import instance_descriptor | |
from torch._dynamo.testing import rand_strided | |
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
import torch | |
from torch._inductor.triton_heuristics import grid | |
@persistent_reduction( | |
size_hints=[32768, 1024], | |
reduction_hint=ReductionHint.INNER, | |
filename=__file__, | |
meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp16', 4: 'i32', 5: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5), equal_to_1=())]} | |
) | |
@triton.jit | |
def triton_per_fused_mul_sigmoid_sigmoid_backward_silu_sum_183(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, xnumel, rnumel, XBLOCK : tl.constexpr): | |
xnumel = 30720 | |
rnumel = 784 | |
RBLOCK: tl.constexpr = 1024 | |
xoffset = tl.program_id(0) * XBLOCK | |
xindex = xoffset + tl.arange(0, XBLOCK)[:, None] | |
xmask = xindex < xnumel | |
rindex = tl.arange(0, RBLOCK)[None, :] | |
rmask = rindex < rnumel | |
r1 = rindex | |
x0 = xindex | |
tmp0 = tl.load(in_ptr0 + (r1 + (784*x0)), rmask, other=0).to(tl.float32) | |
tmp1 = tl.load(in_ptr1 + (r1 + (784*x0)), rmask, other=0).to(tl.float32) | |
tmp11 = tl.load(in_ptr2 + (x0), None).to(tl.float32) | |
tmp2 = tmp1.to(tl.float32) | |
tmp3 = tl.sigmoid(tmp2) | |
tmp4 = tmp2 * tmp3 | |
tmp5 = tmp4.to(tl.float32) | |
tmp6 = tmp0 * tmp5 | |
tmp8 = tl.where(rmask, tmp6, 0) | |
tmp9 = tl.sum(tmp8, 1)[:, None] | |
tmp10 = tmp9.to(tl.float32) | |
tmp12 = tl.sigmoid(tmp11) | |
tmp13 = tmp12.to(tl.float32) | |
tmp14 = 1.0 | |
tmp15 = tmp14 - tmp13 | |
tmp16 = tmp13 * tmp15 | |
tmp17 = tmp10 * tmp16 | |
tmp18 = tmp17.to(tl.float32) | |
tl.store(in_out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp18, None) | |
def get_args(): | |
arg_0 = rand_strided((128, 240, 1, 1), (240, 1, 1, 1), device='cuda:0', dtype=torch.float16) | |
arg_1 = rand_strided((128, 240, 28, 28), (188160, 784, 28, 1), device='cuda:0', dtype=torch.float16) | |
arg_2 = rand_strided((128, 240, 28, 28), (188160, 784, 28, 1), device='cuda:0', dtype=torch.float16) | |
arg_3 = rand_strided((128, 240, 1, 1), (240, 1, 1, 1), device='cuda:0', dtype=torch.float16) | |
return arg_0, arg_1, arg_2, arg_3, | |
def call(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
stream0 = get_cuda_stream(0) | |
triton_per_fused_mul_sigmoid_sigmoid_backward_silu_sum_183.run(*args, 30720, 784, grid=grid(30720), stream=stream0) | |
def benchmark_all_configs(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
return triton_per_fused_mul_sigmoid_sigmoid_backward_silu_sum_183.benchmark_all_configs(*args, 30720, 784, grid=grid(30720)) | |
if __name__ == '__main__': | |
from torch._inductor.utils import get_num_bytes | |
from triton.testing import do_bench | |
args = get_args() | |
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9 | |
gb_per_s = num_gb / (ms / 1e3) | |
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
''') | |
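# Reference sketch (hand-written): kernel 183 above fuses the backward of a
# squeeze-and-excitation gate. Per (batch, channel) it reduces grad * silu(x)
# over the 28x28 spatial map, then multiplies by the sigmoid derivative of
# the gate logits, writing the result in place. The tensor roles here are my
# reading of the arithmetic.
def se_gate_grad_reference(grad, x, gate_logits):
    # grad, x: (128, 240, 28, 28) fp16; gate_logits: (128, 240, 1, 1) fp16
    s = (grad.float() * torch.nn.functional.silu(x.float())).sum(
        dim=(2, 3), keepdim=True)
    sig = torch.sigmoid(gate_logits.float())
    return (s * sig * (1.0 - sig)).to(torch.float16)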
# kernel path: /tmp/torchinductor_shunting/2a/c2ahf72g5w6o4b2o5wzvh7vjnff55vwxvyrwbchmf6do2jwrknlq.py | |
# Original ATen: aten._to_copy, aten.convolution_backward | |
# aten._to_copy => convert_element_type_757 | |
# aten.convolution_backward => sum_142 | |
triton_per_fused__to_copy_convolution_backward_184 = async_compile.triton(''' | |
import triton | |
import triton.language as tl | |
from torch._inductor.ir import ReductionHint | |
from torch._inductor.ir import TileHint | |
from torch._inductor.triton_heuristics import persistent_reduction | |
from torch._inductor.utils import instance_descriptor | |
from torch._dynamo.testing import rand_strided | |
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
import torch | |
from torch._inductor.triton_heuristics import grid | |
@persistent_reduction( | |
size_hints=[256, 128], | |
reduction_hint=ReductionHint.INNER, | |
filename=__file__, | |
meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32', 3: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3), equal_to_1=())]} | |
) | |
@triton.jit | |
def triton_per_fused__to_copy_convolution_backward_184(in_ptr0, out_ptr1, xnumel, rnumel, XBLOCK : tl.constexpr): | |
xnumel = 240 | |
rnumel = 128 | |
RBLOCK: tl.constexpr = 128 | |
xoffset = tl.program_id(0) * XBLOCK | |
xindex = xoffset + tl.arange(0, XBLOCK)[:, None] | |
xmask = xindex < xnumel | |
rindex = tl.arange(0, RBLOCK)[None, :] | |
rmask = rindex < rnumel | |
r1 = rindex | |
x0 = xindex | |
tmp0 = tl.load(in_ptr0 + (x0 + (240*r1)), rmask & xmask, other=0).to(tl.float32) | |
tmp2 = tl.where(rmask & xmask, tmp0, 0) | |
tmp3 = tl.sum(tmp2, 1)[:, None] | |
tmp4 = tmp3.to(tl.float32) | |
tl.store(out_ptr1 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp4, xmask) | |
def get_args(): | |
arg_0 = rand_strided((128, 240, 1, 1), (240, 1, 1, 1), device='cuda:0', dtype=torch.float16) | |
arg_1 = rand_strided((240,), (1,), device='cuda:0', dtype=torch.float32) | |
return arg_0, arg_1, | |
def call(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
stream0 = get_cuda_stream(0) | |
triton_per_fused__to_copy_convolution_backward_184.run(*args, 240, 128, grid=grid(240), stream=stream0) | |
def benchmark_all_configs(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
return triton_per_fused__to_copy_convolution_backward_184.benchmark_all_configs(*args, 240, 128, grid=grid(240)) | |
if __name__ == '__main__': | |
from torch._inductor.utils import get_num_bytes | |
from triton.testing import do_bench | |
args = get_args() | |
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9 | |
gb_per_s = num_gb / (ms / 1e3) | |
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
''') | |
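# Reference sketch (hand-written): kernel 184 above is the bias gradient of an
# SE 1x1 convolution -- a sum over the batch dimension cast to fp32. Kernel
# 187 below repeats it for the 20-channel squeeze layer.
def se_bias_grad_reference(grad):
    # grad: (128, C, 1, 1) fp16, with C = 240 here
    return grad.sum(dim=0).view(-1).to(torch.float32)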
# kernel path: /tmp/torchinductor_shunting/qt/cqtwdt25yvairvvorg5m6gz2cdhgchboky6qw4dfwm5pyezhyucw.py | |
# Original ATen: aten._to_copy | |
# aten._to_copy => convert_element_type_756 | |
triton_poi_fused__to_copy_185 = async_compile.triton(''' | |
import triton | |
import triton.language as tl | |
from torch._inductor.ir import ReductionHint | |
from torch._inductor.ir import TileHint | |
from torch._inductor.triton_heuristics import pointwise | |
from torch._inductor.utils import instance_descriptor | |
from torch._dynamo.testing import rand_strided | |
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
import torch | |
from torch._inductor.triton_heuristics import grid | |
@pointwise(size_hints=[8192], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]}) | |
@triton.jit | |
def triton_poi_fused__to_copy_185(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr): | |
xnumel = 4800 | |
xoffset = tl.program_id(0) * XBLOCK | |
xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
xmask = xindex < xnumel | |
x0 = xindex | |
tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32) | |
tmp1 = tmp0.to(tl.float32) | |
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask) | |
def get_args(): | |
arg_0 = rand_strided((240, 20, 1, 1), (20, 1, 1, 1), device='cuda:0', dtype=torch.float16) | |
arg_1 = rand_strided((240, 20, 1, 1), (20, 1, 1, 1), device='cuda:0', dtype=torch.float32) | |
return arg_0, arg_1, | |
def call(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
stream0 = get_cuda_stream(0) | |
triton_poi_fused__to_copy_185.run(*args, 4800, grid=grid(4800), stream=stream0) | |
def benchmark_all_configs(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
return triton_poi_fused__to_copy_185.benchmark_all_configs(*args, 4800, grid=grid(4800)) | |
if __name__ == '__main__': | |
from torch._inductor.utils import get_num_bytes | |
from triton.testing import do_bench | |
args = get_args() | |
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9 | |
gb_per_s = num_gb / (ms / 1e3) | |
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
''') | |
# kernel path: /tmp/torchinductor_shunting/wr/cwrjuglpb4gznmuaxv7k3zlp7pbp5w2juaztlowvwrq4a5mtkqsj.py
# Original ATen: aten.add, aten.clone, aten.fill, aten.mul, aten.sigmoid, aten.sub
# aten.add => add_376
# aten.clone => clone_2
# aten.fill => full_like_45
# aten.mul => mul_1092, mul_1093, mul_1094
# aten.sigmoid => sigmoid_109
# aten.sub => sub_307
triton_poi_fused_add_clone_fill_mul_sigmoid_sub_186 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4096], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_add_clone_fill_mul_sigmoid_sub_186(in_out_ptr0, in_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 2560
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_out_ptr0 + (x0), xmask).to(tl.float32)
    tmp1 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
    tmp2 = tl.sigmoid(tmp1)
    tmp3 = 1.0
    tmp4 = tmp3 - tmp2
    tmp5 = tmp1 * tmp4
    tmp6 = tmp5 + tmp3
    tmp7 = tmp2 * tmp6
    tmp8 = tmp0 * tmp7
    tl.store(in_out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp8, xmask)
def get_args():
    arg_0 = rand_strided((128, 20, 1, 1), (20, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 20, 1, 1), (20, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused_add_clone_fill_mul_sigmoid_sub_186.run(*args, 2560, grid=grid(2560), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused_add_clone_fill_mul_sigmoid_sub_186.benchmark_all_configs(*args, 2560, grid=grid(2560))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
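# Hedged reference (editorial sketch, not Inductor output): the fused kernel
# above computes the SiLU/Swish backward in place,
# dL/dx = g * s * (1 + x * (1 - s)) with s = sigmoid(x):
def _silu_backward_sketch(grad_out, x):
    s = torch.sigmoid(x)
    # matches tmp8 = tmp0 * (sigmoid(x) * (x * (1 - sigmoid(x)) + 1))
    return grad_out * (s * (x * (1.0 - s) + 1.0))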
# kernel path: /tmp/torchinductor_shunting/fl/cflhjweuqyd43bdk6ghk435lvc4aws6crjlokqtpnsbzf7ks6xms.py
# Original ATen: aten._to_copy, aten.convolution_backward
# aten._to_copy => convert_element_type_759
# aten.convolution_backward => sum_143
triton_per_fused__to_copy_convolution_backward_187 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import persistent_reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@persistent_reduction(
    size_hints=[32, 128],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32', 3: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 3), equal_to_1=())]}
)
@triton.jit
def triton_per_fused__to_copy_convolution_backward_187(in_ptr0, out_ptr1, xnumel, rnumel, XBLOCK : tl.constexpr):
    xnumel = 20
    rnumel = 128
    RBLOCK: tl.constexpr = 128
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rindex = tl.arange(0, RBLOCK)[None, :]
    rmask = rindex < rnumel
    r1 = rindex
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0 + (20*r1)), rmask & xmask, other=0).to(tl.float32)
    tmp2 = tl.where(rmask & xmask, tmp0, 0)
    tmp3 = tl.sum(tmp2, 1)[:, None]
    tmp4 = tmp3.to(tl.float32)
    tl.store(out_ptr1 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp4, xmask)
def get_args():
    arg_0 = rand_strided((128, 20, 1, 1), (20, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((20,), (1,), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_per_fused__to_copy_convolution_backward_187.run(*args, 20, 128, grid=grid(20), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_per_fused__to_copy_convolution_backward_187.benchmark_all_configs(*args, 20, 128, grid=grid(20))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
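# Hedged reference (editorial sketch, not Inductor output): this persistent
# reduction is the bias gradient of a 1x1 convolution; it sums the
# (128, 20, 1, 1) upstream gradient over the batch and casts to fp32:
def _conv_bias_grad_sketch(grad_out):
    # grad_out: (N, C, 1, 1) fp16; result: (C,) fp32
    return grad_out.sum(dim=(0, 2, 3)).to(torch.float32)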
# kernel path: /tmp/torchinductor_shunting/a2/ca25x7sc4v36iwhjlvkmxifjmfddvl7462a5hlf5ax6a3bk4qpxv.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_758
triton_poi_fused__to_copy_188 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[8192], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_188(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 4800
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
    arg_0 = rand_strided((20, 240, 1, 1), (240, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((20, 240, 1, 1), (240, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_188.run(*args, 4800, grid=grid(4800), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_188.benchmark_all_configs(*args, 4800, grid=grid(4800))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/6h/c6hub7cmnc2lps6dhzvt7jhvxrb62mqi6pacmt47atzmnjv2ubac.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.add, aten.clone, aten.div, aten.fill, aten.mul, aten.native_batch_norm_backward, aten.sigmoid, aten.sub
# aten._native_batch_norm_legit_functional => convert_element_type_43
# aten.add => add_377, add_378
# aten.clone => clone_1
# aten.div => div_16
# aten.fill => full_like_46
# aten.mul => mul_1089, mul_1095, mul_1096, mul_1097
# aten.native_batch_norm_backward => convert_element_type_760, mul_1098, mul_1106, sub_309, sum_144, sum_145
# aten.sigmoid => sigmoid_110, sigmoid_3
# aten.sub => sub_308
triton_red_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_189 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@reduction(
    size_hints=[256, 131072],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp16', 4: '*fp16', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: '*fp32', 9: '*fp32', 10: 'i32', 11: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_189(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, in_ptr6, out_ptr0, out_ptr1, out_ptr2, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
    xnumel = 240
    rnumel = 100352
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rbase = tl.arange(0, RBLOCK)[None, :]
    x0 = xindex
    _tmp17 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
    tmp20 = tl.load(in_ptr5 + (x0), xmask)
    _tmp23 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
    for roffset in range(0, rnumel, RBLOCK):
        rindex = roffset + rbase
        rmask = rindex < rnumel
        r1 = rindex % 784
        r2 = (rindex // 784)
        tmp0 = tl.load(in_ptr0 + (r1 + (784*x0) + (188160*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp1 = tl.load(in_ptr1 + (x0 + (240*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp4 = tl.load(in_ptr2 + (x0 + (240*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp8 = tl.load(in_ptr3 + (r1 + (784*x0) + (188160*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp18 = tl.load(in_ptr4 + (r1 + (784*x0) + (188160*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp2 = tl.sigmoid(tmp1)
        tmp3 = tmp0 * tmp2
        tmp5 = 784.0
        tmp6 = tmp4 / tmp5
        tmp7 = tmp3 + tmp6
        tmp9 = tl.sigmoid(tmp8)
        tmp10 = 1.0
        tmp11 = tmp10 - tmp9
        tmp12 = tmp8 * tmp11
        tmp13 = tmp12 + tmp10
        tmp14 = tmp9 * tmp13
        tmp15 = tmp7 * tmp14
        tmp16 = tmp15.to(tl.float32)
        _tmp17 = tl.where(rmask & xmask, _tmp17 + tmp16, _tmp17)
        tmp19 = tmp18.to(tl.float32)
        tmp21 = tmp19 - tmp20
        tmp22 = tmp16 * tmp21
        _tmp23 = tl.where(rmask & xmask, _tmp23 + tmp22, _tmp23)
    tmp17 = tl.sum(_tmp17, 1)[:, None]
    tl.store(out_ptr0 + x0, tmp17, xmask)
    tmp23 = tl.sum(_tmp23, 1)[:, None]
    tl.store(out_ptr1 + x0, tmp23, xmask)
    tmp24 = tl.load(in_ptr6 + (x0), xmask)
    tmp25 = tmp23 * tmp24
    tl.store(out_ptr2 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp25, xmask)
def get_args():
    arg_0 = rand_strided((128, 240, 28, 28), (188160, 784, 28, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 240, 1, 1), (240, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    arg_2 = rand_strided((128, 240, 1, 1), (240, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    arg_3 = rand_strided((128, 240, 28, 28), (188160, 784, 28, 1), device='cuda:0', dtype=torch.float16)
    arg_4 = rand_strided((128, 240, 28, 28), (188160, 784, 28, 1), device='cuda:0', dtype=torch.float16)
    arg_5 = rand_strided((1, 240, 1, 1), (240, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    arg_6 = rand_strided((240,), (1,), device='cuda:0', dtype=torch.float32)
    arg_7 = rand_strided((240,), (1,), device='cuda:0', dtype=torch.float32)
    arg_8 = rand_strided((240,), (1,), device='cuda:0', dtype=torch.float32)
    arg_9 = rand_strided((240,), (1,), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7, arg_8, arg_9,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_red_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_189.run(*args, 240, 100352, grid=grid(240), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_red_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_189.benchmark_all_configs(*args, 240, 100352, grid=grid(240))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
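# Hedged reference (editorial sketch, not Inductor output): the reduction
# above fuses the SE-gate/swish backward into the two per-channel sums that
# batch-norm backward needs, sum(g) and sum(g * (x - mean)). Isolating just
# the reduction, for g, x of shape (N, C, H, W) and mean of shape (C,):
def _bn_backward_sums_sketch(g, x, mean):
    sum_g = g.sum(dim=(0, 2, 3))
    sum_gx = (g * (x - mean.view(1, -1, 1, 1))).sum(dim=(0, 2, 3))
    return sum_g, sum_gx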
# kernel path: /tmp/torchinductor_shunting/b4/cb4sfko255cmn5essh46qplwjuzpmcv7vbfk26rbkic7p6z2ckjv.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.add, aten.clone, aten.div, aten.fill, aten.mul, aten.native_batch_norm_backward, aten.sigmoid, aten.sub
# aten._native_batch_norm_legit_functional => convert_element_type_43
# aten.add => add_377, add_378
# aten.clone => clone_1
# aten.div => div_16
# aten.fill => full_like_46
# aten.mul => mul_1089, mul_1095, mul_1096, mul_1097
# aten.native_batch_norm_backward => convert_element_type_760, mul_1104, sub_309, sub_311, sub_312
# aten.sigmoid => sigmoid_110, sigmoid_3
# aten.sub => sub_308
triton_poi_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_190 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[33554432], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp16', 4: '*fp16', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: '*fp32', 9: '*fp32', 10: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10), equal_to_1=())]})
@triton.jit
def triton_poi_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_190(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, in_ptr6, in_ptr7, in_ptr8, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 24084480
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x3 = xindex
    x4 = (xindex // 784)
    x1 = (xindex // 784) % 240
    tmp0 = tl.load(in_ptr0 + (x3), None).to(tl.float32)
    tmp1 = tl.load(in_ptr1 + (x4), None).to(tl.float32)
    tmp4 = tl.load(in_ptr2 + (x4), None).to(tl.float32)
    tmp8 = tl.load(in_ptr3 + (x3), None).to(tl.float32)
    tmp17 = tl.load(in_ptr4 + (x3), None).to(tl.float32)
    tmp19 = tl.load(in_ptr5 + (x1), None)
    tmp21 = tl.load(in_ptr6 + (x1), None)
    tmp24 = tl.load(in_ptr7 + (x1), None)
    tmp29 = tl.load(in_ptr8 + (x1), None)
    tmp2 = tl.sigmoid(tmp1)
    tmp3 = tmp0 * tmp2
    tmp5 = 784.0
    tmp6 = tmp4 / tmp5
    tmp7 = tmp3 + tmp6
    tmp9 = tl.sigmoid(tmp8)
    tmp10 = 1.0
    tmp11 = tmp10 - tmp9
    tmp12 = tmp8 * tmp11
    tmp13 = tmp12 + tmp10
    tmp14 = tmp9 * tmp13
    tmp15 = tmp7 * tmp14
    tmp16 = tmp15.to(tl.float32)
    tmp18 = tmp17.to(tl.float32)
    tmp20 = tmp18 - tmp19
    tmp22 = 9.964923469387754e-06
    tmp23 = tmp21 * tmp22
    tmp25 = tmp24 * tmp24
    tmp26 = tmp23 * tmp25
    tmp27 = tmp20 * tmp26
    tmp28 = tmp16 - tmp27
    tmp30 = tmp29 * tmp22
    tmp31 = tmp28 - tmp30
    tl.store(out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp31, None)
def get_args():
    arg_0 = rand_strided((128, 240, 28, 28), (188160, 784, 28, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 240, 1, 1), (240, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    arg_2 = rand_strided((128, 240, 1, 1), (240, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    arg_3 = rand_strided((128, 240, 28, 28), (188160, 784, 28, 1), device='cuda:0', dtype=torch.float16)
    arg_4 = rand_strided((128, 240, 28, 28), (188160, 784, 28, 1), device='cuda:0', dtype=torch.float16)
    arg_5 = rand_strided((1, 240, 1, 1), (240, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    arg_6 = rand_strided((240,), (1,), device='cuda:0', dtype=torch.float32)
    arg_7 = rand_strided((240,), (1,), device='cuda:0', dtype=torch.float32)
    arg_8 = rand_strided((240,), (1,), device='cuda:0', dtype=torch.float32)
    arg_9 = rand_strided((128, 240, 28, 28), (188160, 784, 28, 1), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7, arg_8, arg_9,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_190.run(*args, 24084480, grid=grid(24084480), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_190.benchmark_all_configs(*args, 24084480, grid=grid(24084480))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
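# Hedged reference (editorial sketch, not Inductor output): this pointwise
# stage applies the batch-norm backward formula with the sums from the
# previous kernel; the constant 9.964923469387754e-06 is 1 / (128 * 28 * 28),
# i.e. 1 / 100352 = 1/N for the per-channel mean. The gamma * invstd scale is
# deferred to the channel-slicing kernels that follow.
def _bn_backward_unscaled_sketch(g, x, mean, invstd, sum_g, sum_gx):
    n = g.numel() // g.size(1)
    c = lambda t: t.view(1, -1, 1, 1)
    return g - (x - c(mean)) * c(sum_gx * invstd * invstd) / n - c(sum_g) / n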
# kernel path: /tmp/torchinductor_shunting/64/c64lskcy7vo32z6iwoy7sbqjaz3shz2zgjiddxnernpnf42f5kgu.py
# Original ATen: aten.convolution_backward
# aten.convolution_backward => convolution_backward_135
triton_poi_fused_convolution_backward_191 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[8388608], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp32', 2: '*fp32', 3: '*fp16', 4: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4), equal_to_1=())]})
@triton.jit
def triton_poi_fused_convolution_backward_191(in_ptr0, in_ptr1, in_ptr2, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 6021120
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x2 = (xindex // 47040)
    x3 = xindex % 47040
    x1 = (xindex // 784) % 60
    x4 = xindex
    tmp0 = tl.load(in_ptr0 + (141120 + x3 + (188160*x2)), None)
    tmp1 = tl.load(in_ptr1 + (180 + x1), None)
    tmp2 = tl.load(in_ptr2 + (180 + x1), None)
    tmp3 = tmp1 * tmp2
    tmp4 = tmp0 * tmp3
    tmp5 = tmp4.to(tl.float32)
    tl.store(out_ptr0 + (x4 + tl.zeros([XBLOCK], tl.int32)), tmp5, None)
def get_args():
    arg_0 = rand_strided((128, 240, 28, 28), (188160, 784, 28, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((240,), (1,), device='cuda:0', dtype=torch.float32)
    arg_2 = rand_strided((240,), (1,), device='cuda:0', dtype=torch.float32)
    arg_3 = rand_strided((128, 60, 28, 28), (47040, 784, 28, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1, arg_2, arg_3,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused_convolution_backward_191.run(*args, 6021120, grid=grid(6021120), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused_convolution_backward_191.benchmark_all_configs(*args, 6021120, grid=grid(6021120))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
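# Hedged reference (editorial sketch, not Inductor output): kernels
# 191/193/195/197 each take one 60-channel slice of the 240-channel gradient
# (element offsets 141120 = 180*784, 94080 = 120*784, 47040 = 60*784, and 0),
# apply the deferred per-channel weight * invstd scale, and cast to fp16.
# This looks like the backward of a MixConv-style channel split feeding
# depthwise convolutions with 9x9/7x7/5x5/3x3 kernels:
def _channel_slice_scale_sketch(g, weight, invstd, c0, c1):
    scale = (weight * invstd)[c0:c1].view(1, -1, 1, 1)
    return (g[:, c0:c1] * scale).half()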
# kernel path: /tmp/torchinductor_shunting/q5/cq54zrcq5terxwt6k525p5rp3nkt2xgc6fiu5byc3io2seruy4pn.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_763
triton_poi_fused__to_copy_192 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[8192], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_192(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 4860
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
    arg_0 = rand_strided((60, 1, 9, 9), (81, 81, 9, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((60, 1, 9, 9), (81, 81, 9, 1), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_192.run(*args, 4860, grid=grid(4860), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_192.benchmark_all_configs(*args, 4860, grid=grid(4860))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/br/cbrxmq6ibqs3puj5yvt2cuwvf6skj4stjxuuq7wvmk57lhi6a5vj.py
# Original ATen: aten.convolution_backward
# aten.convolution_backward => convolution_backward_136
triton_poi_fused_convolution_backward_193 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[8388608], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp32', 2: '*fp32', 3: '*fp16', 4: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4), equal_to_1=())]})
@triton.jit
def triton_poi_fused_convolution_backward_193(in_ptr0, in_ptr1, in_ptr2, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 6021120
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x2 = (xindex // 47040)
    x3 = xindex % 47040
    x1 = (xindex // 784) % 60
    x4 = xindex
    tmp0 = tl.load(in_ptr0 + (94080 + x3 + (188160*x2)), None)
    tmp1 = tl.load(in_ptr1 + (120 + x1), None)
    tmp2 = tl.load(in_ptr2 + (120 + x1), None)
    tmp3 = tmp1 * tmp2
    tmp4 = tmp0 * tmp3
    tmp5 = tmp4.to(tl.float32)
    tl.store(out_ptr0 + (x4 + tl.zeros([XBLOCK], tl.int32)), tmp5, None)
def get_args():
    arg_0 = rand_strided((128, 240, 28, 28), (188160, 784, 28, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((240,), (1,), device='cuda:0', dtype=torch.float32)
    arg_2 = rand_strided((240,), (1,), device='cuda:0', dtype=torch.float32)
    arg_3 = rand_strided((128, 60, 28, 28), (47040, 784, 28, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1, arg_2, arg_3,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused_convolution_backward_193.run(*args, 6021120, grid=grid(6021120), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused_convolution_backward_193.benchmark_all_configs(*args, 6021120, grid=grid(6021120))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/3b/c3bhkoe2gl2ylb7b67k4e5uph44ovk7dmdflgyqj7mj6bgzsfash.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_764
triton_poi_fused__to_copy_194 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4096], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_194(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 2940
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
    arg_0 = rand_strided((60, 1, 7, 7), (49, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((60, 1, 7, 7), (49, 49, 7, 1), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_194.run(*args, 2940, grid=grid(2940), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_194.benchmark_all_configs(*args, 2940, grid=grid(2940))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/na/cnasnkl2z4au7fpcqaxw5gcnnevjoribcdlpepbjqn6x74n4itra.py
# Original ATen: aten.convolution_backward
# aten.convolution_backward => convolution_backward_137
triton_poi_fused_convolution_backward_195 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[8388608], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp32', 2: '*fp32', 3: '*fp16', 4: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4), equal_to_1=())]})
@triton.jit
def triton_poi_fused_convolution_backward_195(in_ptr0, in_ptr1, in_ptr2, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 6021120
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x2 = (xindex // 47040)
    x3 = xindex % 47040
    x1 = (xindex // 784) % 60
    x4 = xindex
    tmp0 = tl.load(in_ptr0 + (47040 + x3 + (188160*x2)), None)
    tmp1 = tl.load(in_ptr1 + (60 + x1), None)
    tmp2 = tl.load(in_ptr2 + (60 + x1), None)
    tmp3 = tmp1 * tmp2
    tmp4 = tmp0 * tmp3
    tmp5 = tmp4.to(tl.float32)
    tl.store(out_ptr0 + (x4 + tl.zeros([XBLOCK], tl.int32)), tmp5, None)
def get_args():
    arg_0 = rand_strided((128, 240, 28, 28), (188160, 784, 28, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((240,), (1,), device='cuda:0', dtype=torch.float32)
    arg_2 = rand_strided((240,), (1,), device='cuda:0', dtype=torch.float32)
    arg_3 = rand_strided((128, 60, 28, 28), (47040, 784, 28, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1, arg_2, arg_3,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused_convolution_backward_195.run(*args, 6021120, grid=grid(6021120), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused_convolution_backward_195.benchmark_all_configs(*args, 6021120, grid=grid(6021120))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/zs/czsqzrqr2u3q2yqg57a3bpoxcvbsw7bfuy7uqz7stgjemnm27ezs.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_765
triton_poi_fused__to_copy_196 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[2048], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_196(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 1500
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
    arg_0 = rand_strided((60, 1, 5, 5), (25, 25, 5, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((60, 1, 5, 5), (25, 25, 5, 1), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_196.run(*args, 1500, grid=grid(1500), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_196.benchmark_all_configs(*args, 1500, grid=grid(1500))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/nq/cnqeg2qpm6hnr3euqz4jxglt3pvrermthkbedv7k5wva44ox64xi.py
# Original ATen: aten.convolution_backward
# aten.convolution_backward => convolution_backward_138
triton_poi_fused_convolution_backward_197 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[8388608], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp32', 2: '*fp32', 3: '*fp16', 4: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4), equal_to_1=())]})
@triton.jit
def triton_poi_fused_convolution_backward_197(in_ptr0, in_ptr1, in_ptr2, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 6021120
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x2 = (xindex // 47040)
    x3 = xindex % 47040
    x1 = (xindex // 784) % 60
    x4 = xindex
    tmp0 = tl.load(in_ptr0 + (x3 + (188160*x2)), None)
    tmp1 = tl.load(in_ptr1 + (x1), None)
    tmp2 = tl.load(in_ptr2 + (x1), None)
    tmp3 = tmp1 * tmp2
    tmp4 = tmp0 * tmp3
    tmp5 = tmp4.to(tl.float32)
    tl.store(out_ptr0 + (x4 + tl.zeros([XBLOCK], tl.int32)), tmp5, None)
def get_args():
    arg_0 = rand_strided((128, 240, 28, 28), (188160, 784, 28, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((240,), (1,), device='cuda:0', dtype=torch.float32)
    arg_2 = rand_strided((240,), (1,), device='cuda:0', dtype=torch.float32)
    arg_3 = rand_strided((128, 60, 28, 28), (47040, 784, 28, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1, arg_2, arg_3,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused_convolution_backward_197.run(*args, 6021120, grid=grid(6021120), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused_convolution_backward_197.benchmark_all_configs(*args, 6021120, grid=grid(6021120))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/kv/ckv7rs4yhome24evqokglvw3vjls25y3ssixj4dq3f7ptqybjph2.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_766
triton_poi_fused__to_copy_198 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[1024], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_198(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 540
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
    arg_0 = rand_strided((60, 1, 3, 3), (9, 9, 3, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((60, 1, 3, 3), (9, 9, 3, 1), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_198.run(*args, 540, grid=grid(540), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_198.benchmark_all_configs(*args, 540, grid=grid(540))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/zk/czkotusdghmpwd5fnwj7jdl2hmku2xiuhyw3zvizyi423stddlwd.py
# Original ATen: aten.cat
# aten.cat => cat_76
triton_poi_fused_cat_199 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[33554432], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_cat_199(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 24084480
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x2 = xindex
    x0 = xindex % 188160
    x1 = (xindex // 188160)
    tmp0 = tl.load(in_ptr0 + (x2), None).to(tl.float32)
    tl.store(out_ptr0 + (x0 + (752640*x1) + tl.zeros([XBLOCK], tl.int32)), tmp0, None)
def get_args():
    arg_0 = rand_strided((128, 60, 56, 56), (188160, 3136, 56, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 60, 56, 56), (752640, 3136, 56, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused_cat_199.run(*args, 24084480, grid=grid(24084480), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused_cat_199.benchmark_all_configs(*args, 24084480, grid=grid(24084480))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
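# Hedged reference (editorial sketch, not Inductor output): this kernel is
# one copy of a channel concatenation. It writes a contiguous
# (128, 60, 56, 56) block into a 60-channel slice of a (128, 240, 56, 56)
# buffer; the output batch stride 752640 = 240 * 56 * 56 gives it away:
def _cat_slice_sketch(src, out, c0=0):
    out[:, c0:c0 + src.size(1)] = src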
# kernel path: /tmp/torchinductor_shunting/yq/cyqk376sl4hj7jr6ig7md3emmwajlxo5ngwitzl6ke74qer4vlzp.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.mul, aten.native_batch_norm_backward
# aten._native_batch_norm_legit_functional => convert_element_type_35
# aten.mul => mul_1109
# aten.native_batch_norm_backward => convert_element_type_767, mul_1110, mul_1118, sub_314, sum_146, sum_147
triton_red_fused__native_batch_norm_legit_functional_mul_native_batch_norm_backward_200 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@reduction(
    size_hints=[256, 524288],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp32', 4: '*fp32', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: 'i32', 9: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_mul_native_batch_norm_backward_200(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, out_ptr0, out_ptr1, out_ptr2, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
    xnumel = 240
    rnumel = 401408
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rbase = tl.arange(0, RBLOCK)[None, :]
    x0 = xindex
    _tmp4 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
    tmp7 = tl.load(in_ptr3 + (x0), xmask)
    _tmp10 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
    for roffset in range(0, rnumel, RBLOCK):
        rindex = roffset + rbase
        rmask = rindex < rnumel
        r1 = rindex % 3136
        r2 = (rindex // 3136)
        tmp0 = tl.load(in_ptr0 + (r1 + (3136*x0) + (752640*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp1 = tl.load(in_ptr1 + (r1 + (3136*x0) + (752640*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp5 = tl.load(in_ptr2 + (r1 + (3136*x0) + (752640*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp2 = tmp0 * tmp1
        tmp3 = tmp2.to(tl.float32)
        _tmp4 = tl.where(rmask & xmask, _tmp4 + tmp3, _tmp4)
        tmp6 = tmp5.to(tl.float32)
        tmp8 = tmp6 - tmp7
        tmp9 = tmp3 * tmp8
        _tmp10 = tl.where(rmask & xmask, _tmp10 + tmp9, _tmp10)
    tmp4 = tl.sum(_tmp4, 1)[:, None]
    tl.store(out_ptr0 + x0, tmp4, xmask)
    tmp10 = tl.sum(_tmp10, 1)[:, None]
    tl.store(out_ptr1 + x0, tmp10, xmask)
    tmp11 = tl.load(in_ptr4 + (x0), xmask)
    tmp12 = tmp10 * tmp11
    tl.store(out_ptr2 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp12, xmask)
def get_args():
    arg_0 = rand_strided((128, 240, 56, 56), (752640, 3136, 56, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 240, 56, 56), (752640, 3136, 56, 1), device='cuda:0', dtype=torch.float16)
    arg_2 = rand_strided((128, 240, 56, 56), (752640, 3136, 56, 1), device='cuda:0', dtype=torch.float16)
    arg_3 = rand_strided((1, 240, 1, 1), (240, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    arg_4 = rand_strided((240,), (1,), device='cuda:0', dtype=torch.float32)
    arg_5 = rand_strided((240,), (1,), device='cuda:0', dtype=torch.float32)
    arg_6 = rand_strided((240,), (1,), device='cuda:0', dtype=torch.float32)
    arg_7 = rand_strided((240,), (1,), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_red_fused__native_batch_norm_legit_functional_mul_native_batch_norm_backward_200.run(*args, 240, 401408, grid=grid(240), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_red_fused__native_batch_norm_legit_functional_mul_native_batch_norm_backward_200.benchmark_all_configs(*args, 240, 401408, grid=grid(240))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
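# Note (editorial, hedged): this is the same two-sum batch-norm backward
# reduction as kernel 189, except the elementwise gradient is formed in-loop
# as in_ptr0 * in_ptr1 (upstream grad times a saved multiplier, mul_1109)
# rather than the full SE/swish chain; see _bn_backward_sums_sketch above.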
# kernel path: /tmp/torchinductor_shunting/yh/cyhr4pui7cexlauvjpzunq3nad46e54iw5xeuekil5rzao2n4cb5.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.convolution_backward, aten.mul, aten.native_batch_norm_backward
# aten._native_batch_norm_legit_functional => convert_element_type_35
# aten.convolution_backward => convolution_backward_139
# aten.mul => mul_1109
# aten.native_batch_norm_backward => convert_element_type_767, convert_element_type_769, mul_1116, mul_1117, sub_314, sub_316, sub_317
triton_poi_fused__native_batch_norm_legit_functional_convolution_backward_mul_native_batch_norm_backward_201 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[134217728], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp32', 4: '*fp32', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8), equal_to_1=())]})
@triton.jit
def triton_poi_fused__native_batch_norm_legit_functional_convolution_backward_mul_native_batch_norm_backward_201(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, in_ptr6, xnumel, XBLOCK : tl.constexpr):
    xnumel = 96337920
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x3 = xindex
    x1 = (xindex // 3136) % 240
    tmp0 = tl.load(in_out_ptr0 + (x3), None).to(tl.float32)
    tmp1 = tl.load(in_ptr0 + (x3), None).to(tl.float32)
    tmp4 = tl.load(in_ptr1 + (x3), None).to(tl.float32)
    tmp6 = tl.load(in_ptr2 + (x1), None)
    tmp8 = tl.load(in_ptr3 + (x1), None)
    tmp11 = tl.load(in_ptr4 + (x1), None)
    tmp16 = tl.load(in_ptr5 + (x1), None)
    tmp19 = tl.load(in_ptr6 + (x1), None)
    tmp2 = tmp0 * tmp1
    tmp3 = tmp2.to(tl.float32)
    tmp5 = tmp4.to(tl.float32)
    tmp7 = tmp5 - tmp6
    tmp9 = 2.4912308673469386e-06
    tmp10 = tmp8 * tmp9
    tmp12 = tmp11 * tmp11
    tmp13 = tmp10 * tmp12
    tmp14 = tmp7 * tmp13
    tmp15 = tmp3 - tmp14
    tmp17 = tmp16 * tmp9
    tmp18 = tmp15 - tmp17
    tmp20 = tmp11 * tmp19
    tmp21 = tmp18 * tmp20
    tmp22 = tmp21.to(tl.float32)
    tl.store(in_out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp22, None)
def get_args():
    arg_0 = rand_strided((128, 240, 56, 56), (752640, 3136, 56, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 240, 56, 56), (752640, 3136, 56, 1), device='cuda:0', dtype=torch.float16)
    arg_2 = rand_strided((128, 240, 56, 56), (752640, 3136, 56, 1), device='cuda:0', dtype=torch.float16)
    arg_3 = rand_strided((1, 240, 1, 1), (240, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    arg_4 = rand_strided((240,), (1,), device='cuda:0', dtype=torch.float32)
    arg_5 = rand_strided((240,), (1,), device='cuda:0', dtype=torch.float32)
    arg_6 = rand_strided((240,), (1,), device='cuda:0', dtype=torch.float32)
    arg_7 = rand_strided((240,), (1,), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__native_batch_norm_legit_functional_convolution_backward_mul_native_batch_norm_backward_201.run(*args, 96337920, grid=grid(96337920), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__native_batch_norm_legit_functional_convolution_backward_mul_native_batch_norm_backward_201.benchmark_all_configs(*args, 96337920, grid=grid(96337920))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
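# Note (editorial, hedged): the pointwise batch-norm backward again, but
# unlike kernel 190 it applies the final gamma * invstd scale in-kernel and
# writes the result in place through in_out_ptr0. The constant
# 2.4912308673469386e-06 is 1 / (128 * 56 * 56) = 1 / 401408.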
# kernel path: /tmp/torchinductor_shunting/wr/cwro3jdt4c7rexhki7lqmrop6cuwcmj4fk55z26tcv23lykknwc7.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_770
triton_poi_fused__to_copy_202 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[16384], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_202(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 9600
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
    arg_0 = rand_strided((240, 40, 1, 1), (40, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((240, 40, 1, 1), (40, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_202.run(*args, 9600, grid=grid(9600), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_202.benchmark_all_configs(*args, 9600, grid=grid(9600))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/us/cusjbed3asrxzp3okymq5zoehzdhdnueva2q2mrwfkbmmsa7vt2l.py | |
# Original ATen: aten._native_batch_norm_legit_functional, aten.native_batch_norm_backward | |
# aten._native_batch_norm_legit_functional => convert_element_type_32 | |
# aten.native_batch_norm_backward => convert_element_type_771, mul_1119, sub_318, sum_148, sum_149 | |
triton_red_fused__native_batch_norm_legit_functional_native_batch_norm_backward_203 = async_compile.triton(''' | |
import triton | |
import triton.language as tl | |
from torch._inductor.ir import ReductionHint | |
from torch._inductor.ir import TileHint | |
from torch._inductor.triton_heuristics import reduction | |
from torch._inductor.utils import instance_descriptor | |
from torch._dynamo.testing import rand_strided | |
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
import torch | |
from torch._inductor.triton_heuristics import grid | |
@reduction( | |
size_hints=[1024, 32768], | |
reduction_hint=ReductionHint.INNER, | |
filename=__file__, | |
meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: 'i32', 6: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4), equal_to_1=())]} | |
) | |
@triton.jit | |
def triton_red_fused__native_batch_norm_legit_functional_native_batch_norm_backward_203(in_ptr0, in_ptr1, in_ptr2, out_ptr0, out_ptr1, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr): | |
xnumel = 520 | |
rnumel = 30878 | |
xoffset = tl.program_id(0) * XBLOCK | |
xindex = xoffset + tl.arange(0, XBLOCK)[:, None] | |
xmask = xindex < xnumel | |
rbase = tl.arange(0, RBLOCK)[None, :] | |
x1 = (xindex // 40) | |
x0 = xindex % 40 | |
_tmp6 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0 | |
x3 = xindex | |
_tmp13 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0 | |
for roffset in range(0, rnumel, RBLOCK): | |
rindex = roffset + rbase | |
rmask = rindex < rnumel | |
r2 = rindex | |
tmp0 = r2 + (30878*x1) | |
tmp1 = 401408 | |
tmp2 = tmp0 < tmp1 | |
tmp3 = tl.load(in_ptr0 + ((3136*x0) + (125440*(((r2 + (30878*x1)) // 3136) % 128)) + ((r2 + (30878*x1)) % 3136) + tl.zeros([XBLOCK, RBLOCK], tl.int32)), rmask & tmp2 & xmask, eviction_policy='evict_last', other=0).to(tl.float32) | |
tmp4 = tmp3.to(tl.float32) | |
tmp5 = tl.where(tmp2, tmp4, 0) | |
_tmp6 = tl.where(rmask & xmask, _tmp6 + tmp5, _tmp6) | |
tmp7 = tl.load(in_ptr1 + ((3136*x0) + (125440*(((r2 + (30878*x1)) // 3136) % 128)) + ((r2 + (30878*x1)) % 3136) + tl.zeros([XBLOCK, RBLOCK], tl.int32)), rmask & tmp2 & xmask, eviction_policy='evict_last', other=0).to(tl.float32) | |
tmp8 = tmp7.to(tl.float32) | |
tmp9 = tl.load(in_ptr2 + (x0 + tl.zeros([XBLOCK, RBLOCK], tl.int32)), rmask & tmp2 & xmask, eviction_policy='evict_last', other=0) | |
tmp10 = tmp8 - tmp9 | |
tmp11 = tmp4 * tmp10 | |
tmp12 = tl.where(tmp2, tmp11, 0) | |
_tmp13 = tl.where(rmask & xmask, _tmp13 + tmp12, _tmp13) | |
tmp6 = tl.sum(_tmp6, 1)[:, None] | |
tl.store(out_ptr0 + x3, tmp6, xmask) | |
tmp13 = tl.sum(_tmp13, 1)[:, None] | |
tl.store(out_ptr1 + x3, tmp13, xmask) | |
def get_args(): | |
arg_0 = rand_strided((128, 40, 56, 56), (125440, 3136, 56, 1), device='cuda:0', dtype=torch.float16) | |
arg_1 = rand_strided((128, 40, 56, 56), (125440, 3136, 56, 1), device='cuda:0', dtype=torch.float16) | |
arg_2 = rand_strided((1, 40, 1, 1), (40, 1, 1, 1), device='cuda:0', dtype=torch.float32) | |
arg_3 = rand_strided((40, 13), (1, 40), device='cuda:0', dtype=torch.float32) | |
arg_4 = rand_strided((40, 13), (1, 40), device='cuda:0', dtype=torch.float32) | |
return arg_0, arg_1, arg_2, arg_3, arg_4, | |
def call(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
stream0 = get_cuda_stream(0) | |
triton_red_fused__native_batch_norm_legit_functional_native_batch_norm_backward_203.run(*args, 520, 30878, grid=grid(520), stream=stream0) | |
def benchmark_all_configs(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
return triton_red_fused__native_batch_norm_legit_functional_native_batch_norm_backward_203.benchmark_all_configs(*args, 520, 30878, grid=grid(520)) | |
if __name__ == '__main__': | |
from torch._inductor.utils import get_num_bytes | |
from triton.testing import do_bench | |
args = get_args() | |
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9 | |
gb_per_s = num_gb / (ms / 1e3) | |
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
''') | |
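# ---------------------------------------------------------------------------
# Illustrative sketch (ours, not inductor output): the reduction kernel above
# is the first stage of a split reduction for batch-norm backward.  The N*H*W
# axis (128*56*56 = 401408 elements) is cut into 13 chunks of 30878, and each
# program accumulates, per channel, sum(grad) and sum(grad * (x - mean)) over
# its chunk, emitting (40, 13) partials for a follow-up kernel to combine.
# A minimal PyTorch sketch of the combined quantities (function name is
# hypothetical):
def bn_backward_partial_sums_sketch(grad_out, x, mean):
    # grad_out, x: (128, 40, 56, 56) fp16; mean: (1, 40, 1, 1) fp32
    g = grad_out.float()
    xc = x.float() - mean
    sum_g = g.sum(dim=(0, 2, 3))           # out_ptr0 partials, after combining
    sum_gxc = (g * xc).sum(dim=(0, 2, 3))  # out_ptr1 partials, after combining
    return sum_g, sum_gxc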
# kernel path: /tmp/torchinductor_shunting/qg/cqg6aa46opip4hczjffhjwy2jut2oaxxzpy4j2v3w3jdazqvqeyy.py | |
# Original ATen: aten.native_batch_norm_backward | |
# aten.native_batch_norm_backward => convert_element_type_771, sum_148 | |
triton_per_fused_native_batch_norm_backward_204 = async_compile.triton(''' | |
import triton | |
import triton.language as tl | |
from torch._inductor.ir import ReductionHint | |
from torch._inductor.ir import TileHint | |
from torch._inductor.triton_heuristics import persistent_reduction | |
from torch._inductor.utils import instance_descriptor | |
from torch._dynamo.testing import rand_strided | |
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
import torch | |
from torch._inductor.triton_heuristics import grid | |
@persistent_reduction( | |
size_hints=[64, 16], | |
reduction_hint=ReductionHint.INNER, | |
filename=__file__, | |
meta={'signature': {0: '*fp32', 1: '*fp32', 2: 'i32', 3: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1), equal_to_1=())]} | |
) | |
@triton.jit | |
def triton_per_fused_native_batch_norm_backward_204(in_ptr0, out_ptr0, xnumel, rnumel, XBLOCK : tl.constexpr): | |
xnumel = 40 | |
rnumel = 13 | |
RBLOCK: tl.constexpr = 16 | |
xoffset = tl.program_id(0) * XBLOCK | |
xindex = xoffset + tl.arange(0, XBLOCK)[:, None] | |
xmask = xindex < xnumel | |
rindex = tl.arange(0, RBLOCK)[None, :] | |
rmask = rindex < rnumel | |
r1 = rindex | |
x0 = xindex | |
tmp0 = tl.load(in_ptr0 + (x0 + (40*r1)), rmask & xmask, other=0) | |
tmp2 = tl.where(rmask & xmask, tmp0, 0) | |
tmp3 = tl.sum(tmp2, 1)[:, None] | |
tl.store(out_ptr0 + x0, tmp3, xmask) | |
def get_args(): | |
arg_0 = rand_strided((40, 13), (1, 40), device='cuda:0', dtype=torch.float32) | |
arg_1 = rand_strided((40,), (1,), device='cuda:0', dtype=torch.float32) | |
return arg_0, arg_1, | |
def call(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
stream0 = get_cuda_stream(0) | |
triton_per_fused_native_batch_norm_backward_204.run(*args, 40, 13, grid=grid(40), stream=stream0) | |
def benchmark_all_configs(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
return triton_per_fused_native_batch_norm_backward_204.benchmark_all_configs(*args, 40, 13, grid=grid(40)) | |
if __name__ == '__main__': | |
from torch._inductor.utils import get_num_bytes | |
from triton.testing import do_bench | |
args = get_args() | |
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9 | |
gb_per_s = num_gb / (ms / 1e3) | |
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
''') | |
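# Illustrative sketch (ours, not inductor output): the persistent reduction
# above is the second stage of the split reduction, collapsing the (40, 13)
# fp32 partials into one (40,) per-channel sum:
def combine_partials_sketch(partials):
    # partials: (C, num_splits) fp32, here (40, 13) -> (40,)
    return partials.sum(dim=1)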
# kernel path: /tmp/torchinductor_shunting/w5/cw57xxiivmhsirfvmevlwba5lmcxoxn7w2ijdenksrtwpezynqel.py | |
# Original ATen: aten._native_batch_norm_legit_functional, aten.native_batch_norm_backward | |
# aten._native_batch_norm_legit_functional => convert_element_type_32 | |
# aten.native_batch_norm_backward => convert_element_type_771, mul_1119, mul_1127, sub_318, sum_149 | |
triton_per_fused__native_batch_norm_legit_functional_native_batch_norm_backward_205 = async_compile.triton(''' | |
import triton | |
import triton.language as tl | |
from torch._inductor.ir import ReductionHint | |
from torch._inductor.ir import TileHint | |
from torch._inductor.triton_heuristics import persistent_reduction | |
from torch._inductor.utils import instance_descriptor | |
from torch._dynamo.testing import rand_strided | |
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
import torch | |
from torch._inductor.triton_heuristics import grid | |
@persistent_reduction( | |
size_hints=[64, 16], | |
reduction_hint=ReductionHint.INNER, | |
filename=__file__, | |
meta={'signature': {0: '*fp32', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: 'i32', 5: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3), equal_to_1=())]} | |
) | |
@triton.jit | |
def triton_per_fused__native_batch_norm_legit_functional_native_batch_norm_backward_205(in_ptr0, in_ptr1, out_ptr0, out_ptr1, xnumel, rnumel, XBLOCK : tl.constexpr): | |
xnumel = 40 | |
rnumel = 13 | |
RBLOCK: tl.constexpr = 16 | |
xoffset = tl.program_id(0) * XBLOCK | |
xindex = xoffset + tl.arange(0, XBLOCK)[:, None] | |
xmask = xindex < xnumel | |
rindex = tl.arange(0, RBLOCK)[None, :] | |
rmask = rindex < rnumel | |
r1 = rindex | |
x0 = xindex | |
tmp0 = tl.load(in_ptr0 + (x0 + (40*r1)), rmask & xmask, other=0) | |
tmp4 = tl.load(in_ptr1 + (x0), xmask) | |
tmp2 = tl.where(rmask & xmask, tmp0, 0) | |
tmp3 = tl.sum(tmp2, 1)[:, None] | |
tmp5 = tmp3 * tmp4 | |
tl.store(out_ptr1 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp5, xmask) | |
tl.store(out_ptr0 + x0, tmp3, xmask) | |
def get_args(): | |
arg_0 = rand_strided((40, 13), (1, 40), device='cuda:0', dtype=torch.float32) | |
arg_1 = rand_strided((40,), (1,), device='cuda:0', dtype=torch.float32) | |
arg_2 = rand_strided((40,), (1,), device='cuda:0', dtype=torch.float32) | |
arg_3 = rand_strided((40,), (1,), device='cuda:0', dtype=torch.float32) | |
return arg_0, arg_1, arg_2, arg_3, | |
def call(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
stream0 = get_cuda_stream(0) | |
triton_per_fused__native_batch_norm_legit_functional_native_batch_norm_backward_205.run(*args, 40, 13, grid=grid(40), stream=stream0) | |
def benchmark_all_configs(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
return triton_per_fused__native_batch_norm_legit_functional_native_batch_norm_backward_205.benchmark_all_configs(*args, 40, 13, grid=grid(40)) | |
if __name__ == '__main__': | |
from torch._inductor.utils import get_num_bytes | |
from triton.testing import do_bench | |
args = get_args() | |
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9 | |
gb_per_s = num_gb / (ms / 1e3) | |
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
''') | |
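# Illustrative sketch (ours, not inductor output): the same second-stage
# combine, but this kernel additionally scales the combined sum by a
# per-channel vector (in_ptr1).  Assuming the usual batch-norm backward
# recipe, that vector is the saved invstd = 1/sqrt(var + eps), which would
# make the scaled output the weight gradient:
def combine_and_scale_sketch(partials, invstd):
    # partials: (40, 13) fp32; invstd: (40,) fp32 (assumed saved from forward)
    s = partials.sum(dim=1)  # out_ptr0
    return s, s * invstd     # out_ptr1: grad_weight under the assumption above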
# kernel path: /tmp/torchinductor_shunting/dv/cdvvwqm7hd5zalycntmifgkymej3zfk5pkr7u7cto2iazs4jivjb.py | |
# Original ATen: aten._native_batch_norm_legit_functional, aten.native_batch_norm_backward | |
# aten._native_batch_norm_legit_functional => convert_element_type_32 | |
# aten.native_batch_norm_backward => convert_element_type_771, convert_element_type_773, mul_1125, mul_1126, sub_318, sub_320, sub_321 | |
triton_poi_fused__native_batch_norm_legit_functional_native_batch_norm_backward_206 = async_compile.triton(''' | |
import triton | |
import triton.language as tl | |
from torch._inductor.ir import ReductionHint | |
from torch._inductor.ir import TileHint | |
from torch._inductor.triton_heuristics import pointwise | |
from torch._inductor.utils import instance_descriptor | |
from torch._dynamo.testing import rand_strided | |
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
import torch | |
from torch._inductor.triton_heuristics import grid | |
@pointwise(size_hints=[16777216], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp32', 6: '*fp32', 7: '*fp16', 8: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8), equal_to_1=())]}) | |
@triton.jit | |
def triton_poi_fused__native_batch_norm_legit_functional_native_batch_norm_backward_206(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, in_ptr6, out_ptr0, xnumel, XBLOCK : tl.constexpr): | |
xnumel = 16056320 | |
xoffset = tl.program_id(0) * XBLOCK | |
xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
xmask = xindex < xnumel | |
x3 = xindex | |
x1 = (xindex // 3136) % 40 | |
tmp0 = tl.load(in_ptr0 + (x3), None).to(tl.float32) | |
tmp2 = tl.load(in_ptr1 + (x3), None).to(tl.float32) | |
tmp4 = tl.load(in_ptr2 + (x1), None) | |
tmp6 = tl.load(in_ptr3 + (x1), None) | |
tmp9 = tl.load(in_ptr4 + (x1), None) | |
tmp14 = tl.load(in_ptr5 + (x1), None) | |
tmp17 = tl.load(in_ptr6 + (x1), None) | |
tmp1 = tmp0.to(tl.float32) | |
tmp3 = tmp2.to(tl.float32) | |
tmp5 = tmp3 - tmp4 | |
tmp7 = 2.4912308673469386e-06 | |
tmp8 = tmp6 * tmp7 | |
tmp10 = tmp9 * tmp9 | |
tmp11 = tmp8 * tmp10 | |
tmp12 = tmp5 * tmp11 | |
tmp13 = tmp1 - tmp12 | |
tmp15 = tmp14 * tmp7 | |
tmp16 = tmp13 - tmp15 | |
tmp18 = tmp9 * tmp17 | |
tmp19 = tmp16 * tmp18 | |
tmp20 = tmp19.to(tl.float32) | |
tl.store(out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp20, None) | |
def get_args(): | |
arg_0 = rand_strided((128, 40, 56, 56), (125440, 3136, 56, 1), device='cuda:0', dtype=torch.float16) | |
arg_1 = rand_strided((128, 40, 56, 56), (125440, 3136, 56, 1), device='cuda:0', dtype=torch.float16) | |
arg_2 = rand_strided((1, 40, 1, 1), (40, 1, 1, 1), device='cuda:0', dtype=torch.float32) | |
arg_3 = rand_strided((40,), (1,), device='cuda:0', dtype=torch.float32) | |
arg_4 = rand_strided((40,), (1,), device='cuda:0', dtype=torch.float32) | |
arg_5 = rand_strided((40,), (1,), device='cuda:0', dtype=torch.float32) | |
arg_6 = rand_strided((40,), (1,), device='cuda:0', dtype=torch.float32) | |
arg_7 = rand_strided((128, 40, 56, 56), (125440, 3136, 56, 1), device='cuda:0', dtype=torch.float16) | |
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7, | |
def call(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
stream0 = get_cuda_stream(0) | |
triton_poi_fused__native_batch_norm_legit_functional_native_batch_norm_backward_206.run(*args, 16056320, grid=grid(16056320), stream=stream0) | |
def benchmark_all_configs(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
return triton_poi_fused__native_batch_norm_legit_functional_native_batch_norm_backward_206.benchmark_all_configs(*args, 16056320, grid=grid(16056320)) | |
if __name__ == '__main__': | |
from torch._inductor.utils import get_num_bytes | |
from triton.testing import do_bench | |
args = get_args() | |
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9 | |
gb_per_s = num_gb / (ms / 1e3) | |
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
''') | |
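# Illustrative sketch (ours, not inductor output): the pointwise kernel above
# is the standard batch-norm backward grad_input formula; the constant
# 2.4912308673469386e-06 is 1/401408 = 1/(N*H*W) = 1/(128*56*56).  Reading
# the pointers off get_args() and the arithmetic, our inferred labels are:
# in_ptr0 = grad_out, in_ptr1 = saved input, in_ptr2 = mean,
# in_ptr3 = sum(grad*(x-mean)), in_ptr4 = invstd, in_ptr5 = sum(grad),
# in_ptr6 = weight.
def bn_backward_grad_input_sketch(g, x, mean, sum_gxc, invstd, sum_g, weight):
    # g, x: (128, 40, 56, 56) fp16; mean: (1, 40, 1, 1) fp32; the rest: (40,)
    n = g.shape[0] * g.shape[2] * g.shape[3]  # 401408
    c = g.shape[1]
    sum_gxc, invstd, sum_g, weight = (t.view(1, c, 1, 1)
                                      for t in (sum_gxc, invstd, sum_g, weight))
    gf, xc = g.float(), x.float() - mean
    gi = (gf - xc * (sum_gxc / n) * invstd.pow(2) - sum_g / n) * (invstd * weight)
    return gi.half()  # out_ptr0 is fp16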
# kernel path: /tmp/torchinductor_shunting/vc/cvcxf7i3sst3omwctckvgjvv4eozcbdv5gjqefaeqscrapf62for.py | |
# Original ATen: aten._to_copy | |
# aten._to_copy => convert_element_type_774 | |
triton_poi_fused__to_copy_207 = async_compile.triton(''' | |
import triton | |
import triton.language as tl | |
from torch._inductor.ir import ReductionHint | |
from torch._inductor.ir import TileHint | |
from torch._inductor.triton_heuristics import pointwise | |
from torch._inductor.utils import instance_descriptor | |
from torch._dynamo.testing import rand_strided | |
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
import torch | |
from torch._inductor.triton_heuristics import grid | |
@pointwise(size_hints=[2048], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]}) | |
@triton.jit | |
def triton_poi_fused__to_copy_207(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr): | |
xnumel = 1200 | |
xoffset = tl.program_id(0) * XBLOCK | |
xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
xmask = xindex < xnumel | |
x0 = xindex | |
tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32) | |
tmp1 = tmp0.to(tl.float32) | |
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask) | |
def get_args(): | |
arg_0 = rand_strided((20, 60, 1, 1), (60, 1, 1, 1), device='cuda:0', dtype=torch.float16) | |
arg_1 = rand_strided((20, 60, 1, 1), (60, 1, 1, 1), device='cuda:0', dtype=torch.float32) | |
return arg_0, arg_1, | |
def call(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
stream0 = get_cuda_stream(0) | |
triton_poi_fused__to_copy_207.run(*args, 1200, grid=grid(1200), stream=stream0) | |
def benchmark_all_configs(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
return triton_poi_fused__to_copy_207.benchmark_all_configs(*args, 1200, grid=grid(1200)) | |
if __name__ == '__main__': | |
from torch._inductor.utils import get_num_bytes | |
from triton.testing import do_bench | |
args = get_args() | |
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9 | |
gb_per_s = num_gb / (ms / 1e3) | |
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
''') | |
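# Illustrative sketch (ours, not inductor output): a pure dtype copy.  The
# 1200 elements are a (20, 60, 1, 1) conv-weight gradient being upcast from
# fp16 to fp32; the whole kernel is equivalent to:
def upcast_sketch(w):
    return w.float()  # same as w.to(torch.float32)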
# kernel path: /tmp/torchinductor_shunting/lf/clf4irjcvwl42oxnz5c33v4neup72mhsx7mu2i6zkgdhpnxli2bq.py | |
# Original ATen: aten.cat | |
# aten.cat => cat_77 | |
triton_poi_fused_cat_208 = async_compile.triton(''' | |
import triton | |
import triton.language as tl | |
from torch._inductor.ir import ReductionHint | |
from torch._inductor.ir import TileHint | |
from torch._inductor.triton_heuristics import pointwise | |
from torch._inductor.utils import instance_descriptor | |
from torch._dynamo.testing import rand_strided | |
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
import torch | |
from torch._inductor.triton_heuristics import grid | |
@pointwise(size_hints=[33554432], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]}) | |
@triton.jit | |
def triton_poi_fused_cat_208(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr): | |
xnumel = 24084480 | |
xoffset = tl.program_id(0) * XBLOCK | |
xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
xmask = xindex < xnumel | |
x2 = xindex | |
x0 = xindex % 188160 | |
x1 = (xindex // 188160) | |
tmp0 = tl.load(in_ptr0 + (x2), None).to(tl.float32) | |
tl.store(out_ptr0 + (x0 + (376320*x1) + tl.zeros([XBLOCK], tl.int32)), tmp0, None) | |
def get_args(): | |
arg_0 = rand_strided((128, 60, 56, 56), (188160, 3136, 56, 1), device='cuda:0', dtype=torch.float16) | |
arg_1 = rand_strided((128, 60, 56, 56), (376320, 3136, 56, 1), device='cuda:0', dtype=torch.float16) | |
return arg_0, arg_1, | |
def call(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
stream0 = get_cuda_stream(0) | |
triton_poi_fused_cat_208.run(*args, 24084480, grid=grid(24084480), stream=stream0) | |
def benchmark_all_configs(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
return triton_poi_fused_cat_208.benchmark_all_configs(*args, 24084480, grid=grid(24084480)) | |
if __name__ == '__main__': | |
from torch._inductor.utils import get_num_bytes | |
from triton.testing import do_bench | |
args = get_args() | |
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9 | |
gb_per_s = num_gb / (ms / 1e3) | |
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
''') | |
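# Illustrative sketch (ours, not inductor output): inductor lowers torch.cat
# into one copy kernel per input, each writing into a strided view of the
# result.  This kernel writes a (128, 60, 56, 56) tensor into the first 60
# channels of a (128, 120, 56, 56) buffer (batch stride 376320 = 120*56*56);
# sibling kernels fill the remaining channels.  Net effect, assuming two
# 60-channel inputs:
def cat_sketch(a, b):
    # a, b: (128, 60, 56, 56) fp16 -> (128, 120, 56, 56) fp16
    return torch.cat([a, b], dim=1)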
# kernel path: /tmp/torchinductor_shunting/66/c66uzpn2lu6yurhfwgr2ruj4qag4lzfc5mizci4jocjyt6zpqdbn.py | |
# Original ATen: aten._native_batch_norm_legit_functional, aten.native_batch_norm_backward, aten.threshold_backward | |
# aten._native_batch_norm_legit_functional => convert_element_type_28 | |
# aten.native_batch_norm_backward => convert_element_type_776, mul_1128, sub_322, sum_150, sum_151 | |
# aten.threshold_backward => scalar_tensor, where_1 | |
triton_red_fused__native_batch_norm_legit_functional_native_batch_norm_backward_threshold_backward_209 = async_compile.triton(''' | |
import triton | |
import triton.language as tl | |
from torch._inductor.ir import ReductionHint | |
from torch._inductor.ir import TileHint | |
from torch._inductor.triton_heuristics import reduction | |
from torch._inductor.utils import instance_descriptor | |
from torch._dynamo.testing import rand_strided | |
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
import torch | |
from torch._inductor.triton_heuristics import grid | |
@reduction( | |
size_hints=[512, 131072], | |
reduction_hint=ReductionHint.INNER, | |
filename=__file__, | |
meta={'signature': {0: '*i1', 1: '*fp16', 2: '*fp16', 3: '*fp32', 4: '*fp32', 5: '*fp32', 6: 'i32', 7: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7), equal_to_1=())]} | |
) | |
@triton.jit | |
def triton_red_fused__native_batch_norm_legit_functional_native_batch_norm_backward_threshold_backward_209(in_ptr0, in_ptr1, in_ptr2, in_ptr3, out_ptr0, out_ptr1, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr): | |
xnumel = 480 | |
rnumel = 100352 | |
xoffset = tl.program_id(0) * XBLOCK | |
xindex = xoffset + tl.arange(0, XBLOCK)[:, None] | |
xmask = xindex < xnumel | |
rbase = tl.arange(0, RBLOCK)[None, :] | |
x0 = xindex % 120 | |
x1 = (xindex // 120) | |
_tmp5 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0 | |
x3 = xindex | |
tmp8 = tl.load(in_ptr3 + (x0), xmask) | |
_tmp11 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0 | |
for roffset in range(0, rnumel, RBLOCK): | |
rindex = roffset + rbase | |
rmask = rindex < rnumel | |
r2 = rindex | |
tmp0 = tl.load(in_ptr0 + ((3136*x0) + (376320*(r2 // 3136)) + (12042240*x1) + (r2 % 3136)), rmask & xmask, eviction_policy='evict_last') | |
tmp2 = tl.load(in_ptr1 + ((3136*x0) + (376320*(r2 // 3136)) + (12042240*x1) + (r2 % 3136)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32) | |
tmp6 = tl.load(in_ptr2 + ((3136*x0) + (376320*(r2 // 3136)) + (12042240*x1) + (r2 % 3136)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32) | |
tmp1 = 0.0 | |
tmp3 = tl.where(tmp0, tmp1, tmp2) | |
tmp4 = tmp3.to(tl.float32) | |
_tmp5 = tl.where(rmask & xmask, _tmp5 + tmp4, _tmp5) | |
tmp7 = tmp6.to(tl.float32) | |
tmp9 = tmp7 - tmp8 | |
tmp10 = tmp4 * tmp9 | |
_tmp11 = tl.where(rmask & xmask, _tmp11 + tmp10, _tmp11) | |
tmp5 = tl.sum(_tmp5, 1)[:, None] | |
tl.store(out_ptr0 + x3, tmp5, xmask) | |
tmp11 = tl.sum(_tmp11, 1)[:, None] | |
tl.store(out_ptr1 + x3, tmp11, xmask) | |
def get_args(): | |
arg_0 = rand_strided((128, 120, 56, 56), (376320, 3136, 56, 1), device='cuda:0', dtype=torch.bool) | |
arg_1 = rand_strided((128, 120, 56, 56), (376320, 3136, 56, 1), device='cuda:0', dtype=torch.float16) | |
arg_2 = rand_strided((128, 120, 56, 56), (376320, 3136, 56, 1), device='cuda:0', dtype=torch.float16) | |
arg_3 = rand_strided((1, 120, 1, 1), (120, 1, 1, 1), device='cuda:0', dtype=torch.float32) | |
arg_4 = rand_strided((120, 4), (1, 120), device='cuda:0', dtype=torch.float32) | |
arg_5 = rand_strided((120, 4), (1, 120), device='cuda:0', dtype=torch.float32) | |
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, | |
def call(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
stream0 = get_cuda_stream(0) | |
triton_red_fused__native_batch_norm_legit_functional_native_batch_norm_backward_threshold_backward_209.run(*args, 480, 100352, grid=grid(480), stream=stream0) | |
def benchmark_all_configs(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
return triton_red_fused__native_batch_norm_legit_functional_native_batch_norm_backward_threshold_backward_209.benchmark_all_configs(*args, 480, 100352, grid=grid(480)) | |
if __name__ == '__main__': | |
from torch._inductor.utils import get_num_bytes | |
from triton.testing import do_bench | |
args = get_args() | |
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9 | |
gb_per_s = num_gb / (ms / 1e3) | |
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
''') | |
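# Illustrative sketch (ours, not inductor output): this fuses the ReLU
# backward (threshold_backward, via the precomputed boolean "output <= 0"
# mask in in_ptr0) with the first stage of the batch-norm backward split
# reduction; the 401408-element axis is cut into 4 chunks of 100352,
# yielding (120, 4) partials:
def relu_bn_partials_sketch(le_mask, grad_out, x, mean):
    # le_mask: bool, True where the forward ReLU output was <= 0
    g = grad_out.masked_fill(le_mask, 0).float()
    xc = x.float() - mean
    return g.sum(dim=(0, 2, 3)), (g * xc).sum(dim=(0, 2, 3))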
# kernel path: /tmp/torchinductor_shunting/g3/cg3eh3z2tmue4l5xmjpqccp3tic4aro7pb346j3pv4ncaujbyl3q.py | |
# Original ATen: aten.native_batch_norm_backward, aten.threshold_backward | |
# aten.native_batch_norm_backward => convert_element_type_776, sum_150 | |
# aten.threshold_backward => scalar_tensor, where_1 | |
triton_per_fused_native_batch_norm_backward_threshold_backward_210 = async_compile.triton(''' | |
import triton | |
import triton.language as tl | |
from torch._inductor.ir import ReductionHint | |
from torch._inductor.ir import TileHint | |
from torch._inductor.triton_heuristics import persistent_reduction | |
from torch._inductor.utils import instance_descriptor | |
from torch._dynamo.testing import rand_strided | |
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
import torch | |
from torch._inductor.triton_heuristics import grid | |
@persistent_reduction( | |
size_hints=[128, 4], | |
reduction_hint=ReductionHint.INNER, | |
filename=__file__, | |
meta={'signature': {0: '*fp32', 1: '*fp32', 2: 'i32', 3: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1), equal_to_1=())]} | |
) | |
@triton.jit | |
def triton_per_fused_native_batch_norm_backward_threshold_backward_210(in_ptr0, out_ptr0, xnumel, rnumel, XBLOCK : tl.constexpr): | |
xnumel = 120 | |
rnumel = 4 | |
RBLOCK: tl.constexpr = 4 | |
xoffset = tl.program_id(0) * XBLOCK | |
xindex = xoffset + tl.arange(0, XBLOCK)[:, None] | |
xmask = xindex < xnumel | |
rindex = tl.arange(0, RBLOCK)[None, :] | |
rmask = rindex < rnumel | |
r1 = rindex | |
x0 = xindex | |
tmp0 = tl.load(in_ptr0 + (x0 + (120*r1)), rmask & xmask, other=0) | |
tmp2 = tl.where(rmask & xmask, tmp0, 0) | |
tmp3 = tl.sum(tmp2, 1)[:, None] | |
tl.store(out_ptr0 + x0, tmp3, xmask) | |
def get_args(): | |
arg_0 = rand_strided((120, 4), (1, 120), device='cuda:0', dtype=torch.float32) | |
arg_1 = rand_strided((120,), (1,), device='cuda:0', dtype=torch.float32) | |
return arg_0, arg_1, | |
def call(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
stream0 = get_cuda_stream(0) | |
triton_per_fused_native_batch_norm_backward_threshold_backward_210.run(*args, 120, 4, grid=grid(120), stream=stream0) | |
def benchmark_all_configs(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
return triton_per_fused_native_batch_norm_backward_threshold_backward_210.benchmark_all_configs(*args, 120, 4, grid=grid(120)) | |
if __name__ == '__main__': | |
from torch._inductor.utils import get_num_bytes | |
from triton.testing import do_bench | |
args = get_args() | |
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9 | |
gb_per_s = num_gb / (ms / 1e3) | |
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
''') | |
# kernel path: /tmp/torchinductor_shunting/km/ckmhads4vxru2lkra3kkkolwnx3zpb76bwogj7vxbrxuac26qgvv.py | |
# Original ATen: aten._native_batch_norm_legit_functional, aten.native_batch_norm_backward, aten.threshold_backward | |
# aten._native_batch_norm_legit_functional => convert_element_type_28 | |
# aten.native_batch_norm_backward => convert_element_type_776, mul_1128, mul_1136, sub_322, sum_151 | |
# aten.threshold_backward => scalar_tensor, where_1 | |
triton_per_fused__native_batch_norm_legit_functional_native_batch_norm_backward_threshold_backward_211 = async_compile.triton(''' | |
import triton | |
import triton.language as tl | |
from torch._inductor.ir import ReductionHint | |
from torch._inductor.ir import TileHint | |
from torch._inductor.triton_heuristics import persistent_reduction | |
from torch._inductor.utils import instance_descriptor | |
from torch._dynamo.testing import rand_strided | |
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
import torch | |
from torch._inductor.triton_heuristics import grid | |
@persistent_reduction( | |
size_hints=[128, 4], | |
reduction_hint=ReductionHint.INNER, | |
filename=__file__, | |
meta={'signature': {0: '*fp32', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: 'i32', 5: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3), equal_to_1=())]} | |
) | |
@triton.jit | |
def triton_per_fused__native_batch_norm_legit_functional_native_batch_norm_backward_threshold_backward_211(in_ptr0, in_ptr1, out_ptr0, out_ptr1, xnumel, rnumel, XBLOCK : tl.constexpr): | |
xnumel = 120 | |
rnumel = 4 | |
RBLOCK: tl.constexpr = 4 | |
xoffset = tl.program_id(0) * XBLOCK | |
xindex = xoffset + tl.arange(0, XBLOCK)[:, None] | |
xmask = xindex < xnumel | |
rindex = tl.arange(0, RBLOCK)[None, :] | |
rmask = rindex < rnumel | |
r1 = rindex | |
x0 = xindex | |
tmp0 = tl.load(in_ptr0 + (x0 + (120*r1)), rmask & xmask, other=0) | |
tmp4 = tl.load(in_ptr1 + (x0), xmask) | |
tmp2 = tl.where(rmask & xmask, tmp0, 0) | |
tmp3 = tl.sum(tmp2, 1)[:, None] | |
tmp5 = tmp3 * tmp4 | |
tl.store(out_ptr1 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp5, xmask) | |
tl.store(out_ptr0 + x0, tmp3, xmask) | |
def get_args(): | |
arg_0 = rand_strided((120, 4), (1, 120), device='cuda:0', dtype=torch.float32) | |
arg_1 = rand_strided((120,), (1,), device='cuda:0', dtype=torch.float32) | |
arg_2 = rand_strided((120,), (1,), device='cuda:0', dtype=torch.float32) | |
arg_3 = rand_strided((120,), (1,), device='cuda:0', dtype=torch.float32) | |
return arg_0, arg_1, arg_2, arg_3, | |
def call(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
stream0 = get_cuda_stream(0) | |
triton_per_fused__native_batch_norm_legit_functional_native_batch_norm_backward_threshold_backward_211.run(*args, 120, 4, grid=grid(120), stream=stream0) | |
def benchmark_all_configs(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
return triton_per_fused__native_batch_norm_legit_functional_native_batch_norm_backward_threshold_backward_211.benchmark_all_configs(*args, 120, 4, grid=grid(120)) | |
if __name__ == '__main__': | |
from torch._inductor.utils import get_num_bytes | |
from triton.testing import do_bench | |
args = get_args() | |
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9 | |
gb_per_s = num_gb / (ms / 1e3) | |
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
''') | |
# kernel path: /tmp/torchinductor_shunting/xl/cxle66ypftv4la7qrvf43secbqldxij7psgczidhyjvaxrz7kzkz.py | |
# Original ATen: aten._native_batch_norm_legit_functional, aten.convolution_backward, aten.native_batch_norm_backward, aten.threshold_backward | |
# aten._native_batch_norm_legit_functional => convert_element_type_28 | |
# aten.convolution_backward => convolution_backward_142 | |
# aten.native_batch_norm_backward => convert_element_type_776, convert_element_type_778, mul_1134, mul_1135, sub_322, sub_324, sub_325 | |
# aten.threshold_backward => scalar_tensor, where_1 | |
triton_poi_fused__native_batch_norm_legit_functional_convolution_backward_native_batch_norm_backward_threshold_backward_212 = async_compile.triton(''' | |
import triton | |
import triton.language as tl | |
from torch._inductor.ir import ReductionHint | |
from torch._inductor.ir import TileHint | |
from torch._inductor.triton_heuristics import pointwise | |
from torch._inductor.utils import instance_descriptor | |
from torch._dynamo.testing import rand_strided | |
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
import torch | |
from torch._inductor.triton_heuristics import grid | |
@pointwise(size_hints=[67108864], filename=__file__, meta={'signature': {0: '*fp16', 1: '*i1', 2: '*fp16', 3: '*fp32', 4: '*fp32', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8), equal_to_1=())]}) | |
@triton.jit | |
def triton_poi_fused__native_batch_norm_legit_functional_convolution_backward_native_batch_norm_backward_threshold_backward_212(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, in_ptr6, xnumel, XBLOCK : tl.constexpr): | |
xnumel = 48168960 | |
xoffset = tl.program_id(0) * XBLOCK | |
xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
xmask = xindex < xnumel | |
x3 = xindex | |
x1 = (xindex // 3136) % 120 | |
tmp0 = tl.load(in_ptr0 + (x3), None) | |
tmp2 = tl.load(in_out_ptr0 + (x3), None).to(tl.float32) | |
tmp5 = tl.load(in_ptr1 + (x3), None).to(tl.float32) | |
tmp7 = tl.load(in_ptr2 + (x1), None) | |
tmp9 = tl.load(in_ptr3 + (x1), None) | |
tmp12 = tl.load(in_ptr4 + (x1), None) | |
tmp17 = tl.load(in_ptr5 + (x1), None) | |
tmp20 = tl.load(in_ptr6 + (x1), None) | |
tmp1 = 0.0 | |
tmp3 = tl.where(tmp0, tmp1, tmp2) | |
tmp4 = tmp3.to(tl.float32) | |
tmp6 = tmp5.to(tl.float32) | |
tmp8 = tmp6 - tmp7 | |
tmp10 = 2.4912308673469386e-06 | |
tmp11 = tmp9 * tmp10 | |
tmp13 = tmp12 * tmp12 | |
tmp14 = tmp11 * tmp13 | |
tmp15 = tmp8 * tmp14 | |
tmp16 = tmp4 - tmp15 | |
tmp18 = tmp17 * tmp10 | |
tmp19 = tmp16 - tmp18 | |
tmp21 = tmp12 * tmp20 | |
tmp22 = tmp19 * tmp21 | |
tmp23 = tmp22.to(tl.float32) | |
tl.store(in_out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp23, None) | |
def get_args(): | |
arg_0 = rand_strided((128, 120, 56, 56), (376320, 3136, 56, 1), device='cuda:0', dtype=torch.float16) | |
arg_1 = rand_strided((128, 120, 56, 56), (376320, 3136, 56, 1), device='cuda:0', dtype=torch.bool) | |
arg_2 = rand_strided((128, 120, 56, 56), (376320, 3136, 56, 1), device='cuda:0', dtype=torch.float16) | |
arg_3 = rand_strided((1, 120, 1, 1), (120, 1, 1, 1), device='cuda:0', dtype=torch.float32) | |
arg_4 = rand_strided((120,), (1,), device='cuda:0', dtype=torch.float32) | |
arg_5 = rand_strided((120,), (1,), device='cuda:0', dtype=torch.float32) | |
arg_6 = rand_strided((120,), (1,), device='cuda:0', dtype=torch.float32) | |
arg_7 = rand_strided((120,), (1,), device='cuda:0', dtype=torch.float32) | |
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7, | |
def call(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
stream0 = get_cuda_stream(0) | |
triton_poi_fused__native_batch_norm_legit_functional_convolution_backward_native_batch_norm_backward_threshold_backward_212.run(*args, 48168960, grid=grid(48168960), stream=stream0) | |
def benchmark_all_configs(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
return triton_poi_fused__native_batch_norm_legit_functional_convolution_backward_native_batch_norm_backward_threshold_backward_212.benchmark_all_configs(*args, 48168960, grid=grid(48168960)) | |
if __name__ == '__main__': | |
from torch._inductor.utils import get_num_bytes | |
from triton.testing import do_bench | |
args = get_args() | |
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9 | |
gb_per_s = num_gb / (ms / 1e3) | |
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
''') | |
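# Illustrative sketch (ours, not inductor output): grad_input for the batch
# norm as in the earlier pointwise kernel, here fused with the ReLU mask and
# done in place: in_out_ptr0 holds grad_out on entry and grad_input on exit,
# which is why the benchmark above passes num_in_out_args=1.
def relu_bn_grad_input_sketch(g, le_mask, x, mean, sum_gxc, invstd, sum_g, weight):
    n = g.shape[0] * g.shape[2] * g.shape[3]  # 401408
    c = g.shape[1]
    sum_gxc, invstd, sum_g, weight = (t.view(1, c, 1, 1)
                                      for t in (sum_gxc, invstd, sum_g, weight))
    gf = g.masked_fill(le_mask, 0).float()
    xc = x.float() - mean
    gi = (gf - xc * (sum_gxc / n) * invstd.pow(2) - sum_g / n) * (invstd * weight)
    return gi.half()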
# kernel path: /tmp/torchinductor_shunting/qz/cqzek4z2thr6mvast3kh2b2tldjmwhpsef2pyz5zmusldwkpkngu.py | |
# Original ATen: aten._native_batch_norm_legit_functional, aten.native_batch_norm_backward, aten.threshold_backward | |
# aten._native_batch_norm_legit_functional => convert_element_type_25 | |
# aten.native_batch_norm_backward => convert_element_type_780, mul_1137, sub_326, sum_152, sum_153 | |
# aten.threshold_backward => le_2, scalar_tensor, where_2 | |
triton_red_fused__native_batch_norm_legit_functional_native_batch_norm_backward_threshold_backward_213 = async_compile.triton(''' | |
import triton | |
import triton.language as tl | |
from torch._inductor.ir import ReductionHint | |
from torch._inductor.ir import TileHint | |
from torch._inductor.triton_heuristics import reduction | |
from torch._inductor.utils import instance_descriptor | |
from torch._dynamo.testing import rand_strided | |
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
import torch | |
from torch._inductor.triton_heuristics import grid | |
@reduction( | |
size_hints=[512, 131072], | |
reduction_hint=ReductionHint.INNER, | |
filename=__file__, | |
meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp32', 4: '*fp32', 5: '*fp32', 6: 'i32', 7: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7), equal_to_1=())]} | |
) | |
@triton.jit | |
def triton_red_fused__native_batch_norm_legit_functional_native_batch_norm_backward_threshold_backward_213(in_ptr0, in_ptr1, in_ptr2, in_ptr3, out_ptr0, out_ptr1, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr): | |
xnumel = 480 | |
rnumel = 100352 | |
xoffset = tl.program_id(0) * XBLOCK | |
xindex = xoffset + tl.arange(0, XBLOCK)[:, None] | |
xmask = xindex < xnumel | |
rbase = tl.arange(0, RBLOCK)[None, :] | |
x0 = xindex % 120 | |
x1 = (xindex // 120) | |
_tmp6 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0 | |
x3 = xindex | |
tmp9 = tl.load(in_ptr3 + (x0), xmask) | |
_tmp12 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0 | |
for roffset in range(0, rnumel, RBLOCK): | |
rindex = roffset + rbase | |
rmask = rindex < rnumel | |
r2 = rindex | |
tmp0 = tl.load(in_ptr0 + ((3136*x0) + (376320*(r2 // 3136)) + (12042240*x1) + (r2 % 3136)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32) | |
tmp3 = tl.load(in_ptr1 + ((3136*x0) + (376320*(r2 // 3136)) + (12042240*x1) + (r2 % 3136)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32) | |
tmp7 = tl.load(in_ptr2 + ((3136*x0) + (376320*(r2 // 3136)) + (12042240*x1) + (r2 % 3136)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32) | |
tmp1 = 0.0 | |
tmp2 = tmp0 <= tmp1 | |
tmp4 = tl.where(tmp2, tmp1, tmp3) | |
tmp5 = tmp4.to(tl.float32) | |
_tmp6 = tl.where(rmask & xmask, _tmp6 + tmp5, _tmp6) | |
tmp8 = tmp7.to(tl.float32) | |
tmp10 = tmp8 - tmp9 | |
tmp11 = tmp5 * tmp10 | |
_tmp12 = tl.where(rmask & xmask, _tmp12 + tmp11, _tmp12) | |
tmp6 = tl.sum(_tmp6, 1)[:, None] | |
tl.store(out_ptr0 + x3, tmp6, xmask) | |
tmp12 = tl.sum(_tmp12, 1)[:, None] | |
tl.store(out_ptr1 + x3, tmp12, xmask) | |
def get_args(): | |
arg_0 = rand_strided((128, 120, 56, 56), (376320, 3136, 56, 1), device='cuda:0', dtype=torch.float16) | |
arg_1 = rand_strided((128, 120, 56, 56), (376320, 3136, 56, 1), device='cuda:0', dtype=torch.float16) | |
arg_2 = rand_strided((128, 120, 56, 56), (376320, 3136, 56, 1), device='cuda:0', dtype=torch.float16) | |
arg_3 = rand_strided((1, 120, 1, 1), (120, 1, 1, 1), device='cuda:0', dtype=torch.float32) | |
arg_4 = rand_strided((120, 4), (1, 120), device='cuda:0', dtype=torch.float32) | |
arg_5 = rand_strided((120, 4), (1, 120), device='cuda:0', dtype=torch.float32) | |
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, | |
def call(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
stream0 = get_cuda_stream(0) | |
triton_red_fused__native_batch_norm_legit_functional_native_batch_norm_backward_threshold_backward_213.run(*args, 480, 100352, grid=grid(480), stream=stream0) | |
def benchmark_all_configs(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
return triton_red_fused__native_batch_norm_legit_functional_native_batch_norm_backward_threshold_backward_213.benchmark_all_configs(*args, 480, 100352, grid=grid(480)) | |
if __name__ == '__main__': | |
from torch._inductor.utils import get_num_bytes | |
from triton.testing import do_bench | |
args = get_args() | |
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9 | |
gb_per_s = num_gb / (ms / 1e3) | |
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
''') | |
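# Illustrative sketch (ours, not inductor output): the same fused
# ReLU-backward + batch-norm partial sums as kernel 209 above, except the
# mask is recomputed inline from the saved activation (in_ptr0 <= 0, the
# le_2 node) rather than read from a precomputed boolean tensor:
def relu_bn_partials_inline_mask_sketch(act, grad_out, x, mean):
    g = grad_out.masked_fill(act <= 0, 0).float()
    xc = x.float() - mean
    return g.sum(dim=(0, 2, 3)), (g * xc).sum(dim=(0, 2, 3))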
# kernel path: /tmp/torchinductor_shunting/2e/c2enjqp5awzvfk4uybzbydnq3q4m7rptxovs2flh22mf4fsrv3do.py | |
# Original ATen: aten._native_batch_norm_legit_functional, aten.native_batch_norm_backward, aten.threshold_backward | |
# aten._native_batch_norm_legit_functional => convert_element_type_25 | |
# aten.native_batch_norm_backward => convert_element_type_780, convert_element_type_782, mul_1143, mul_1144, sub_326, sub_328, sub_329 | |
# aten.threshold_backward => le_2, scalar_tensor, where_2 | |
triton_poi_fused__native_batch_norm_legit_functional_native_batch_norm_backward_threshold_backward_214 = async_compile.triton(''' | |
import triton | |
import triton.language as tl | |
from torch._inductor.ir import ReductionHint | |
from torch._inductor.ir import TileHint | |
from torch._inductor.triton_heuristics import pointwise | |
from torch._inductor.utils import instance_descriptor | |
from torch._dynamo.testing import rand_strided | |
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
import torch | |
from torch._inductor.triton_heuristics import grid | |
@pointwise(size_hints=[67108864], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp32', 4: '*fp32', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8), equal_to_1=())]}) | |
@triton.jit | |
def triton_poi_fused__native_batch_norm_legit_functional_native_batch_norm_backward_threshold_backward_214(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, in_ptr6, xnumel, XBLOCK : tl.constexpr): | |
xnumel = 48168960 | |
xoffset = tl.program_id(0) * XBLOCK | |
xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
xmask = xindex < xnumel | |
x3 = xindex | |
x1 = (xindex // 3136) % 120 | |
tmp0 = tl.load(in_ptr0 + (x3), None).to(tl.float32) | |
tmp3 = tl.load(in_out_ptr0 + (x3), None).to(tl.float32) | |
tmp6 = tl.load(in_ptr1 + (x3), None).to(tl.float32) | |
tmp8 = tl.load(in_ptr2 + (x1), None) | |
tmp10 = tl.load(in_ptr3 + (x1), None) | |
tmp13 = tl.load(in_ptr4 + (x1), None) | |
tmp18 = tl.load(in_ptr5 + (x1), None) | |
tmp21 = tl.load(in_ptr6 + (x1), None) | |
tmp1 = 0.0 | |
tmp2 = tmp0 <= tmp1 | |
tmp4 = tl.where(tmp2, tmp1, tmp3) | |
tmp5 = tmp4.to(tl.float32) | |
tmp7 = tmp6.to(tl.float32) | |
tmp9 = tmp7 - tmp8 | |
tmp11 = 2.4912308673469386e-06 | |
tmp12 = tmp10 * tmp11 | |
tmp14 = tmp13 * tmp13 | |
tmp15 = tmp12 * tmp14 | |
tmp16 = tmp9 * tmp15 | |
tmp17 = tmp5 - tmp16 | |
tmp19 = tmp18 * tmp11 | |
tmp20 = tmp17 - tmp19 | |
tmp22 = tmp13 * tmp21 | |
tmp23 = tmp20 * tmp22 | |
tmp24 = tmp23.to(tl.float32) | |
tl.store(in_out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp24, None) | |
def get_args(): | |
arg_0 = rand_strided((128, 120, 56, 56), (376320, 3136, 56, 1), device='cuda:0', dtype=torch.float16) | |
arg_1 = rand_strided((128, 120, 56, 56), (376320, 3136, 56, 1), device='cuda:0', dtype=torch.float16) | |
arg_2 = rand_strided((128, 120, 56, 56), (376320, 3136, 56, 1), device='cuda:0', dtype=torch.float16) | |
arg_3 = rand_strided((1, 120, 1, 1), (120, 1, 1, 1), device='cuda:0', dtype=torch.float32) | |
arg_4 = rand_strided((120,), (1,), device='cuda:0', dtype=torch.float32) | |
arg_5 = rand_strided((120,), (1,), device='cuda:0', dtype=torch.float32) | |
arg_6 = rand_strided((120,), (1,), device='cuda:0', dtype=torch.float32) | |
arg_7 = rand_strided((120,), (1,), device='cuda:0', dtype=torch.float32) | |
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7, | |
def call(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
stream0 = get_cuda_stream(0) | |
triton_poi_fused__native_batch_norm_legit_functional_native_batch_norm_backward_threshold_backward_214.run(*args, 48168960, grid=grid(48168960), stream=stream0) | |
def benchmark_all_configs(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
return triton_poi_fused__native_batch_norm_legit_functional_native_batch_norm_backward_threshold_backward_214.benchmark_all_configs(*args, 48168960, grid=grid(48168960)) | |
if __name__ == '__main__': | |
from torch._inductor.utils import get_num_bytes | |
from triton.testing import do_bench | |
args = get_args() | |
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9 | |
gb_per_s = num_gb / (ms / 1e3) | |
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
''') | |
# kernel path: /tmp/torchinductor_shunting/3v/c3v2rhl4qsc7fw7fs2xmnkmc35qejrs64lo33ym5g2mwij4wsjpo.py | |
# Original ATen: aten._to_copy | |
# aten._to_copy => convert_element_type_783 | |
triton_poi_fused__to_copy_215 = async_compile.triton(''' | |
import triton | |
import triton.language as tl | |
from torch._inductor.ir import ReductionHint | |
from torch._inductor.ir import TileHint | |
from torch._inductor.triton_heuristics import pointwise | |
from torch._inductor.utils import instance_descriptor | |
from torch._dynamo.testing import rand_strided | |
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
import torch | |
from torch._inductor.triton_heuristics import grid | |
@pointwise(size_hints=[2048], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]}) | |
@triton.jit | |
def triton_poi_fused__to_copy_215(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr): | |
xnumel = 1200 | |
xoffset = tl.program_id(0) * XBLOCK | |
xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
xmask = xindex < xnumel | |
x0 = xindex | |
tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32) | |
tmp1 = tmp0.to(tl.float32) | |
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask) | |
def get_args(): | |
arg_0 = rand_strided((60, 20, 1, 1), (20, 1, 1, 1), device='cuda:0', dtype=torch.float16) | |
arg_1 = rand_strided((60, 20, 1, 1), (20, 1, 1, 1), device='cuda:0', dtype=torch.float32) | |
return arg_0, arg_1, | |
def call(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
stream0 = get_cuda_stream(0) | |
triton_poi_fused__to_copy_215.run(*args, 1200, grid=grid(1200), stream=stream0) | |
def benchmark_all_configs(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
return triton_poi_fused__to_copy_215.benchmark_all_configs(*args, 1200, grid=grid(1200)) | |
if __name__ == '__main__': | |
from torch._inductor.utils import get_num_bytes | |
from triton.testing import do_bench | |
args = get_args() | |
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9 | |
gb_per_s = num_gb / (ms / 1e3) | |
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
''') | |
# kernel path: /tmp/torchinductor_shunting/lt/cltkzmq5awdw5gjenomtexkpai5uaptb73szgcptdcapnmaagqxc.py | |
# Original ATen: aten.cat | |
# aten.cat => cat_78 | |
triton_poi_fused_cat_216 = async_compile.triton(''' | |
import triton | |
import triton.language as tl | |
from torch._inductor.ir import ReductionHint | |
from torch._inductor.ir import TileHint | |
from torch._inductor.triton_heuristics import pointwise | |
from torch._inductor.utils import instance_descriptor | |
from torch._dynamo.testing import rand_strided | |
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
import torch | |
from torch._inductor.triton_heuristics import grid | |
@pointwise(size_hints=[8388608], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]}) | |
@triton.jit | |
def triton_poi_fused_cat_216(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr): | |
xnumel = 8028160 | |
xoffset = tl.program_id(0) * XBLOCK | |
xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
xmask = xindex < xnumel | |
x2 = xindex | |
x0 = xindex % 62720 | |
x1 = (xindex // 62720) | |
tmp0 = tl.load(in_ptr0 + (x2), None).to(tl.float32) | |
tl.store(out_ptr0 + (x0 + (125440*x1) + tl.zeros([XBLOCK], tl.int32)), tmp0, None) | |
def get_args(): | |
arg_0 = rand_strided((128, 20, 56, 56), (62720, 3136, 56, 1), device='cuda:0', dtype=torch.float16) | |
arg_1 = rand_strided((128, 20, 56, 56), (125440, 3136, 56, 1), device='cuda:0', dtype=torch.float16) | |
return arg_0, arg_1, | |
def call(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
stream0 = get_cuda_stream(0) | |
triton_poi_fused_cat_216.run(*args, 8028160, grid=grid(8028160), stream=stream0) | |
def benchmark_all_configs(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
return triton_poi_fused_cat_216.benchmark_all_configs(*args, 8028160, grid=grid(8028160)) | |
if __name__ == '__main__': | |
from torch._inductor.utils import get_num_bytes | |
from triton.testing import do_bench | |
args = get_args() | |
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9 | |
gb_per_s = num_gb / (ms / 1e3) | |
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
''') | |
# kernel path: /tmp/torchinductor_shunting/fi/cfi243azbcb4gaimfcgujhtcrh3fg6wgryrmkebaxlazlenz7yvt.py | |
# Original ATen: aten._native_batch_norm_legit_functional, aten.add, aten.native_batch_norm_backward | |
# aten._native_batch_norm_legit_functional => convert_element_type_21 | |
# aten.add => add_380 | |
# aten.native_batch_norm_backward => convert_element_type_785, mul_1146, sub_330, sum_154, sum_155 | |
triton_red_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_217 = async_compile.triton(''' | |
import triton | |
import triton.language as tl | |
from torch._inductor.ir import ReductionHint | |
from torch._inductor.ir import TileHint | |
from torch._inductor.triton_heuristics import reduction | |
from torch._inductor.utils import instance_descriptor | |
from torch._dynamo.testing import rand_strided | |
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
import torch | |
from torch._inductor.triton_heuristics import grid | |
@reduction( | |
size_hints=[1024, 32768], | |
reduction_hint=ReductionHint.INNER, | |
filename=__file__, | |
meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp32', 4: '*fp32', 5: '*fp32', 6: 'i32', 7: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5), equal_to_1=())]} | |
) | |
@triton.jit | |
def triton_red_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_217(in_ptr0, in_ptr1, in_ptr2, in_ptr3, out_ptr0, out_ptr1, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr): | |
xnumel = 520 | |
rnumel = 30878 | |
xoffset = tl.program_id(0) * XBLOCK | |
xindex = xoffset + tl.arange(0, XBLOCK)[:, None] | |
xmask = xindex < xnumel | |
rbase = tl.arange(0, RBLOCK)[None, :] | |
x1 = (xindex // 40) | |
x0 = xindex % 40 | |
_tmp8 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0 | |
x3 = xindex | |
_tmp15 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0 | |
for roffset in range(0, rnumel, RBLOCK): | |
rindex = roffset + rbase | |
rmask = rindex < rnumel | |
r2 = rindex | |
tmp0 = r2 + (30878*x1) | |
tmp1 = 401408 | |
tmp2 = tmp0 < tmp1 | |
tmp3 = tl.load(in_ptr0 + ((3136*x0) + (125440*(((r2 + (30878*x1)) // 3136) % 128)) + ((r2 + (30878*x1)) % 3136) + tl.zeros([XBLOCK, RBLOCK], tl.int32)), rmask & tmp2 & xmask, eviction_policy='evict_last', other=0).to(tl.float32) | |
tmp4 = tl.load(in_ptr1 + ((3136*x0) + (125440*(((r2 + (30878*x1)) // 3136) % 128)) + ((r2 + (30878*x1)) % 3136) + tl.zeros([XBLOCK, RBLOCK], tl.int32)), rmask & tmp2 & xmask, eviction_policy='evict_last', other=0).to(tl.float32) | |
tmp5 = tmp3 + tmp4 | |
tmp6 = tmp5.to(tl.float32) | |
tmp7 = tl.where(tmp2, tmp6, 0) | |
_tmp8 = tl.where(rmask & xmask, _tmp8 + tmp7, _tmp8) | |
tmp9 = tl.load(in_ptr2 + ((3136*x0) + (125440*(((r2 + (30878*x1)) // 3136) % 128)) + ((r2 + (30878*x1)) % 3136) + tl.zeros([XBLOCK, RBLOCK], tl.int32)), rmask & tmp2 & xmask, eviction_policy='evict_last', other=0).to(tl.float32) | |
tmp10 = tmp9.to(tl.float32) | |
tmp11 = tl.load(in_ptr3 + (x0 + tl.zeros([XBLOCK, RBLOCK], tl.int32)), rmask & tmp2 & xmask, eviction_policy='evict_last', other=0) | |
tmp12 = tmp10 - tmp11 | |
tmp13 = tmp6 * tmp12 | |
tmp14 = tl.where(tmp2, tmp13, 0) | |
_tmp15 = tl.where(rmask & xmask, _tmp15 + tmp14, _tmp15) | |
tmp8 = tl.sum(_tmp8, 1)[:, None] | |
tl.store(out_ptr0 + x3, tmp8, xmask) | |
tmp15 = tl.sum(_tmp15, 1)[:, None] | |
tl.store(out_ptr1 + x3, tmp15, xmask) | |
def get_args(): | |
arg_0 = rand_strided((128, 40, 56, 56), (125440, 3136, 56, 1), device='cuda:0', dtype=torch.float16) | |
arg_1 = rand_strided((128, 40, 56, 56), (125440, 3136, 56, 1), device='cuda:0', dtype=torch.float16) | |
arg_2 = rand_strided((128, 40, 56, 56), (125440, 3136, 56, 1), device='cuda:0', dtype=torch.float16) | |
arg_3 = rand_strided((1, 40, 1, 1), (40, 1, 1, 1), device='cuda:0', dtype=torch.float32) | |
arg_4 = rand_strided((40, 13), (1, 40), device='cuda:0', dtype=torch.float32) | |
arg_5 = rand_strided((40, 13), (1, 40), device='cuda:0', dtype=torch.float32) | |
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, | |
def call(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
stream0 = get_cuda_stream(0) | |
triton_red_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_217.run(*args, 520, 30878, grid=grid(520), stream=stream0) | |
def benchmark_all_configs(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
return triton_red_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_217.benchmark_all_configs(*args, 520, 30878, grid=grid(520)) | |
if __name__ == '__main__': | |
from torch._inductor.utils import get_num_bytes | |
from triton.testing import do_bench | |
args = get_args() | |
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9 | |
gb_per_s = num_gb / (ms / 1e3) | |
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
''') | |
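# Illustrative sketch (ours, not inductor output): the aten.add here sums two
# incoming gradients (where two backward paths meet, e.g. at a shortcut
# connection) before running the same first-stage batch-norm reduction as
# above:
def add_bn_partials_sketch(g_a, g_b, x, mean):
    g = (g_a + g_b).float()
    xc = x.float() - mean
    return g.sum(dim=(0, 2, 3)), (g * xc).sum(dim=(0, 2, 3))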
# kernel path: /tmp/torchinductor_shunting/lp/clp5ea4udz42bfruecjqk4ujvitf7447q7o62utqlt7g7q4jrw2p.py | |
# Original ATen: aten._native_batch_norm_legit_functional, aten.add, aten.native_batch_norm_backward | |
# aten._native_batch_norm_legit_functional => convert_element_type_21 | |
# aten.add => add_380 | |
# aten.native_batch_norm_backward => convert_element_type_785, convert_element_type_787, mul_1152, mul_1153, sub_330, sub_332, sub_333 | |
triton_poi_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_218 = async_compile.triton(''' | |
import triton | |
import triton.language as tl | |
from torch._inductor.ir import ReductionHint | |
from torch._inductor.ir import TileHint | |
from torch._inductor.triton_heuristics import pointwise | |
from torch._inductor.utils import instance_descriptor | |
from torch._dynamo.testing import rand_strided | |
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
import torch | |
from torch._inductor.triton_heuristics import grid | |
@pointwise(size_hints=[16777216], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp32', 4: '*fp32', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8), equal_to_1=())]}) | |
@triton.jit | |
def triton_poi_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_218(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, in_ptr6, xnumel, XBLOCK : tl.constexpr): | |
xnumel = 16056320 | |
xoffset = tl.program_id(0) * XBLOCK | |
xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
xmask = xindex < xnumel | |
x3 = xindex | |
x1 = (xindex // 3136) % 40 | |
tmp0 = tl.load(in_out_ptr0 + (x3), None).to(tl.float32) | |
tmp1 = tl.load(in_ptr0 + (x3), None).to(tl.float32) | |
tmp4 = tl.load(in_ptr1 + (x3), None).to(tl.float32) | |
tmp6 = tl.load(in_ptr2 + (x1), None) | |
tmp8 = tl.load(in_ptr3 + (x1), None) | |
tmp11 = tl.load(in_ptr4 + (x1), None) | |
tmp16 = tl.load(in_ptr5 + (x1), None) | |
tmp19 = tl.load(in_ptr6 + (x1), None) | |
tmp2 = tmp0 + tmp1 | |
tmp3 = tmp2.to(tl.float32) | |
tmp5 = tmp4.to(tl.float32) | |
tmp7 = tmp5 - tmp6 | |
tmp9 = 2.4912308673469386e-06 | |
tmp10 = tmp8 * tmp9 | |
tmp12 = tmp11 * tmp11 | |
tmp13 = tmp10 * tmp12 | |
tmp14 = tmp7 * tmp13 | |
tmp15 = tmp3 - tmp14 | |
tmp17 = tmp16 * tmp9 | |
tmp18 = tmp15 - tmp17 | |
tmp20 = tmp11 * tmp19 | |
tmp21 = tmp18 * tmp20 | |
tmp22 = tmp21.to(tl.float32) | |
tl.store(in_out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp22, None) | |
def get_args(): | |
arg_0 = rand_strided((128, 40, 56, 56), (125440, 3136, 56, 1), device='cuda:0', dtype=torch.float16) | |
arg_1 = rand_strided((128, 40, 56, 56), (125440, 3136, 56, 1), device='cuda:0', dtype=torch.float16) | |
arg_2 = rand_strided((128, 40, 56, 56), (125440, 3136, 56, 1), device='cuda:0', dtype=torch.float16) | |
arg_3 = rand_strided((1, 40, 1, 1), (40, 1, 1, 1), device='cuda:0', dtype=torch.float32) | |
arg_4 = rand_strided((40,), (1,), device='cuda:0', dtype=torch.float32) | |
arg_5 = rand_strided((40,), (1,), device='cuda:0', dtype=torch.float32) | |
arg_6 = rand_strided((40,), (1,), device='cuda:0', dtype=torch.float32) | |
arg_7 = rand_strided((40,), (1,), device='cuda:0', dtype=torch.float32) | |
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7, | |
def call(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
stream0 = get_cuda_stream(0) | |
triton_poi_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_218.run(*args, 16056320, grid=grid(16056320), stream=stream0) | |
def benchmark_all_configs(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
return triton_poi_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_218.benchmark_all_configs(*args, 16056320, grid=grid(16056320)) | |
if __name__ == '__main__': | |
from torch._inductor.utils import get_num_bytes | |
from triton.testing import do_bench | |
args = get_args() | |
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9 | |
gb_per_s = num_gb / (ms / 1e3) | |
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
''') | |
# kernel path: /tmp/torchinductor_shunting/ub/cubon5pg2aboclxsgfu5ogwsvehrz3yt2ht3ofzn74nj37pfe23y.py | |
# Original ATen: aten._to_copy | |
# aten._to_copy => convert_element_type_788 | |
triton_poi_fused__to_copy_219 = async_compile.triton(''' | |
import triton | |
import triton.language as tl | |
from torch._inductor.ir import ReductionHint | |
from torch._inductor.ir import TileHint | |
from torch._inductor.triton_heuristics import pointwise | |
from torch._inductor.utils import instance_descriptor | |
from torch._dynamo.testing import rand_strided | |
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
import torch | |
from torch._inductor.triton_heuristics import grid | |
@pointwise(size_hints=[2048], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]}) | |
@triton.jit | |
def triton_poi_fused__to_copy_219(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr): | |
xnumel = 1920 | |
xoffset = tl.program_id(0) * XBLOCK | |
xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
xmask = xindex < xnumel | |
x0 = xindex | |
tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32) | |
tmp1 = tmp0.to(tl.float32) | |
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask) | |
def get_args(): | |
arg_0 = rand_strided((20, 96, 1, 1), (96, 1, 1, 1), device='cuda:0', dtype=torch.float16) | |
arg_1 = rand_strided((20, 96, 1, 1), (96, 1, 1, 1), device='cuda:0', dtype=torch.float32) | |
return arg_0, arg_1, | |
def call(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
stream0 = get_cuda_stream(0) | |
triton_poi_fused__to_copy_219.run(*args, 1920, grid=grid(1920), stream=stream0) | |
def benchmark_all_configs(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
return triton_poi_fused__to_copy_219.benchmark_all_configs(*args, 1920, grid=grid(1920)) | |
if __name__ == '__main__': | |
from torch._inductor.utils import get_num_bytes | |
from triton.testing import do_bench | |
args = get_args() | |
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9 | |
gb_per_s = num_gb / (ms / 1e3) | |
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
''') | |
# kernel path: /tmp/torchinductor_shunting/k5/ck52swdvfg2q2o33v23iuvlzxjnegschzojpfprsk4betmydhemy.py | |
# Original ATen: aten.cat | |
# aten.cat => cat_79 | |
triton_poi_fused_cat_220 = async_compile.triton(''' | |
import triton | |
import triton.language as tl | |
from torch._inductor.ir import ReductionHint | |
from torch._inductor.ir import TileHint | |
from torch._inductor.triton_heuristics import pointwise | |
from torch._inductor.utils import instance_descriptor | |
from torch._dynamo.testing import rand_strided | |
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
import torch | |
from torch._inductor.triton_heuristics import grid | |
@pointwise(size_hints=[67108864], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]}) | |
@triton.jit | |
def triton_poi_fused_cat_220(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr): | |
xnumel = 38535168 | |
xoffset = tl.program_id(0) * XBLOCK | |
xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
xmask = xindex < xnumel | |
x2 = xindex | |
x0 = xindex % 301056 | |
x1 = (xindex // 301056) | |
tmp0 = tl.load(in_ptr0 + (x2), None).to(tl.float32) | |
tl.store(out_ptr0 + (x0 + (602112*x1) + tl.zeros([XBLOCK], tl.int32)), tmp0, None) | |
def get_args(): | |
arg_0 = rand_strided((128, 96, 56, 56), (301056, 3136, 56, 1), device='cuda:0', dtype=torch.float16) | |
arg_1 = rand_strided((128, 96, 56, 56), (602112, 3136, 56, 1), device='cuda:0', dtype=torch.float16) | |
return arg_0, arg_1, | |
def call(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
stream0 = get_cuda_stream(0) | |
triton_poi_fused_cat_220.run(*args, 38535168, grid=grid(38535168), stream=stream0) | |
def benchmark_all_configs(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
return triton_poi_fused_cat_220.benchmark_all_configs(*args, 38535168, grid=grid(38535168)) | |
if __name__ == '__main__': | |
from torch._inductor.utils import get_num_bytes | |
from triton.testing import do_bench | |
args = get_args() | |
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9 | |
gb_per_s = num_gb / (ms / 1e3) | |
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
''') | |
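# triton_poi_fused_cat_220 above is the copy half of a channel concat: each
# (96, 56, 56) batch slice of the input is written into an output whose
# per-batch stride is 602112 = 192*56*56, i.e. one 96-channel half of a
# torch.cat along dim=1 into a (128, 192, 56, 56) buffer (the base pointer
# passed by the caller selects which half). A minimal eager sketch,
# assuming the first half:
def _cat_reference(inp, out):
    # inp: (128, 96, 56, 56) fp16; out: (128, 192, 56, 56) fp16
    out[:, :96].copy_(inp)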
# kernel path: /tmp/torchinductor_shunting/sx/csxrwjlnsstdp2sod6mdyrq6fqpsozmjhqryoplixuev7a4zfvvs.py | |
# Original ATen: aten._native_batch_norm_legit_functional, aten.native_batch_norm_backward, aten.threshold_backward | |
# aten._native_batch_norm_legit_functional => convert_element_type_17 | |
# aten.native_batch_norm_backward => convert_element_type_790, mul_1155, sub_334, sum_156, sum_157 | |
# aten.threshold_backward => scalar_tensor, where_3 | |
triton_red_fused__native_batch_norm_legit_functional_native_batch_norm_backward_threshold_backward_221 = async_compile.triton(''' | |
import triton | |
import triton.language as tl | |
from torch._inductor.ir import ReductionHint | |
from torch._inductor.ir import TileHint | |
from torch._inductor.triton_heuristics import reduction | |
from torch._inductor.utils import instance_descriptor | |
from torch._dynamo.testing import rand_strided | |
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
import torch | |
from torch._inductor.triton_heuristics import grid | |
@reduction( | |
size_hints=[1024, 131072], | |
reduction_hint=ReductionHint.INNER, | |
filename=__file__, | |
meta={'signature': {0: '*i1', 1: '*fp16', 2: '*fp16', 3: '*fp32', 4: '*fp32', 5: '*fp32', 6: 'i32', 7: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7), equal_to_1=())]} | |
) | |
@triton.jit | |
def triton_red_fused__native_batch_norm_legit_functional_native_batch_norm_backward_threshold_backward_221(in_ptr0, in_ptr1, in_ptr2, in_ptr3, out_ptr0, out_ptr1, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr): | |
xnumel = 768 | |
rnumel = 100352 | |
xoffset = tl.program_id(0) * XBLOCK | |
xindex = xoffset + tl.arange(0, XBLOCK)[:, None] | |
xmask = xindex < xnumel | |
rbase = tl.arange(0, RBLOCK)[None, :] | |
x0 = xindex % 192 | |
x1 = (xindex // 192) | |
_tmp5 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0 | |
x3 = xindex | |
tmp8 = tl.load(in_ptr3 + (x0), xmask) | |
_tmp11 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0 | |
for roffset in range(0, rnumel, RBLOCK): | |
rindex = roffset + rbase | |
rmask = rindex < rnumel | |
r2 = rindex | |
tmp0 = tl.load(in_ptr0 + ((3136*x0) + (602112*(r2 // 3136)) + (19267584*x1) + (r2 % 3136)), rmask & xmask, eviction_policy='evict_last') | |
tmp2 = tl.load(in_ptr1 + ((3136*x0) + (602112*(r2 // 3136)) + (19267584*x1) + (r2 % 3136)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32) | |
tmp6 = tl.load(in_ptr2 + ((3136*x0) + (602112*(r2 // 3136)) + (19267584*x1) + (r2 % 3136)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32) | |
tmp1 = 0.0 | |
tmp3 = tl.where(tmp0, tmp1, tmp2) | |
tmp4 = tmp3.to(tl.float32) | |
_tmp5 = tl.where(rmask & xmask, _tmp5 + tmp4, _tmp5) | |
tmp7 = tmp6.to(tl.float32) | |
tmp9 = tmp7 - tmp8 | |
tmp10 = tmp4 * tmp9 | |
_tmp11 = tl.where(rmask & xmask, _tmp11 + tmp10, _tmp11) | |
tmp5 = tl.sum(_tmp5, 1)[:, None] | |
tl.store(out_ptr0 + x3, tmp5, xmask) | |
tmp11 = tl.sum(_tmp11, 1)[:, None] | |
tl.store(out_ptr1 + x3, tmp11, xmask) | |
def get_args(): | |
arg_0 = rand_strided((128, 192, 56, 56), (602112, 3136, 56, 1), device='cuda:0', dtype=torch.bool) | |
arg_1 = rand_strided((128, 192, 56, 56), (602112, 3136, 56, 1), device='cuda:0', dtype=torch.float16) | |
arg_2 = rand_strided((128, 192, 56, 56), (602112, 3136, 56, 1), device='cuda:0', dtype=torch.float16) | |
arg_3 = rand_strided((1, 192, 1, 1), (192, 1, 1, 1), device='cuda:0', dtype=torch.float32) | |
arg_4 = rand_strided((192, 4), (1, 192), device='cuda:0', dtype=torch.float32) | |
arg_5 = rand_strided((192, 4), (1, 192), device='cuda:0', dtype=torch.float32) | |
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, | |
def call(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
stream0 = get_cuda_stream(0) | |
triton_red_fused__native_batch_norm_legit_functional_native_batch_norm_backward_threshold_backward_221.run(*args, 768, 100352, grid=grid(768), stream=stream0) | |
def benchmark_all_configs(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
return triton_red_fused__native_batch_norm_legit_functional_native_batch_norm_backward_threshold_backward_221.benchmark_all_configs(*args, 768, 100352, grid=grid(768)) | |
if __name__ == '__main__': | |
from torch._inductor.utils import get_num_bytes | |
from triton.testing import do_bench | |
args = get_args() | |
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9 | |
gb_per_s = num_gb / (ms / 1e3) | |
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
''') | |
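# triton_red_..._221 above is the first stage of a split reduction for batch
# norm backward fused with ReLU's threshold_backward. For each of the 192
# channels, each of 4 splits covers rnumel = 100352 = 128*3136/4 elements
# (xnumel = 768 = 192*4) and accumulates two sums:
#   sum_dy     = sum(grad masked to 0 where the ReLU input was <= 0)
#   sum_dy_xmu = sum(masked grad * (x - mean))
# into (192, 4) partial buffers, which the next two persistent kernels
# collapse. A rough eager sketch of the full (both-stage) reduction:
def _bn_bwd_partial_reference(mask, grad_out, x, mean):
    # mask: bool, True where the ReLU input was <= 0; grad_out, x: (N, C, H, W)
    g = grad_out.masked_fill(mask, 0).float()
    sum_dy = g.sum(dim=(0, 2, 3))
    sum_dy_xmu = (g * (x.float() - mean.view(1, -1, 1, 1))).sum(dim=(0, 2, 3))
    return sum_dy, sum_dy_xmu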
# kernel path: /tmp/torchinductor_shunting/cd/ccdek2lkvoxnmb7pygnnwx2hocehe2swy5lruknc256ae3z7ejqs.py | |
# Original ATen: aten.native_batch_norm_backward, aten.threshold_backward | |
# aten.native_batch_norm_backward => convert_element_type_790, sum_156 | |
# aten.threshold_backward => scalar_tensor, where_3 | |
triton_per_fused_native_batch_norm_backward_threshold_backward_222 = async_compile.triton(''' | |
import triton | |
import triton.language as tl | |
from torch._inductor.ir import ReductionHint | |
from torch._inductor.ir import TileHint | |
from torch._inductor.triton_heuristics import persistent_reduction | |
from torch._inductor.utils import instance_descriptor | |
from torch._dynamo.testing import rand_strided | |
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
import torch | |
from torch._inductor.triton_heuristics import grid | |
@persistent_reduction( | |
size_hints=[256, 4], | |
reduction_hint=ReductionHint.INNER, | |
filename=__file__, | |
meta={'signature': {0: '*fp32', 1: '*fp32', 2: 'i32', 3: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]} | |
) | |
@triton.jit | |
def triton_per_fused_native_batch_norm_backward_threshold_backward_222(in_ptr0, out_ptr0, xnumel, rnumel, XBLOCK : tl.constexpr): | |
xnumel = 192 | |
rnumel = 4 | |
RBLOCK: tl.constexpr = 4 | |
xoffset = tl.program_id(0) * XBLOCK | |
xindex = xoffset + tl.arange(0, XBLOCK)[:, None] | |
xmask = xindex < xnumel | |
rindex = tl.arange(0, RBLOCK)[None, :] | |
rmask = rindex < rnumel | |
r1 = rindex | |
x0 = xindex | |
tmp0 = tl.load(in_ptr0 + (x0 + (192*r1)), rmask & xmask, other=0) | |
tmp2 = tl.where(rmask & xmask, tmp0, 0) | |
tmp3 = tl.sum(tmp2, 1)[:, None] | |
tl.store(out_ptr0 + x0, tmp3, xmask) | |
def get_args(): | |
arg_0 = rand_strided((192, 4), (1, 192), device='cuda:0', dtype=torch.float32) | |
arg_1 = rand_strided((192,), (1,), device='cuda:0', dtype=torch.float32) | |
return arg_0, arg_1, | |
def call(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
stream0 = get_cuda_stream(0) | |
triton_per_fused_native_batch_norm_backward_threshold_backward_222.run(*args, 192, 4, grid=grid(192), stream=stream0) | |
def benchmark_all_configs(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
return triton_per_fused_native_batch_norm_backward_threshold_backward_222.benchmark_all_configs(*args, 192, 4, grid=grid(192)) | |
if __name__ == '__main__': | |
from torch._inductor.utils import get_num_bytes | |
from triton.testing import do_bench | |
args = get_args() | |
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9 | |
gb_per_s = num_gb / (ms / 1e3) | |
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
''') | |
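# triton_per_..._222 above is the matching second stage: a single persistent
# reduction (RBLOCK = 4) that folds the (192, 4) sum_dy partials into the
# per-channel (192,) total, i.e. grad_bias for this batch norm.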
# kernel path: /tmp/torchinductor_shunting/yq/cyqwa3yzhcmbcz6impuv4jmjxs3gz6jyzv6afsu3mcgdp62xtf44.py | |
# Original ATen: aten._native_batch_norm_legit_functional, aten.native_batch_norm_backward, aten.threshold_backward | |
# aten._native_batch_norm_legit_functional => convert_element_type_17 | |
# aten.native_batch_norm_backward => convert_element_type_790, mul_1155, mul_1163, sub_334, sum_157 | |
# aten.threshold_backward => scalar_tensor, where_3 | |
triton_per_fused__native_batch_norm_legit_functional_native_batch_norm_backward_threshold_backward_223 = async_compile.triton(''' | |
import triton | |
import triton.language as tl | |
from torch._inductor.ir import ReductionHint | |
from torch._inductor.ir import TileHint | |
from torch._inductor.triton_heuristics import persistent_reduction | |
from torch._inductor.utils import instance_descriptor | |
from torch._dynamo.testing import rand_strided | |
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
import torch | |
from torch._inductor.triton_heuristics import grid | |
@persistent_reduction( | |
size_hints=[256, 4], | |
reduction_hint=ReductionHint.INNER, | |
filename=__file__, | |
meta={'signature': {0: '*fp32', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: 'i32', 5: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4), equal_to_1=())]} | |
) | |
@triton.jit | |
def triton_per_fused__native_batch_norm_legit_functional_native_batch_norm_backward_threshold_backward_223(in_ptr0, in_ptr1, out_ptr0, out_ptr1, xnumel, rnumel, XBLOCK : tl.constexpr): | |
xnumel = 192 | |
rnumel = 4 | |
RBLOCK: tl.constexpr = 4 | |
xoffset = tl.program_id(0) * XBLOCK | |
xindex = xoffset + tl.arange(0, XBLOCK)[:, None] | |
xmask = xindex < xnumel | |
rindex = tl.arange(0, RBLOCK)[None, :] | |
rmask = rindex < rnumel | |
r1 = rindex | |
x0 = xindex | |
tmp0 = tl.load(in_ptr0 + (x0 + (192*r1)), rmask & xmask, other=0) | |
tmp4 = tl.load(in_ptr1 + (x0), xmask) | |
tmp2 = tl.where(rmask & xmask, tmp0, 0) | |
tmp3 = tl.sum(tmp2, 1)[:, None] | |
tmp5 = tmp3 * tmp4 | |
tl.store(out_ptr1 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp5, xmask) | |
tl.store(out_ptr0 + x0, tmp3, xmask) | |
def get_args(): | |
arg_0 = rand_strided((192, 4), (1, 192), device='cuda:0', dtype=torch.float32) | |
arg_1 = rand_strided((192,), (1,), device='cuda:0', dtype=torch.float32) | |
arg_2 = rand_strided((192,), (1,), device='cuda:0', dtype=torch.float32) | |
arg_3 = rand_strided((192,), (1,), device='cuda:0', dtype=torch.float32) | |
return arg_0, arg_1, arg_2, arg_3, | |
def call(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
stream0 = get_cuda_stream(0) | |
triton_per_fused__native_batch_norm_legit_functional_native_batch_norm_backward_threshold_backward_223.run(*args, 192, 4, grid=grid(192), stream=stream0) | |
def benchmark_all_configs(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
return triton_per_fused__native_batch_norm_legit_functional_native_batch_norm_backward_threshold_backward_223.benchmark_all_configs(*args, 192, 4, grid=grid(192)) | |
if __name__ == '__main__': | |
from torch._inductor.utils import get_num_bytes | |
from triton.testing import do_bench | |
args = get_args() | |
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9 | |
gb_per_s = num_gb / (ms / 1e3) | |
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
''') | |
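# triton_per_..._223 above reduces the (192, 4) sum_dy_xmu partials the same
# way and additionally scales the total by the per-channel invstd (in_ptr1):
#   grad_weight[c] = sum(dy * (x - mean))[c] * invstd[c]
# Both the raw sum (out_ptr0) and the scaled product (out_ptr1) are stored.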
# kernel path: /tmp/torchinductor_shunting/tc/ctc3cbkradccw4562bnllxmu4ahgq5euwmbvdvguaplxo7grkqr2.py | |
# Original ATen: aten._native_batch_norm_legit_functional, aten.native_batch_norm_backward, aten.threshold_backward | |
# aten._native_batch_norm_legit_functional => convert_element_type_17 | |
# aten.native_batch_norm_backward => convert_element_type_790, convert_element_type_792, mul_1161, mul_1162, sub_334, sub_336, sub_337 | |
# aten.threshold_backward => scalar_tensor, where_3 | |
triton_poi_fused__native_batch_norm_legit_functional_native_batch_norm_backward_threshold_backward_224 = async_compile.triton(''' | |
import triton | |
import triton.language as tl | |
from torch._inductor.ir import ReductionHint | |
from torch._inductor.ir import TileHint | |
from torch._inductor.triton_heuristics import pointwise | |
from torch._inductor.utils import instance_descriptor | |
from torch._dynamo.testing import rand_strided | |
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
import torch | |
from torch._inductor.triton_heuristics import grid | |
@pointwise(size_hints=[134217728], filename=__file__, meta={'signature': {0: '*fp16', 1: '*i1', 2: '*fp16', 3: '*fp32', 4: '*fp32', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8), equal_to_1=())]}) | |
@triton.jit | |
def triton_poi_fused__native_batch_norm_legit_functional_native_batch_norm_backward_threshold_backward_224(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, in_ptr6, xnumel, XBLOCK : tl.constexpr): | |
xnumel = 77070336 | |
xoffset = tl.program_id(0) * XBLOCK | |
xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
xmask = xindex < xnumel | |
x3 = xindex | |
x1 = (xindex // 3136) % 192 | |
tmp0 = tl.load(in_ptr0 + (x3), None) | |
tmp2 = tl.load(in_out_ptr0 + (x3), None).to(tl.float32) | |
tmp5 = tl.load(in_ptr1 + (x3), None).to(tl.float32) | |
tmp7 = tl.load(in_ptr2 + (x1), None) | |
tmp9 = tl.load(in_ptr3 + (x1), None) | |
tmp12 = tl.load(in_ptr4 + (x1), None) | |
tmp17 = tl.load(in_ptr5 + (x1), None) | |
tmp20 = tl.load(in_ptr6 + (x1), None) | |
tmp1 = 0.0 | |
tmp3 = tl.where(tmp0, tmp1, tmp2) | |
tmp4 = tmp3.to(tl.float32) | |
tmp6 = tmp5.to(tl.float32) | |
tmp8 = tmp6 - tmp7 | |
tmp10 = 2.4912308673469386e-06 | |
tmp11 = tmp9 * tmp10 | |
tmp13 = tmp12 * tmp12 | |
tmp14 = tmp11 * tmp13 | |
tmp15 = tmp8 * tmp14 | |
tmp16 = tmp4 - tmp15 | |
tmp18 = tmp17 * tmp10 | |
tmp19 = tmp16 - tmp18 | |
tmp21 = tmp12 * tmp20 | |
tmp22 = tmp19 * tmp21 | |
tmp23 = tmp22.to(tl.float32) | |
tl.store(in_out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp23, None) | |
def get_args(): | |
arg_0 = rand_strided((128, 192, 56, 56), (602112, 3136, 56, 1), device='cuda:0', dtype=torch.float16) | |
arg_1 = rand_strided((128, 192, 56, 56), (602112, 3136, 56, 1), device='cuda:0', dtype=torch.bool) | |
arg_2 = rand_strided((128, 192, 56, 56), (602112, 3136, 56, 1), device='cuda:0', dtype=torch.float16) | |
arg_3 = rand_strided((1, 192, 1, 1), (192, 1, 1, 1), device='cuda:0', dtype=torch.float32) | |
arg_4 = rand_strided((192,), (1,), device='cuda:0', dtype=torch.float32) | |
arg_5 = rand_strided((192,), (1,), device='cuda:0', dtype=torch.float32) | |
arg_6 = rand_strided((192,), (1,), device='cuda:0', dtype=torch.float32) | |
arg_7 = rand_strided((192,), (1,), device='cuda:0', dtype=torch.float32) | |
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7, | |
def call(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
stream0 = get_cuda_stream(0) | |
triton_poi_fused__native_batch_norm_legit_functional_native_batch_norm_backward_threshold_backward_224.run(*args, 77070336, grid=grid(77070336), stream=stream0) | |
def benchmark_all_configs(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
return triton_poi_fused__native_batch_norm_legit_functional_native_batch_norm_backward_threshold_backward_224.benchmark_all_configs(*args, 77070336, grid=grid(77070336)) | |
if __name__ == '__main__': | |
from torch._inductor.utils import get_num_bytes | |
from triton.testing import do_bench | |
args = get_args() | |
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9 | |
gb_per_s = num_gb / (ms / 1e3) | |
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
''') | |
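# triton_poi_..._224 above applies the elementwise batch norm backward
# formula (still fused with the ReLU mask) to produce grad_input in place:
#   grad_x = (dy - sum_dy/N - (x - mean) * invstd^2 * sum_dy_xmu/N)
#            * invstd * weight
# with N = 128*56*56 = 401408 elements per channel; the literal
# 2.4912308673469386e-06 in the kernel is exactly 1/401408. A rough eager
# sketch; the argument naming is inferred from the arithmetic above:
def _bn_bwd_input_reference(mask, dy, x, mean, sum_dy_xmu, invstd, sum_dy, weight):
    # dy, x: (N, C, H, W) fp16; mean, sums, invstd, weight: per-channel fp32
    n = dy.numel() // dy.shape[1]  # elements per channel
    g = dy.masked_fill(mask, 0).float()
    xmu = x.float() - mean.view(1, -1, 1, 1)
    t = (g
         - xmu * (invstd * invstd * sum_dy_xmu / n).view(1, -1, 1, 1)
         - (sum_dy / n).view(1, -1, 1, 1))
    return (t * (invstd * weight).view(1, -1, 1, 1)).half()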
# kernel path: /tmp/torchinductor_shunting/ow/cowvpe4d4lhlyuz7t3xpn4aeadwdi4sc2kye37det3aqbobyu2ax.py | |
# Original ATen: aten._to_copy | |
# aten._to_copy => convert_element_type_793 | |
triton_poi_fused__to_copy_225 = async_compile.triton(''' | |
import triton | |
import triton.language as tl | |
from torch._inductor.ir import ReductionHint | |
from torch._inductor.ir import TileHint | |
from torch._inductor.triton_heuristics import pointwise | |
from torch._inductor.utils import instance_descriptor | |
from torch._dynamo.testing import rand_strided | |
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
import torch | |
from torch._inductor.triton_heuristics import grid | |
@pointwise(size_hints=[4096], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]}) | |
@triton.jit | |
def triton_poi_fused__to_copy_225(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr): | |
xnumel = 3136 | |
xoffset = tl.program_id(0) * XBLOCK | |
xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
xmask = xindex < xnumel | |
x0 = xindex | |
tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32) | |
tmp1 = tmp0.to(tl.float32) | |
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask) | |
def get_args(): | |
arg_0 = rand_strided((64, 1, 7, 7), (49, 49, 7, 1), device='cuda:0', dtype=torch.float16) | |
arg_1 = rand_strided((64, 1, 7, 7), (49, 49, 7, 1), device='cuda:0', dtype=torch.float32) | |
return arg_0, arg_1, | |
def call(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
stream0 = get_cuda_stream(0) | |
triton_poi_fused__to_copy_225.run(*args, 3136, grid=grid(3136), stream=stream0) | |
def benchmark_all_configs(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
return triton_poi_fused__to_copy_225.benchmark_all_configs(*args, 3136, grid=grid(3136)) | |
if __name__ == '__main__': | |
from torch._inductor.utils import get_num_bytes | |
from triton.testing import do_bench | |
args = get_args() | |
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9 | |
gb_per_s = num_gb / (ms / 1e3) | |
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
''') | |
# kernel path: /tmp/torchinductor_shunting/sk/cskbzrlfrwse6dagb7nizhjaccr2s25mcslu277prit4fw5smhxb.py | |
# Original ATen: aten._to_copy | |
# aten._to_copy => convert_element_type_794 | |
triton_poi_fused__to_copy_226 = async_compile.triton(''' | |
import triton | |
import triton.language as tl | |
from torch._inductor.ir import ReductionHint | |
from torch._inductor.ir import TileHint | |
from torch._inductor.triton_heuristics import pointwise | |
from torch._inductor.utils import instance_descriptor | |
from torch._dynamo.testing import rand_strided | |
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
import torch | |
from torch._inductor.triton_heuristics import grid | |
@pointwise(size_hints=[2048], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]}) | |
@triton.jit | |
def triton_poi_fused__to_copy_226(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr): | |
xnumel = 1600 | |
xoffset = tl.program_id(0) * XBLOCK | |
xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
xmask = xindex < xnumel | |
x0 = xindex | |
tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32) | |
tmp1 = tmp0.to(tl.float32) | |
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask) | |
def get_args(): | |
arg_0 = rand_strided((64, 1, 5, 5), (25, 25, 5, 1), device='cuda:0', dtype=torch.float16) | |
arg_1 = rand_strided((64, 1, 5, 5), (25, 25, 5, 1), device='cuda:0', dtype=torch.float32) | |
return arg_0, arg_1, | |
def call(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
stream0 = get_cuda_stream(0) | |
triton_poi_fused__to_copy_226.run(*args, 1600, grid=grid(1600), stream=stream0) | |
def benchmark_all_configs(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
return triton_poi_fused__to_copy_226.benchmark_all_configs(*args, 1600, grid=grid(1600)) | |
if __name__ == '__main__': | |
from torch._inductor.utils import get_num_bytes | |
from triton.testing import do_bench | |
args = get_args() | |
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9 | |
gb_per_s = num_gb / (ms / 1e3) | |
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
''') | |
# kernel path: /tmp/torchinductor_shunting/i3/ci3rhipt5njk6wniwbb724hbiv53ph2lumy42nnfncwqxny7w55l.py | |
# Original ATen: aten._to_copy | |
# aten._to_copy => convert_element_type_795 | |
triton_poi_fused__to_copy_227 = async_compile.triton(''' | |
import triton | |
import triton.language as tl | |
from torch._inductor.ir import ReductionHint | |
from torch._inductor.ir import TileHint | |
from torch._inductor.triton_heuristics import pointwise | |
from torch._inductor.utils import instance_descriptor | |
from torch._dynamo.testing import rand_strided | |
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
import torch | |
from torch._inductor.triton_heuristics import grid | |
@pointwise(size_hints=[1024], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]}) | |
@triton.jit | |
def triton_poi_fused__to_copy_227(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr): | |
xnumel = 576 | |
xoffset = tl.program_id(0) * XBLOCK | |
xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
xmask = xindex < xnumel | |
x0 = xindex | |
tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32) | |
tmp1 = tmp0.to(tl.float32) | |
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask) | |
def get_args(): | |
arg_0 = rand_strided((64, 1, 3, 3), (9, 9, 3, 1), device='cuda:0', dtype=torch.float16) | |
arg_1 = rand_strided((64, 1, 3, 3), (9, 9, 3, 1), device='cuda:0', dtype=torch.float32) | |
return arg_0, arg_1, | |
def call(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
stream0 = get_cuda_stream(0) | |
triton_poi_fused__to_copy_227.run(*args, 576, grid=grid(576), stream=stream0) | |
def benchmark_all_configs(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
return triton_poi_fused__to_copy_227.benchmark_all_configs(*args, 576, grid=grid(576)) | |
if __name__ == '__main__': | |
from torch._inductor.utils import get_num_bytes | |
from triton.testing import do_bench | |
args = get_args() | |
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9 | |
gb_per_s = num_gb / (ms / 1e3) | |
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
''') | |
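# Kernels 225-227 above are the same fp16 -> fp32 cast as kernel 219, applied
# to the three depthwise conv weights of this block: (64, 1, 7, 7),
# (64, 1, 5, 5) and (64, 1, 3, 3); only xnumel (3136 / 1600 / 576) differs.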
# kernel path: /tmp/torchinductor_shunting/pk/cpkbt4sa6m63phgjx2padg7pol23qjiy476umuxagu2pnouljl7g.py | |
# Original ATen: aten.cat | |
# aten.cat => cat_80 | |
triton_poi_fused_cat_228 = async_compile.triton(''' | |
import triton | |
import triton.language as tl | |
from torch._inductor.ir import ReductionHint | |
from torch._inductor.ir import TileHint | |
from torch._inductor.triton_heuristics import pointwise | |
from torch._inductor.utils import instance_descriptor | |
from torch._dynamo.testing import rand_strided | |
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
import torch | |
from torch._inductor.triton_heuristics import grid | |
@pointwise(size_hints=[134217728], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]}) | |
@triton.jit | |
def triton_poi_fused_cat_228(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr): | |
xnumel = 102760448 | |
xoffset = tl.program_id(0) * XBLOCK | |
xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
xmask = xindex < xnumel | |
x2 = xindex | |
x0 = xindex % 802816 | |
x1 = (xindex // 802816) | |
tmp0 = tl.load(in_ptr0 + (x2), None).to(tl.float32) | |
tl.store(out_ptr0 + (x0 + (2408448*x1) + tl.zeros([XBLOCK], tl.int32)), tmp0, None) | |
def get_args(): | |
arg_0 = rand_strided((128, 64, 112, 112), (802816, 12544, 112, 1), device='cuda:0', dtype=torch.float16) | |
arg_1 = rand_strided((128, 64, 112, 112), (2408448, 12544, 112, 1), device='cuda:0', dtype=torch.float16) | |
return arg_0, arg_1, | |
def call(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
stream0 = get_cuda_stream(0) | |
triton_poi_fused_cat_228.run(*args, 102760448, grid=grid(102760448), stream=stream0) | |
def benchmark_all_configs(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
return triton_poi_fused_cat_228.benchmark_all_configs(*args, 102760448, grid=grid(102760448)) | |
if __name__ == '__main__': | |
from torch._inductor.utils import get_num_bytes | |
from triton.testing import do_bench | |
args = get_args() | |
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9 | |
gb_per_s = num_gb / (ms / 1e3) | |
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
''') | |
# kernel path: /tmp/torchinductor_shunting/iq/ciqjlz62phm2ticzaasbypqp5l7x2nk7uwgl64uxeslxjnvr5rhx.py | |
# Original ATen: aten._native_batch_norm_legit_functional, aten.native_batch_norm_backward, aten.threshold_backward | |
# aten._native_batch_norm_legit_functional => convert_element_type_12 | |
# aten.native_batch_norm_backward => convert_element_type_796, mul_1164, sub_338, sum_158, sum_159 | |
# aten.threshold_backward => scalar_tensor, where_4 | |
triton_red_fused__native_batch_norm_legit_functional_native_batch_norm_backward_threshold_backward_229 = async_compile.triton(''' | |
import triton | |
import triton.language as tl | |
from torch._inductor.ir import ReductionHint | |
from torch._inductor.ir import TileHint | |
from torch._inductor.triton_heuristics import reduction | |
from torch._inductor.utils import instance_descriptor | |
from torch._dynamo.testing import rand_strided | |
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
import torch | |
from torch._inductor.triton_heuristics import grid | |
@reduction( | |
size_hints=[4096, 131072], | |
reduction_hint=ReductionHint.INNER, | |
filename=__file__, | |
meta={'signature': {0: '*i1', 1: '*fp16', 2: '*fp16', 3: '*fp32', 4: '*fp32', 5: '*fp32', 6: 'i32', 7: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6), equal_to_1=())]} | |
) | |
@triton.jit | |
def triton_red_fused__native_batch_norm_legit_functional_native_batch_norm_backward_threshold_backward_229(in_ptr0, in_ptr1, in_ptr2, in_ptr3, out_ptr0, out_ptr1, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr): | |
xnumel = 2496 | |
rnumel = 123511 | |
xoffset = tl.program_id(0) * XBLOCK | |
xindex = xoffset + tl.arange(0, XBLOCK)[:, None] | |
xmask = xindex < xnumel | |
rbase = tl.arange(0, RBLOCK)[None, :] | |
x1 = (xindex // 192) | |
x0 = xindex % 192 | |
_tmp9 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0 | |
x3 = xindex | |
_tmp16 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0 | |
for roffset in range(0, rnumel, RBLOCK): | |
rindex = roffset + rbase | |
rmask = rindex < rnumel | |
r2 = rindex | |
tmp0 = r2 + (123511*x1) | |
tmp1 = 1605632 | |
tmp2 = tmp0 < tmp1 | |
tmp3 = tl.load(in_ptr0 + ((12544*x0) + (2408448*(((r2 + (123511*x1)) // 12544) % 128)) + ((r2 + (123511*x1)) % 12544) + tl.zeros([XBLOCK, RBLOCK], tl.int32)), rmask & tmp2 & xmask, eviction_policy='evict_last') | |
tmp4 = 0.0 | |
tmp5 = tl.load(in_ptr1 + ((12544*x0) + (2408448*(((r2 + (123511*x1)) // 12544) % 128)) + ((r2 + (123511*x1)) % 12544) + tl.zeros([XBLOCK, RBLOCK], tl.int32)), rmask & tmp2 & xmask, eviction_policy='evict_last', other=0).to(tl.float32) | |
tmp6 = tl.where(tmp3, tmp4, tmp5) | |
tmp7 = tmp6.to(tl.float32) | |
tmp8 = tl.where(tmp2, tmp7, 0) | |
_tmp9 = tl.where(rmask & xmask, _tmp9 + tmp8, _tmp9) | |
tmp10 = tl.load(in_ptr2 + ((12544*x0) + (2408448*(((r2 + (123511*x1)) // 12544) % 128)) + ((r2 + (123511*x1)) % 12544) + tl.zeros([XBLOCK, RBLOCK], tl.int32)), rmask & tmp2 & xmask, eviction_policy='evict_last', other=0).to(tl.float32) | |
tmp11 = tmp10.to(tl.float32) | |
tmp12 = tl.load(in_ptr3 + (x0 + tl.zeros([XBLOCK, RBLOCK], tl.int32)), rmask & tmp2 & xmask, eviction_policy='evict_last', other=0) | |
tmp13 = tmp11 - tmp12 | |
tmp14 = tmp7 * tmp13 | |
tmp15 = tl.where(tmp2, tmp14, 0) | |
_tmp16 = tl.where(rmask & xmask, _tmp16 + tmp15, _tmp16) | |
tmp9 = tl.sum(_tmp9, 1)[:, None] | |
tl.store(out_ptr0 + x3, tmp9, xmask) | |
tmp16 = tl.sum(_tmp16, 1)[:, None] | |
tl.store(out_ptr1 + x3, tmp16, xmask) | |
def get_args(): | |
arg_0 = rand_strided((128, 192, 112, 112), (2408448, 12544, 112, 1), device='cuda:0', dtype=torch.bool) | |
arg_1 = rand_strided((128, 192, 112, 112), (2408448, 12544, 112, 1), device='cuda:0', dtype=torch.float16) | |
arg_2 = rand_strided((128, 192, 112, 112), (2408448, 12544, 112, 1), device='cuda:0', dtype=torch.float16) | |
arg_3 = rand_strided((1, 192, 1, 1), (192, 1, 1, 1), device='cuda:0', dtype=torch.float32) | |
arg_4 = rand_strided((192, 13), (1, 192), device='cuda:0', dtype=torch.float32) | |
arg_5 = rand_strided((192, 13), (1, 192), device='cuda:0', dtype=torch.float32) | |
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, | |
def call(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
stream0 = get_cuda_stream(0) | |
triton_red_fused__native_batch_norm_legit_functional_native_batch_norm_backward_threshold_backward_229.run(*args, 2496, 123511, grid=grid(2496), stream=stream0) | |
def benchmark_all_configs(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
return triton_red_fused__native_batch_norm_legit_functional_native_batch_norm_backward_threshold_backward_229.benchmark_all_configs(*args, 2496, 123511, grid=grid(2496)) | |
if __name__ == '__main__': | |
from torch._inductor.utils import get_num_bytes | |
from triton.testing import do_bench | |
args = get_args() | |
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9 | |
gb_per_s = num_gb / (ms / 1e3) | |
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
''') | |
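# Kernel 229 above is the same two-sum reduction as kernel 221, but at
# 112x112 resolution the 1605632 (= 128*112*112) elements per channel are
# split 13 ways with rnumel = 123511. Since 13 * 123511 = 1605643 overshoots
# by 11 elements, every load is guarded by tmp2 = (r2 + 123511*x1) < 1605632,
# which masks the overhang in the last split. Partials land in (192, 13)
# buffers.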
# kernel path: /tmp/torchinductor_shunting/ys/cysnmvaph32l2fx4uq56auk76xzeogfqlpd5wl5alqcd32enjld7.py | |
# Original ATen: aten.native_batch_norm_backward, aten.threshold_backward | |
# aten.native_batch_norm_backward => convert_element_type_796, sum_158 | |
# aten.threshold_backward => scalar_tensor, where_4 | |
triton_per_fused_native_batch_norm_backward_threshold_backward_230 = async_compile.triton(''' | |
import triton | |
import triton.language as tl | |
from torch._inductor.ir import ReductionHint | |
from torch._inductor.ir import TileHint | |
from torch._inductor.triton_heuristics import persistent_reduction | |
from torch._inductor.utils import instance_descriptor | |
from torch._dynamo.testing import rand_strided | |
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
import torch | |
from torch._inductor.triton_heuristics import grid | |
@persistent_reduction( | |
size_hints=[256, 16], | |
reduction_hint=ReductionHint.INNER, | |
filename=__file__, | |
meta={'signature': {0: '*fp32', 1: '*fp32', 2: 'i32', 3: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]} | |
) | |
@triton.jit | |
def triton_per_fused_native_batch_norm_backward_threshold_backward_230(in_ptr0, out_ptr0, xnumel, rnumel, XBLOCK : tl.constexpr): | |
xnumel = 192 | |
rnumel = 13 | |
RBLOCK: tl.constexpr = 16 | |
xoffset = tl.program_id(0) * XBLOCK | |
xindex = xoffset + tl.arange(0, XBLOCK)[:, None] | |
xmask = xindex < xnumel | |
rindex = tl.arange(0, RBLOCK)[None, :] | |
rmask = rindex < rnumel | |
r1 = rindex | |
x0 = xindex | |
tmp0 = tl.load(in_ptr0 + (x0 + (192*r1)), rmask & xmask, other=0) | |
tmp2 = tl.where(rmask & xmask, tmp0, 0) | |
tmp3 = tl.sum(tmp2, 1)[:, None] | |
tl.store(out_ptr0 + x0, tmp3, xmask) | |
def get_args(): | |
arg_0 = rand_strided((192, 13), (1, 192), device='cuda:0', dtype=torch.float32) | |
arg_1 = rand_strided((192,), (1,), device='cuda:0', dtype=torch.float32) | |
return arg_0, arg_1, | |
def call(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
stream0 = get_cuda_stream(0) | |
triton_per_fused_native_batch_norm_backward_threshold_backward_230.run(*args, 192, 13, grid=grid(192), stream=stream0) | |
def benchmark_all_configs(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
return triton_per_fused_native_batch_norm_backward_threshold_backward_230.benchmark_all_configs(*args, 192, 13, grid=grid(192)) | |
if __name__ == '__main__': | |
from torch._inductor.utils import get_num_bytes | |
from triton.testing import do_bench | |
args = get_args() | |
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9 | |
gb_per_s = num_gb / (ms / 1e3) | |
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
''') | |
# kernel path: /tmp/torchinductor_shunting/tq/ctqqxyxucpkck2pbfdk6hiij3w3ouutzihnufujtspvbizzcvokl.py | |
# Original ATen: aten._native_batch_norm_legit_functional, aten.native_batch_norm_backward, aten.threshold_backward | |
# aten._native_batch_norm_legit_functional => convert_element_type_12 | |
# aten.native_batch_norm_backward => convert_element_type_796, mul_1164, mul_1172, sub_338, sum_159 | |
# aten.threshold_backward => scalar_tensor, where_4 | |
triton_per_fused__native_batch_norm_legit_functional_native_batch_norm_backward_threshold_backward_231 = async_compile.triton(''' | |
import triton | |
import triton.language as tl | |
from torch._inductor.ir import ReductionHint | |
from torch._inductor.ir import TileHint | |
from torch._inductor.triton_heuristics import persistent_reduction | |
from torch._inductor.utils import instance_descriptor | |
from torch._dynamo.testing import rand_strided | |
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
import torch | |
from torch._inductor.triton_heuristics import grid | |
@persistent_reduction( | |
size_hints=[256, 16], | |
reduction_hint=ReductionHint.INNER, | |
filename=__file__, | |
meta={'signature': {0: '*fp32', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: 'i32', 5: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4), equal_to_1=())]} | |
) | |
@triton.jit | |
def triton_per_fused__native_batch_norm_legit_functional_native_batch_norm_backward_threshold_backward_231(in_ptr0, in_ptr1, out_ptr0, out_ptr1, xnumel, rnumel, XBLOCK : tl.constexpr): | |
xnumel = 192 | |
rnumel = 13 | |
RBLOCK: tl.constexpr = 16 | |
xoffset = tl.program_id(0) * XBLOCK | |
xindex = xoffset + tl.arange(0, XBLOCK)[:, None] | |
xmask = xindex < xnumel | |
rindex = tl.arange(0, RBLOCK)[None, :] | |
rmask = rindex < rnumel | |
r1 = rindex | |
x0 = xindex | |
tmp0 = tl.load(in_ptr0 + (x0 + (192*r1)), rmask & xmask, other=0) | |
tmp4 = tl.load(in_ptr1 + (x0), xmask) | |
tmp2 = tl.where(rmask & xmask, tmp0, 0) | |
tmp3 = tl.sum(tmp2, 1)[:, None] | |
tmp5 = tmp3 * tmp4 | |
tl.store(out_ptr1 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp5, xmask) | |
tl.store(out_ptr0 + x0, tmp3, xmask) | |
def get_args(): | |
arg_0 = rand_strided((192, 13), (1, 192), device='cuda:0', dtype=torch.float32) | |
arg_1 = rand_strided((192,), (1,), device='cuda:0', dtype=torch.float32) | |
arg_2 = rand_strided((192,), (1,), device='cuda:0', dtype=torch.float32) | |
arg_3 = rand_strided((192,), (1,), device='cuda:0', dtype=torch.float32) | |
return arg_0, arg_1, arg_2, arg_3, | |
def call(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
stream0 = get_cuda_stream(0) | |
triton_per_fused__native_batch_norm_legit_functional_native_batch_norm_backward_threshold_backward_231.run(*args, 192, 13, grid=grid(192), stream=stream0) | |
def benchmark_all_configs(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
return triton_per_fused__native_batch_norm_legit_functional_native_batch_norm_backward_threshold_backward_231.benchmark_all_configs(*args, 192, 13, grid=grid(192)) | |
if __name__ == '__main__': | |
from torch._inductor.utils import get_num_bytes | |
from triton.testing import do_bench | |
args = get_args() | |
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9 | |
gb_per_s = num_gb / (ms / 1e3) | |
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
''') | |
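# Kernels 230 and 231 above mirror kernels 222 and 223 for the (192, 13)
# partials: 230 folds sum_dy into grad_bias, 231 folds sum_dy_xmu and also
# multiplies by invstd to produce grad_weight.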
# kernel path: /tmp/torchinductor_shunting/2c/c2cj7sokvc2ucpcatjevxoabxae6fcr5uq7unf2sn2ww5auw7tct.py | |
# Original ATen: aten._native_batch_norm_legit_functional, aten.native_batch_norm_backward, aten.threshold_backward | |
# aten._native_batch_norm_legit_functional => convert_element_type_12 | |
# aten.native_batch_norm_backward => convert_element_type_796, convert_element_type_798, mul_1170, mul_1171, sub_338, sub_340, sub_341 | |
# aten.threshold_backward => scalar_tensor, where_4 | |
triton_poi_fused__native_batch_norm_legit_functional_native_batch_norm_backward_threshold_backward_232 = async_compile.triton(''' | |
import triton | |
import triton.language as tl | |
from torch._inductor.ir import ReductionHint | |
from torch._inductor.ir import TileHint | |
from torch._inductor.triton_heuristics import pointwise | |
from torch._inductor.utils import instance_descriptor | |
from torch._dynamo.testing import rand_strided | |
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
import torch | |
from torch._inductor.triton_heuristics import grid | |
@pointwise(size_hints=[536870912], filename=__file__, meta={'signature': {0: '*fp16', 1: '*i1', 2: '*fp16', 3: '*fp32', 4: '*fp32', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8), equal_to_1=())]}) | |
@triton.jit | |
def triton_poi_fused__native_batch_norm_legit_functional_native_batch_norm_backward_threshold_backward_232(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, in_ptr6, xnumel, XBLOCK : tl.constexpr): | |
xnumel = 308281344 | |
xoffset = tl.program_id(0) * XBLOCK | |
xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
xmask = xindex < xnumel | |
x3 = xindex | |
x1 = (xindex // 12544) % 192 | |
tmp0 = tl.load(in_ptr0 + (x3), None) | |
tmp2 = tl.load(in_out_ptr0 + (x3), None).to(tl.float32) | |
tmp5 = tl.load(in_ptr1 + (x3), None).to(tl.float32) | |
tmp7 = tl.load(in_ptr2 + (x1), None) | |
tmp9 = tl.load(in_ptr3 + (x1), None) | |
tmp12 = tl.load(in_ptr4 + (x1), None) | |
tmp17 = tl.load(in_ptr5 + (x1), None) | |
tmp20 = tl.load(in_ptr6 + (x1), None) | |
tmp1 = 0.0 | |
tmp3 = tl.where(tmp0, tmp1, tmp2) | |
tmp4 = tmp3.to(tl.float32) | |
tmp6 = tmp5.to(tl.float32) | |
tmp8 = tmp6 - tmp7 | |
tmp10 = 6.228077168367346e-07 | |
tmp11 = tmp9 * tmp10 | |
tmp13 = tmp12 * tmp12 | |
tmp14 = tmp11 * tmp13 | |
tmp15 = tmp8 * tmp14 | |
tmp16 = tmp4 - tmp15 | |
tmp18 = tmp17 * tmp10 | |
tmp19 = tmp16 - tmp18 | |
tmp21 = tmp12 * tmp20 | |
tmp22 = tmp19 * tmp21 | |
tmp23 = tmp22.to(tl.float32) | |
tl.store(in_out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp23, None) | |
def get_args(): | |
arg_0 = rand_strided((128, 192, 112, 112), (2408448, 12544, 112, 1), device='cuda:0', dtype=torch.float16) | |
arg_1 = rand_strided((128, 192, 112, 112), (2408448, 12544, 112, 1), device='cuda:0', dtype=torch.bool) | |
arg_2 = rand_strided((128, 192, 112, 112), (2408448, 12544, 112, 1), device='cuda:0', dtype=torch.float16) | |
arg_3 = rand_strided((1, 192, 1, 1), (192, 1, 1, 1), device='cuda:0', dtype=torch.float32) | |
arg_4 = rand_strided((192,), (1,), device='cuda:0', dtype=torch.float32) | |
arg_5 = rand_strided((192,), (1,), device='cuda:0', dtype=torch.float32) | |
arg_6 = rand_strided((192,), (1,), device='cuda:0', dtype=torch.float32) | |
arg_7 = rand_strided((192,), (1,), device='cuda:0', dtype=torch.float32) | |
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7, | |
def call(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
stream0 = get_cuda_stream(0) | |
triton_poi_fused__native_batch_norm_legit_functional_native_batch_norm_backward_threshold_backward_232.run(*args, 308281344, grid=grid(308281344), stream=stream0) | |
def benchmark_all_configs(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
return triton_poi_fused__native_batch_norm_legit_functional_native_batch_norm_backward_threshold_backward_232.benchmark_all_configs(*args, 308281344, grid=grid(308281344)) | |
if __name__ == '__main__': | |
from torch._inductor.utils import get_num_bytes | |
from triton.testing import do_bench | |
args = get_args() | |
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9 | |
gb_per_s = num_gb / (ms / 1e3) | |
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
''') | |
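# Kernel 232 above is the elementwise grad_input step at 112x112 (compare
# kernel 224): the literal 6.228077168367346e-07 is exactly 1/1605632,
# i.e. 1/(128*112*112) elements per channel.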
# kernel path: /tmp/torchinductor_shunting/ug/cugvnwu4vijgcqk6aeukdnlyjpwwrweo5ekg2rzkt3wc2oyg6si6.py | |
# Original ATen: aten._to_copy | |
# aten._to_copy => convert_element_type_799 | |
triton_poi_fused__to_copy_233 = async_compile.triton(''' | |
import triton | |
import triton.language as tl | |
from torch._inductor.ir import ReductionHint | |
from torch._inductor.ir import TileHint | |
from torch._inductor.triton_heuristics import pointwise | |
from torch._inductor.utils import instance_descriptor | |
from torch._dynamo.testing import rand_strided | |
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
import torch | |
from torch._inductor.triton_heuristics import grid | |
@pointwise(size_hints=[2048], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]}) | |
@triton.jit | |
def triton_poi_fused__to_copy_233(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr): | |
xnumel = 1536 | |
xoffset = tl.program_id(0) * XBLOCK | |
xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
xmask = xindex < xnumel | |
x0 = xindex | |
tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32) | |
tmp1 = tmp0.to(tl.float32) | |
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask) | |
def get_args(): | |
arg_0 = rand_strided((96, 16, 1, 1), (16, 1, 1, 1), device='cuda:0', dtype=torch.float16) | |
arg_1 = rand_strided((96, 16, 1, 1), (16, 1, 1, 1), device='cuda:0', dtype=torch.float32) | |
return arg_0, arg_1, | |
def call(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
stream0 = get_cuda_stream(0) | |
triton_poi_fused__to_copy_233.run(*args, 1536, grid=grid(1536), stream=stream0) | |
def benchmark_all_configs(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
return triton_poi_fused__to_copy_233.benchmark_all_configs(*args, 1536, grid=grid(1536)) | |
if __name__ == '__main__': | |
from torch._inductor.utils import get_num_bytes | |
from triton.testing import do_bench | |
args = get_args() | |
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9 | |
gb_per_s = num_gb / (ms / 1e3) | |
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
''') | |
# kernel path: /tmp/torchinductor_shunting/sq/csq2s3gta567xaj4joh64czv7oue4eaxrvun3o6onmtg5rojpoec.py | |
# Original ATen: aten.cat | |
# aten.cat => cat_81 | |
triton_poi_fused_cat_234 = async_compile.triton(''' | |
import triton | |
import triton.language as tl | |
from torch._inductor.ir import ReductionHint | |
from torch._inductor.ir import TileHint | |
from torch._inductor.triton_heuristics import pointwise | |
from torch._inductor.utils import instance_descriptor | |
from torch._dynamo.testing import rand_strided | |
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
import torch | |
from torch._inductor.triton_heuristics import grid | |
@pointwise(size_hints=[33554432], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]}) | |
@triton.jit | |
def triton_poi_fused_cat_234(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr): | |
xnumel = 25690112 | |
xoffset = tl.program_id(0) * XBLOCK | |
xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
xmask = xindex < xnumel | |
x2 = xindex | |
x0 = xindex % 200704 | |
x1 = (xindex // 200704) | |
tmp0 = tl.load(in_ptr0 + (x2), None).to(tl.float32) | |
tl.store(out_ptr0 + (x0 + (401408*x1) + tl.zeros([XBLOCK], tl.int32)), tmp0, None) | |
def get_args(): | |
arg_0 = rand_strided((128, 16, 112, 112), (200704, 12544, 112, 1), device='cuda:0', dtype=torch.float16) | |
arg_1 = rand_strided((128, 16, 112, 112), (401408, 12544, 112, 1), device='cuda:0', dtype=torch.float16) | |
return arg_0, arg_1, | |
def call(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
stream0 = get_cuda_stream(0) | |
triton_poi_fused_cat_234.run(*args, 25690112, grid=grid(25690112), stream=stream0) | |
def benchmark_all_configs(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
return triton_poi_fused_cat_234.benchmark_all_configs(*args, 25690112, grid=grid(25690112)) | |
if __name__ == '__main__': | |
from torch._inductor.utils import get_num_bytes | |
from triton.testing import do_bench | |
args = get_args() | |
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9 | |
gb_per_s = num_gb / (ms / 1e3) | |
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
''') | |
# kernel path: /tmp/torchinductor_shunting/js/cjsqfjzxqv7rsfcedmto3ifg4qqtw6degpzalrk6qlwijztosgeh.py | |
# Original ATen: aten._native_batch_norm_legit_functional, aten.native_batch_norm_backward | |
# aten._native_batch_norm_legit_functional => convert_element_type_8 | |
# aten.native_batch_norm_backward => convert_element_type_801, mul_1173, sub_342, sum_160, sum_161 | |
triton_red_fused__native_batch_norm_legit_functional_native_batch_norm_backward_235 = async_compile.triton(''' | |
import triton | |
import triton.language as tl | |
from torch._inductor.ir import ReductionHint | |
from torch._inductor.ir import TileHint | |
from torch._inductor.triton_heuristics import reduction | |
from torch._inductor.utils import instance_descriptor | |
from torch._dynamo.testing import rand_strided | |
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
import torch | |
from torch._inductor.triton_heuristics import grid | |
@reduction( | |
size_hints=[512, 131072], | |
reduction_hint=ReductionHint.INNER, | |
filename=__file__, | |
meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: 'i32', 6: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6), equal_to_1=())]} | |
) | |
@triton.jit | |
def triton_red_fused__native_batch_norm_legit_functional_native_batch_norm_backward_235(in_ptr0, in_ptr1, in_ptr2, out_ptr0, out_ptr1, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr): | |
xnumel = 448 | |
rnumel = 114688 | |
xoffset = tl.program_id(0) * XBLOCK | |
xindex = xoffset + tl.arange(0, XBLOCK)[:, None] | |
xmask = xindex < xnumel | |
rbase = tl.arange(0, RBLOCK)[None, :] | |
x0 = xindex % 32 | |
x1 = (xindex // 32) | |
_tmp2 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0 | |
x3 = xindex | |
tmp5 = tl.load(in_ptr2 + (x0), xmask) | |
_tmp8 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0 | |
for roffset in range(0, rnumel, RBLOCK): | |
rindex = roffset + rbase | |
rmask = rindex < rnumel | |
r2 = rindex | |
tmp0 = tl.load(in_ptr0 + ((112*(((r2 + (114688*x1)) // 112) % 112)) + (12544*x0) + (401408*((r2 + (114688*x1)) // 12544)) + (r2 % 112)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32) | |
tmp3 = tl.load(in_ptr1 + ((112*(((r2 + (114688*x1)) // 112) % 112)) + (12544*x0) + (401408*((r2 + (114688*x1)) // 12544)) + (r2 % 112)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32) | |
tmp1 = tmp0.to(tl.float32) | |
_tmp2 = tl.where(rmask & xmask, _tmp2 + tmp1, _tmp2) | |
tmp4 = tmp3.to(tl.float32) | |
tmp6 = tmp4 - tmp5 | |
tmp7 = tmp1 * tmp6 | |
_tmp8 = tl.where(rmask & xmask, _tmp8 + tmp7, _tmp8) | |
tmp2 = tl.sum(_tmp2, 1)[:, None] | |
tl.store(out_ptr0 + x3, tmp2, xmask) | |
tmp8 = tl.sum(_tmp8, 1)[:, None] | |
tl.store(out_ptr1 + x3, tmp8, xmask) | |
def get_args(): | |
arg_0 = rand_strided((128, 32, 112, 112), (401408, 12544, 112, 1), device='cuda:0', dtype=torch.float16) | |
arg_1 = rand_strided((128, 32, 112, 112), (401408, 12544, 112, 1), device='cuda:0', dtype=torch.float16) | |
arg_2 = rand_strided((1, 32, 1, 1), (32, 1, 1, 1), device='cuda:0', dtype=torch.float32) | |
arg_3 = rand_strided((32, 14), (1, 32), device='cuda:0', dtype=torch.float32) | |
arg_4 = rand_strided((32, 14), (1, 32), device='cuda:0', dtype=torch.float32) | |
return arg_0, arg_1, arg_2, arg_3, arg_4, | |
def call(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
stream0 = get_cuda_stream(0) | |
triton_red_fused__native_batch_norm_legit_functional_native_batch_norm_backward_235.run(*args, 448, 114688, grid=grid(448), stream=stream0) | |
def benchmark_all_configs(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
return triton_red_fused__native_batch_norm_legit_functional_native_batch_norm_backward_235.benchmark_all_configs(*args, 448, 114688, grid=grid(448)) | |
if __name__ == '__main__': | |
from torch._inductor.utils import get_num_bytes | |
from triton.testing import do_bench | |
args = get_args() | |
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9 | |
gb_per_s = num_gb / (ms / 1e3) | |
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
''') | |
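# Kernel 235 above computes the same sum_dy / sum_dy_xmu pair for a
# 32-channel layer, this time with no ReLU mask (no threshold_backward in
# the fusion). The 1605632 elements per channel split evenly into 14 chunks
# of rnumel = 114688, so no bound check is needed; partials are (32, 14).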
# kernel path: /tmp/torchinductor_shunting/un/cunmdmnzzky4hpfyncovvbl6z7364avkwmbstijt5sdqy6c3sndo.py | |
# Original ATen: aten.native_batch_norm_backward | |
# aten.native_batch_norm_backward => convert_element_type_801, sum_160 | |
triton_per_fused_native_batch_norm_backward_236 = async_compile.triton(''' | |
import triton | |
import triton.language as tl | |
from torch._inductor.ir import ReductionHint | |
from torch._inductor.ir import TileHint | |
from torch._inductor.triton_heuristics import persistent_reduction | |
from torch._inductor.utils import instance_descriptor | |
from torch._dynamo.testing import rand_strided | |
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
import torch | |
from torch._inductor.triton_heuristics import grid | |
@persistent_reduction( | |
size_hints=[32, 16], | |
reduction_hint=ReductionHint.INNER, | |
filename=__file__, | |
meta={'signature': {0: '*fp32', 1: '*fp32', 2: 'i32', 3: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]} | |
) | |
@triton.jit | |
def triton_per_fused_native_batch_norm_backward_236(in_ptr0, out_ptr0, xnumel, rnumel, XBLOCK : tl.constexpr): | |
xnumel = 32 | |
rnumel = 14 | |
RBLOCK: tl.constexpr = 16 | |
xoffset = tl.program_id(0) * XBLOCK | |
xindex = xoffset + tl.arange(0, XBLOCK)[:, None] | |
xmask = xindex < xnumel | |
rindex = tl.arange(0, RBLOCK)[None, :] | |
rmask = rindex < rnumel | |
r1 = rindex | |
x0 = xindex | |
tmp0 = tl.load(in_ptr0 + (x0 + (32*r1)), rmask & xmask, other=0) | |
tmp2 = tl.where(rmask & xmask, tmp0, 0) | |
tmp3 = tl.sum(tmp2, 1)[:, None] | |
tl.store(out_ptr0 + x0, tmp3, xmask) | |
def get_args(): | |
arg_0 = rand_strided((32, 14), (1, 32), device='cuda:0', dtype=torch.float32) | |
arg_1 = rand_strided((32,), (1,), device='cuda:0', dtype=torch.float32) | |
return arg_0, arg_1, | |
def call(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
stream0 = get_cuda_stream(0) | |
triton_per_fused_native_batch_norm_backward_236.run(*args, 32, 14, grid=grid(32), stream=stream0) | |
def benchmark_all_configs(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
return triton_per_fused_native_batch_norm_backward_236.benchmark_all_configs(*args, 32, 14, grid=grid(32)) | |
if __name__ == '__main__': | |
from torch._inductor.utils import get_num_bytes | |
from triton.testing import do_bench | |
args = get_args() | |
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9 | |
gb_per_s = num_gb / (ms / 1e3) | |
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
''') | |
# kernel path: /tmp/torchinductor_shunting/wf/cwf5g4pysq3gqzxlojvd577tupirripwoebyow33p2p7ylbn53lm.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.native_batch_norm_backward
# aten._native_batch_norm_legit_functional => convert_element_type_8
# aten.native_batch_norm_backward => convert_element_type_801, mul_1173, mul_1181, sub_342, sum_161
triton_per_fused__native_batch_norm_legit_functional_native_batch_norm_backward_237 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import persistent_reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@persistent_reduction(
    size_hints=[32, 16],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*fp32', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: 'i32', 5: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4), equal_to_1=())]}
)
@triton.jit
def triton_per_fused__native_batch_norm_legit_functional_native_batch_norm_backward_237(in_ptr0, in_ptr1, out_ptr0, out_ptr1, xnumel, rnumel, XBLOCK : tl.constexpr):
    xnumel = 32
    rnumel = 14
    RBLOCK: tl.constexpr = 16
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rindex = tl.arange(0, RBLOCK)[None, :]
    rmask = rindex < rnumel
    r1 = rindex
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0 + (32*r1)), rmask & xmask, other=0)
    tmp4 = tl.load(in_ptr1 + (x0), xmask)
    tmp2 = tl.where(rmask & xmask, tmp0, 0)
    tmp3 = tl.sum(tmp2, 1)[:, None]
    tmp5 = tmp3 * tmp4
    tl.store(out_ptr1 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp5, xmask)
    tl.store(out_ptr0 + x0, tmp3, xmask)
def get_args():
    arg_0 = rand_strided((32, 14), (1, 32), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((32,), (1,), device='cuda:0', dtype=torch.float32)
    arg_2 = rand_strided((32,), (1,), device='cuda:0', dtype=torch.float32)
    arg_3 = rand_strided((32,), (1,), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1, arg_2, arg_3,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_per_fused__native_batch_norm_legit_functional_native_batch_norm_backward_237.run(*args, 32, 14, grid=grid(32), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_per_fused__native_batch_norm_legit_functional_native_batch_norm_backward_237.benchmark_all_configs(*args, 32, 14, grid=grid(32))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
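# Editor's sketch (not generated code): kernel 237 is the same partial-sum
# reduction as kernel 236, fused with a per-channel rescale. out_ptr0 receives
# the raw sum and out_ptr1 receives sum * in_ptr1 (`w` below is a stand-in
# name for that per-channel factor).
def _editor_sketch_237():
    import torch
    x = torch.randn(14, 32, device='cuda').t()  # (32, 14) partial sums
    w = torch.randn(32, device='cuda')          # per-channel factor (in_ptr1)
    s = x.sum(dim=1)
    return s, s * w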
# kernel path: /tmp/torchinductor_shunting/6m/c6msee73vpt32tetzwls5fkhzvwfsszmuikkahpidywnybj6hbvg.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.convolution_backward, aten.native_batch_norm_backward
# aten._native_batch_norm_legit_functional => convert_element_type_8
# aten.convolution_backward => convolution_backward_152
# aten.native_batch_norm_backward => convert_element_type_801, convert_element_type_803, mul_1179, mul_1180, sub_342, sub_344, sub_345
triton_poi_fused__native_batch_norm_legit_functional_convolution_backward_native_batch_norm_backward_238 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[67108864], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp32', 6: '*fp32', 7: '*fp16', 8: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8), equal_to_1=())]})
@triton.jit
def triton_poi_fused__native_batch_norm_legit_functional_convolution_backward_native_batch_norm_backward_238(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, in_ptr6, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 51380224
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x3 = xindex
    x1 = (xindex // 12544) % 32
    tmp0 = tl.load(in_ptr0 + (x3), None).to(tl.float32)
    tmp2 = tl.load(in_ptr1 + (x3), None).to(tl.float32)
    tmp4 = tl.load(in_ptr2 + (x1), None)
    tmp6 = tl.load(in_ptr3 + (x1), None)
    tmp9 = tl.load(in_ptr4 + (x1), None)
    tmp14 = tl.load(in_ptr5 + (x1), None)
    tmp17 = tl.load(in_ptr6 + (x1), None)
    tmp1 = tmp0.to(tl.float32)
    tmp3 = tmp2.to(tl.float32)
    tmp5 = tmp3 - tmp4
    tmp7 = 6.228077168367346e-07
    tmp8 = tmp6 * tmp7
    tmp10 = tmp9 * tmp9
    tmp11 = tmp8 * tmp10
    tmp12 = tmp5 * tmp11
    tmp13 = tmp1 - tmp12
    tmp15 = tmp14 * tmp7
    tmp16 = tmp13 - tmp15
    tmp18 = tmp9 * tmp17
    tmp19 = tmp16 * tmp18
    tmp20 = tmp19.to(tl.float32)
    tl.store(out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp20, None)
def get_args():
    arg_0 = rand_strided((128, 32, 112, 112), (401408, 12544, 112, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 32, 112, 112), (401408, 12544, 112, 1), device='cuda:0', dtype=torch.float16)
    arg_2 = rand_strided((1, 32, 1, 1), (32, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    arg_3 = rand_strided((32,), (1,), device='cuda:0', dtype=torch.float32)
    arg_4 = rand_strided((32,), (1,), device='cuda:0', dtype=torch.float32)
    arg_5 = rand_strided((32,), (1,), device='cuda:0', dtype=torch.float32)
    arg_6 = rand_strided((32,), (1,), device='cuda:0', dtype=torch.float32)
    arg_7 = rand_strided((128, 32, 112, 112), (401408, 12544, 112, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__native_batch_norm_legit_functional_convolution_backward_native_batch_norm_backward_238.run(*args, 51380224, grid=grid(51380224), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__native_batch_norm_legit_functional_convolution_backward_native_batch_norm_backward_238.benchmark_all_configs(*args, 51380224, grid=grid(51380224))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
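# Editor's sketch (not generated code): kernel 238 is the elementwise part of
# batch-norm backward. Reading the tmp* chain above as a formula:
#     dx = (dy - (x - mean) * invstd**2 * sum_dy_xmu / N - sum_dy / N) * invstd * gamma
# The literal 6.228077168367346e-07 is 1/N for the N = 128 * 112 * 112
# elements reduced per channel; the check below confirms it.
def _editor_sketch_238():
    N = 128 * 112 * 112  # 1605632 reduced elements per channel
    assert abs(1.0 / N - 6.228077168367346e-07) < 1e-15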
# kernel path: /tmp/torchinductor_shunting/7k/c7krxyxxql7bxontztrlvaggrh6mqywuks2m4idv23yuou4gz2wy.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_804
triton_poi_fused__to_copy_239 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[1024], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_239(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 1024
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
    arg_0 = rand_strided((32, 32, 1, 1), (32, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((32, 32, 1, 1), (32, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_239.run(*args, 1024, grid=grid(1024), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_239.benchmark_all_configs(*args, 1024, grid=grid(1024))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
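# Editor's sketch (not generated code): kernel 239 (and the structurally
# identical kernels 242 and 245 below, which differ only in element count and
# shape) is a plain fp16 -> fp32 cast of a convolution weight gradient,
# equivalent to:
def _editor_sketch_239():
    import torch
    w16 = torch.randn(32, 32, 1, 1, device='cuda', dtype=torch.float16)
    return w16.to(torch.float32)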
# kernel path: /tmp/torchinductor_shunting/7e/c7ep5i7oybdvegdm7cwjtpx47vlwexnhlgzvv4hni2dhifla6ilf.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.native_batch_norm_backward, aten.threshold_backward
# aten._native_batch_norm_legit_functional => convert_element_type_5
# aten.native_batch_norm_backward => convert_element_type_805, mul_1182, sub_346, sum_162, sum_163
# aten.threshold_backward => le_5, scalar_tensor, where_5
triton_red_fused__native_batch_norm_legit_functional_native_batch_norm_backward_threshold_backward_240 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@reduction(
    size_hints=[512, 131072],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp32', 4: '*fp32', 5: '*fp32', 6: 'i32', 7: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_native_batch_norm_backward_threshold_backward_240(in_ptr0, in_ptr1, in_ptr2, in_ptr3, out_ptr0, out_ptr1, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
    xnumel = 448
    rnumel = 114688
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rbase = tl.arange(0, RBLOCK)[None, :]
    x0 = xindex % 32
    x1 = (xindex // 32)
    _tmp6 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
    x3 = xindex
    tmp9 = tl.load(in_ptr3 + (x0), xmask)
    _tmp12 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
    for roffset in range(0, rnumel, RBLOCK):
        rindex = roffset + rbase
        rmask = rindex < rnumel
        r2 = rindex
        tmp0 = tl.load(in_ptr0 + ((112*(((r2 + (114688*x1)) // 112) % 112)) + (12544*x0) + (401408*((r2 + (114688*x1)) // 12544)) + (r2 % 112)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp3 = tl.load(in_ptr1 + ((112*(((r2 + (114688*x1)) // 112) % 112)) + (12544*x0) + (401408*((r2 + (114688*x1)) // 12544)) + (r2 % 112)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp7 = tl.load(in_ptr2 + ((112*(((r2 + (114688*x1)) // 112) % 112)) + (12544*x0) + (401408*((r2 + (114688*x1)) // 12544)) + (r2 % 112)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp1 = 0.0
        tmp2 = tmp0 <= tmp1
        tmp4 = tl.where(tmp2, tmp1, tmp3)
        tmp5 = tmp4.to(tl.float32)
        _tmp6 = tl.where(rmask & xmask, _tmp6 + tmp5, _tmp6)
        tmp8 = tmp7.to(tl.float32)
        tmp10 = tmp8 - tmp9
        tmp11 = tmp5 * tmp10
        _tmp12 = tl.where(rmask & xmask, _tmp12 + tmp11, _tmp12)
    tmp6 = tl.sum(_tmp6, 1)[:, None]
    tl.store(out_ptr0 + x3, tmp6, xmask)
    tmp12 = tl.sum(_tmp12, 1)[:, None]
    tl.store(out_ptr1 + x3, tmp12, xmask)
def get_args():
    arg_0 = rand_strided((128, 32, 112, 112), (401408, 12544, 112, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 32, 112, 112), (401408, 12544, 112, 1), device='cuda:0', dtype=torch.float16)
    arg_2 = rand_strided((128, 32, 112, 112), (401408, 12544, 112, 1), device='cuda:0', dtype=torch.float16)
    arg_3 = rand_strided((1, 32, 1, 1), (32, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    arg_4 = rand_strided((32, 14), (1, 32), device='cuda:0', dtype=torch.float32)
    arg_5 = rand_strided((32, 14), (1, 32), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_red_fused__native_batch_norm_legit_functional_native_batch_norm_backward_threshold_backward_240.run(*args, 448, 114688, grid=grid(448), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_red_fused__native_batch_norm_legit_functional_native_batch_norm_backward_threshold_backward_240.benchmark_all_configs(*args, 448, 114688, grid=grid(448))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
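# Editor's sketch (not generated code): kernel 240 fuses ReLU backward
# (threshold_backward) into the two batch-norm backward reductions. Each
# program owns one (channel, row-block) pair and reduces 114688 elements; the
# 14 row-blocks per channel form the (32, 14) partial buffers that kernels
# 236/237 above later collapse. Eager equivalent, with the batch shrunk from
# 128 to 8 purely to keep the sketch light:
def _editor_sketch_240():
    import torch
    act = torch.randn(8, 32, 112, 112, device='cuda', dtype=torch.float16)
    dy = torch.randn_like(act)
    x = torch.randn_like(act)
    mean = torch.randn(1, 32, 1, 1, device='cuda', dtype=torch.float32)
    g = dy.masked_fill(act <= 0, 0).float()                   # where_5 / le_5
    sum_dy = g.sum(dim=(0, 2, 3))                             # -> out_ptr0
    sum_dy_xmu = (g * (x.float() - mean)).sum(dim=(0, 2, 3))  # -> out_ptr1
    return sum_dy, sum_dy_xmu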
# kernel path: /tmp/torchinductor_shunting/xb/cxbvbwtgotbwspd422aktzvto6a6qirjkhi57yyazyi4ebmvxve3.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.convolution_backward, aten.native_batch_norm_backward, aten.threshold_backward
# aten._native_batch_norm_legit_functional => convert_element_type_5
# aten.convolution_backward => convolution_backward_153
# aten.native_batch_norm_backward => convert_element_type_805, convert_element_type_807, mul_1188, mul_1189, sub_346, sub_348, sub_349
# aten.threshold_backward => le_5, scalar_tensor, where_5
triton_poi_fused__native_batch_norm_legit_functional_convolution_backward_native_batch_norm_backward_threshold_backward_241 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[67108864], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp32', 4: '*fp32', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8), equal_to_1=())]})
@triton.jit
def triton_poi_fused__native_batch_norm_legit_functional_convolution_backward_native_batch_norm_backward_threshold_backward_241(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, in_ptr6, xnumel, XBLOCK : tl.constexpr):
    xnumel = 51380224
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x3 = xindex
    x1 = (xindex // 12544) % 32
    tmp0 = tl.load(in_ptr0 + (x3), None).to(tl.float32)
    tmp3 = tl.load(in_out_ptr0 + (x3), None).to(tl.float32)
    tmp6 = tl.load(in_ptr1 + (x3), None).to(tl.float32)
    tmp8 = tl.load(in_ptr2 + (x1), None)
    tmp10 = tl.load(in_ptr3 + (x1), None)
    tmp13 = tl.load(in_ptr4 + (x1), None)
    tmp18 = tl.load(in_ptr5 + (x1), None)
    tmp21 = tl.load(in_ptr6 + (x1), None)
    tmp1 = 0.0
    tmp2 = tmp0 <= tmp1
    tmp4 = tl.where(tmp2, tmp1, tmp3)
    tmp5 = tmp4.to(tl.float32)
    tmp7 = tmp6.to(tl.float32)
    tmp9 = tmp7 - tmp8
    tmp11 = 6.228077168367346e-07
    tmp12 = tmp10 * tmp11
    tmp14 = tmp13 * tmp13
    tmp15 = tmp12 * tmp14
    tmp16 = tmp9 * tmp15
    tmp17 = tmp5 - tmp16
    tmp19 = tmp18 * tmp11
    tmp20 = tmp17 - tmp19
    tmp22 = tmp13 * tmp21
    tmp23 = tmp20 * tmp22
    tmp24 = tmp23.to(tl.float32)
    tl.store(in_out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp24, None)
def get_args():
    arg_0 = rand_strided((128, 32, 112, 112), (401408, 12544, 112, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 32, 112, 112), (401408, 12544, 112, 1), device='cuda:0', dtype=torch.float16)
    arg_2 = rand_strided((128, 32, 112, 112), (401408, 12544, 112, 1), device='cuda:0', dtype=torch.float16)
    arg_3 = rand_strided((1, 32, 1, 1), (32, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    arg_4 = rand_strided((32,), (1,), device='cuda:0', dtype=torch.float32)
    arg_5 = rand_strided((32,), (1,), device='cuda:0', dtype=torch.float32)
    arg_6 = rand_strided((32,), (1,), device='cuda:0', dtype=torch.float32)
    arg_7 = rand_strided((32,), (1,), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__native_batch_norm_legit_functional_convolution_backward_native_batch_norm_backward_threshold_backward_241.run(*args, 51380224, grid=grid(51380224), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__native_batch_norm_legit_functional_convolution_backward_native_batch_norm_backward_threshold_backward_241.benchmark_all_configs(*args, 51380224, grid=grid(51380224))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
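# Editor's note (not generated code): unlike kernel 238, kernel 241 updates
# in_out_ptr0 in place (see 'mutated_arg_names' above), so its __main__
# harness passes num_in_out_args=1 to get_num_bytes; as I read that helper,
# the first argument is then counted as both a read and a write when the
# bandwidth figure is estimated.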
# kernel path: /tmp/torchinductor_shunting/oj/cojqompm7misslrpvtl7ybsy7rsgvmn4juo7mjxlwkes4u6yrrna.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_808
triton_poi_fused__to_copy_242 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[512], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_242(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 288
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
    arg_0 = rand_strided((32, 1, 3, 3), (9, 9, 3, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((32, 1, 3, 3), (9, 9, 3, 1), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_242.run(*args, 288, grid=grid(288), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_242.benchmark_all_configs(*args, 288, grid=grid(288))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/w4/cw4gnxluz6sduqkpdtk54qml7vehohzccmsuenbbqypvp5lck7fy.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.add, aten.native_batch_norm_backward, aten.threshold_backward
# aten._native_batch_norm_legit_functional => convert_element_type_2
# aten.add => add_381
# aten.native_batch_norm_backward => convert_element_type_809, mul_1191, sub_350, sum_164, sum_165
# aten.threshold_backward => le_6, scalar_tensor, where_6
triton_red_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_threshold_backward_243 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@reduction(
    size_hints=[512, 131072],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp16', 4: '*fp32', 5: '*fp32', 6: '*fp32', 7: 'i32', 8: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_threshold_backward_243(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, out_ptr0, out_ptr1, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
    xnumel = 448
    rnumel = 114688
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rbase = tl.arange(0, RBLOCK)[None, :]
    x0 = xindex % 32
    x1 = (xindex // 32)
    _tmp8 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
    x3 = xindex
    tmp11 = tl.load(in_ptr4 + (x0), xmask)
    _tmp14 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
    for roffset in range(0, rnumel, RBLOCK):
        rindex = roffset + rbase
        rmask = rindex < rnumel
        r2 = rindex
        tmp0 = tl.load(in_ptr0 + ((112*(((r2 + (114688*x1)) // 112) % 112)) + (12544*x0) + (401408*((r2 + (114688*x1)) // 12544)) + (r2 % 112)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp3 = tl.load(in_ptr1 + ((112*(((r2 + (114688*x1)) // 112) % 112)) + (12544*x0) + (401408*((r2 + (114688*x1)) // 12544)) + (r2 % 112)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp4 = tl.load(in_ptr2 + ((112*(((r2 + (114688*x1)) // 112) % 112)) + (12544*x0) + (401408*((r2 + (114688*x1)) // 12544)) + (r2 % 112)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp9 = tl.load(in_ptr3 + ((112*(((r2 + (114688*x1)) // 112) % 112)) + (12544*x0) + (401408*((r2 + (114688*x1)) // 12544)) + (r2 % 112)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp1 = 0.0
        tmp2 = tmp0 <= tmp1
        tmp5 = tmp3 + tmp4
        tmp6 = tl.where(tmp2, tmp1, tmp5)
        tmp7 = tmp6.to(tl.float32)
        _tmp8 = tl.where(rmask & xmask, _tmp8 + tmp7, _tmp8)
        tmp10 = tmp9.to(tl.float32)
        tmp12 = tmp10 - tmp11
        tmp13 = tmp7 * tmp12
        _tmp14 = tl.where(rmask & xmask, _tmp14 + tmp13, _tmp14)
    tmp8 = tl.sum(_tmp8, 1)[:, None]
    tl.store(out_ptr0 + x3, tmp8, xmask)
    tmp14 = tl.sum(_tmp14, 1)[:, None]
    tl.store(out_ptr1 + x3, tmp14, xmask)
def get_args():
    arg_0 = rand_strided((128, 32, 112, 112), (401408, 12544, 112, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 32, 112, 112), (401408, 12544, 112, 1), device='cuda:0', dtype=torch.float16)
    arg_2 = rand_strided((128, 32, 112, 112), (401408, 12544, 112, 1), device='cuda:0', dtype=torch.float16)
    arg_3 = rand_strided((128, 32, 112, 112), (401408, 12544, 112, 1), device='cuda:0', dtype=torch.float16)
    arg_4 = rand_strided((1, 32, 1, 1), (32, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    arg_5 = rand_strided((32, 14), (1, 32), device='cuda:0', dtype=torch.float32)
    arg_6 = rand_strided((32, 14), (1, 32), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_red_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_threshold_backward_243.run(*args, 448, 114688, grid=grid(448), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_red_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_threshold_backward_243.benchmark_all_configs(*args, 448, 114688, grid=grid(448))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
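# Editor's sketch (not generated code): kernel 243 is kernel 240 plus a fused
# residual add (aten.add => add_381): two incoming gradient branches are
# summed before the ReLU mask is applied and the two reductions run. Batch
# shrunk from 128 to 8 purely to keep the sketch light.
def _editor_sketch_243():
    import torch
    act = torch.randn(8, 32, 112, 112, device='cuda', dtype=torch.float16)
    dy1 = torch.randn_like(act)
    dy2 = torch.randn_like(act)
    g = (dy1 + dy2).masked_fill(act <= 0, 0).float()
    return g.sum(dim=(0, 2, 3))  # -> out_ptr0; out_ptr1 adds the (x - mean) weighting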
# kernel path: /tmp/torchinductor_shunting/ob/cobmv47m4hftiufftthbknhbqe67o3mathld7aplwt4zh3s3vn2t.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.add, aten.convolution_backward, aten.native_batch_norm_backward, aten.threshold_backward
# aten._native_batch_norm_legit_functional => convert_element_type_2
# aten.add => add_381
# aten.convolution_backward => convolution_backward_154
# aten.native_batch_norm_backward => convert_element_type_809, convert_element_type_811, mul_1197, mul_1198, sub_350, sub_352, sub_353
# aten.threshold_backward => le_6, scalar_tensor, where_6
triton_poi_fused__native_batch_norm_legit_functional_add_convolution_backward_native_batch_norm_backward_threshold_backward_244 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[67108864], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp16', 4: '*fp32', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: '*fp32', 9: '*fp16', 10: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10), equal_to_1=())]})
@triton.jit
def triton_poi_fused__native_batch_norm_legit_functional_add_convolution_backward_native_batch_norm_backward_threshold_backward_244(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, in_ptr6, in_ptr7, in_ptr8, out_ptr1, xnumel, XBLOCK : tl.constexpr):
    xnumel = 51380224
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x3 = xindex
    x1 = (xindex // 12544) % 32
    tmp0 = tl.load(in_ptr0 + (x3), None).to(tl.float32)
    tmp3 = tl.load(in_ptr1 + (x3), None).to(tl.float32)
    tmp4 = tl.load(in_ptr2 + (x3), None).to(tl.float32)
    tmp8 = tl.load(in_ptr3 + (x3), None).to(tl.float32)
    tmp10 = tl.load(in_ptr4 + (x1), None)
    tmp12 = tl.load(in_ptr5 + (x1), None)
    tmp15 = tl.load(in_ptr6 + (x1), None)
    tmp20 = tl.load(in_ptr7 + (x1), None)
    tmp23 = tl.load(in_ptr8 + (x1), None)
    tmp1 = 0.0
    tmp2 = tmp0 <= tmp1
    tmp5 = tmp3 + tmp4
    tmp6 = tl.where(tmp2, tmp1, tmp5)
    tmp7 = tmp6.to(tl.float32)
    tmp9 = tmp8.to(tl.float32)
    tmp11 = tmp9 - tmp10
    tmp13 = 6.228077168367346e-07
    tmp14 = tmp12 * tmp13
    tmp16 = tmp15 * tmp15
    tmp17 = tmp14 * tmp16
    tmp18 = tmp11 * tmp17
    tmp19 = tmp7 - tmp18
    tmp21 = tmp20 * tmp13
    tmp22 = tmp19 - tmp21
    tmp24 = tmp15 * tmp23
    tmp25 = tmp22 * tmp24
    tmp26 = tmp25.to(tl.float32)
    tl.store(out_ptr1 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp26, None)
def get_args():
    arg_0 = rand_strided((128, 32, 112, 112), (401408, 12544, 112, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 32, 112, 112), (401408, 12544, 112, 1), device='cuda:0', dtype=torch.float16)
    arg_2 = rand_strided((128, 32, 112, 112), (401408, 12544, 112, 1), device='cuda:0', dtype=torch.float16)
    arg_3 = rand_strided((128, 32, 112, 112), (401408, 12544, 112, 1), device='cuda:0', dtype=torch.float16)
    arg_4 = rand_strided((1, 32, 1, 1), (32, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    arg_5 = rand_strided((32,), (1,), device='cuda:0', dtype=torch.float32)
    arg_6 = rand_strided((32,), (1,), device='cuda:0', dtype=torch.float32)
    arg_7 = rand_strided((32,), (1,), device='cuda:0', dtype=torch.float32)
    arg_8 = rand_strided((32,), (1,), device='cuda:0', dtype=torch.float32)
    arg_9 = rand_strided((128, 32, 112, 112), (401408, 12544, 112, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7, arg_8, arg_9,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__native_batch_norm_legit_functional_add_convolution_backward_native_batch_norm_backward_threshold_backward_244.run(*args, 51380224, grid=grid(51380224), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__native_batch_norm_legit_functional_add_convolution_backward_native_batch_norm_backward_threshold_backward_244.benchmark_all_configs(*args, 51380224, grid=grid(51380224))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/gd/cgdntdeldc5fbhs5emwmc54vd4hg2eeljdsb6fcby2wncgyzpb5r.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_812
triton_poi_fused__to_copy_245 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[1024], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_245(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 864
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
    arg_0 = rand_strided((32, 3, 3, 3), (27, 9, 3, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((32, 3, 3, 3), (27, 9, 3, 1), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_245.run(*args, 864, grid=grid(864), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_245.benchmark_all_configs(*args, 864, grid=grid(864))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
async_compile.wait(globals())
del async_compile
def call(args):
    (primals_1, primals_3, primals_5, primals_7, primals_9, primals_11, primals_13, primals_15, primals_17, primals_19, primals_21, primals_23, primals_25, primals_27, primals_29, primals_31, primals_33, primals_35, primals_37, primals_39, primals_41, primals_43, primals_45, primals_47, primals_49, primals_51, primals_53, primals_55, primals_57, primals_59, primals_61, primals_63, primals_65, primals_67, primals_69, primals_71, primals_73, primals_75, primals_77, primals_79, primals_81, primals_83, primals_85, primals_87, primals_89, primals_91, primals_93, primals_95, primals_97, primals_99, primals_101, primals_103, primals_105, primals_107, primals_109, primals_111, primals_113, primals_115, convert_element_type, convert_element_type_1, convolution, squeeze_1, relu, convert_element_type_4, convolution_1, squeeze_4, relu_1, convert_element_type_7, convolution_2, squeeze_7, getitem_6, getitem_7, convert_element_type_10, convert_element_type_11, cat, squeeze_10, convert_element_type_14, getitem_13, convert_element_type_15, getitem_17, convert_element_type_16, getitem_21, cat_1, squeeze_13, convert_element_type_19, getitem_26, convert_element_type_20, getitem_29, cat_2, squeeze_16, getitem_32, getitem_33, convert_element_type_23, convert_element_type_24, cat_3, squeeze_19, relu_4, convert_element_type_27, convolution_12, squeeze_22, convert_element_type_30, getitem_40, convert_element_type_31, getitem_43, cat_4, squeeze_25, add_46, convert_element_type_34, convolution_15, squeeze_28, convert_element_type_39, getitem_52, convert_element_type_40, getitem_57, convert_element_type_41, getitem_62, convert_element_type_42, getitem_67, cat_5, squeeze_31, convert_element_type_44, mean, convert_element_type_48, convolution_20, convert_element_type_50, convert_element_type_52, convolution_21, mul_80, convert_element_type_53, convolution_22, squeeze_34, getitem_72, getitem_73, convert_element_type_56, convert_element_type_57, cat_6, squeeze_37, convert_element_type_62, getitem_78, convert_element_type_63, getitem_81, cat_7, squeeze_40, convert_element_type_65, mean_1, convert_element_type_69, convolution_27, convert_element_type_71, convert_element_type_73, convolution_28, getitem_84, getitem_85, convert_element_type_74, convert_element_type_75, cat_8, squeeze_43, getitem_88, getitem_89, convert_element_type_78, convert_element_type_79, cat_9, squeeze_46, convert_element_type_84, getitem_94, convert_element_type_85, getitem_97, cat_10, squeeze_49, convert_element_type_87, mean_2, convert_element_type_91, convolution_35, convert_element_type_93, convert_element_type_95, convolution_36, getitem_100, getitem_101, convert_element_type_96, convert_element_type_97, cat_11, squeeze_52, getitem_104, getitem_105, convert_element_type_100, convert_element_type_101, cat_12, squeeze_55, convert_element_type_106, getitem_110, convert_element_type_107, getitem_113, cat_13, squeeze_58, convert_element_type_109, mean_3, convert_element_type_113, convolution_43, convert_element_type_115, convert_element_type_117, convolution_44, getitem_116, getitem_117, convert_element_type_118, convert_element_type_119, cat_14, squeeze_61, add_109, convert_element_type_122, convolution_47, squeeze_64, convert_element_type_127, getitem_125, convert_element_type_128, getitem_129, convert_element_type_129, getitem_133, cat_15, squeeze_67, convert_element_type_131, mean_4, convert_element_type_135, convolution_51, convert_element_type_137, convert_element_type_139, convolution_52, mul_180, convert_element_type_140, convolution_53,
    squeeze_70, getitem_138, getitem_139, convert_element_type_143, convert_element_type_144, cat_16, squeeze_73, convert_element_type_149, getitem_146, convert_element_type_150, getitem_151, convert_element_type_151, getitem_156, convert_element_type_152, getitem_161, cat_17, squeeze_76, convert_element_type_154, mean_5, convert_element_type_158, convolution_60, convert_element_type_160, convert_element_type_162, convolution_61, getitem_164, getitem_165, convert_element_type_163, convert_element_type_164, cat_18, squeeze_79, getitem_168, getitem_169, convert_element_type_167, convert_element_type_168, cat_19, squeeze_82, convert_element_type_173, getitem_176, convert_element_type_174, getitem_181, convert_element_type_175, getitem_186, convert_element_type_176, getitem_191, cat_20, squeeze_85, convert_element_type_178, mean_6, convert_element_type_182, convolution_70, convert_element_type_184, convert_element_type_186, convolution_71, getitem_194, getitem_195, convert_element_type_187, convert_element_type_188, cat_21, squeeze_88, getitem_198, getitem_199, convert_element_type_191, convert_element_type_192, cat_22, squeeze_91, convert_element_type_197, getitem_206, convert_element_type_198, getitem_211, convert_element_type_199, getitem_216, convert_element_type_200, getitem_221, cat_23, squeeze_94, convert_element_type_202, mean_7, convert_element_type_206, convolution_80, convert_element_type_208, convert_element_type_210, convolution_81, getitem_224, getitem_225, convert_element_type_211, convert_element_type_212, cat_24, squeeze_97, add_172, convert_element_type_215, convolution_84, squeeze_100, convert_element_type_219, convert_element_type_220, convolution_85, squeeze_103, convert_element_type_222, mean_8, convert_element_type_226, convolution_86, convert_element_type_228, convert_element_type_230, convolution_87, mul_280, convert_element_type_231, convolution_88, squeeze_106, getitem_234, getitem_235, convert_element_type_234, convert_element_type_235, cat_25, squeeze_109, convert_element_type_240, getitem_242, convert_element_type_241, getitem_247, convert_element_type_242, getitem_252, convert_element_type_243, getitem_257, cat_26, squeeze_112, convert_element_type_245, mean_9, convert_element_type_249, convolution_95, convert_element_type_251, convert_element_type_253, convolution_96, getitem_260, getitem_261, convert_element_type_254, convert_element_type_255, cat_27, squeeze_115, getitem_264, getitem_265, convert_element_type_258, convert_element_type_259, cat_28, squeeze_118, convert_element_type_264, getitem_272, convert_element_type_265, getitem_277, convert_element_type_266, getitem_282, convert_element_type_267, getitem_287, cat_29, squeeze_121, convert_element_type_269, mean_10, convert_element_type_273, convolution_105, convert_element_type_275, convert_element_type_277, convolution_106, getitem_290, getitem_291, convert_element_type_278, convert_element_type_279, cat_30, squeeze_124, getitem_294, getitem_295, convert_element_type_282, convert_element_type_283, cat_31, squeeze_127, convert_element_type_288, getitem_302, convert_element_type_289, getitem_307, convert_element_type_290, getitem_312, convert_element_type_291, getitem_317, cat_32, squeeze_130, convert_element_type_293, mean_11, convert_element_type_297, convolution_115, convert_element_type_299, convert_element_type_301, convolution_116, getitem_320, getitem_321, convert_element_type_302, convert_element_type_303, cat_33, squeeze_133, add_235, convert_element_type_306, convolution_119, squeeze_136,
    convert_element_type_311, getitem_330, convert_element_type_312, getitem_335, convert_element_type_313, getitem_340, convert_element_type_314, getitem_345, cat_34, squeeze_139, convert_element_type_316, mean_12, convert_element_type_320, convolution_124, convert_element_type_322, convert_element_type_324, convolution_125, mul_380, convert_element_type_325, convolution_126, squeeze_142, convert_element_type_327, convert_element_type_328, convolution_127, squeeze_145, convert_element_type_333, getitem_356, convert_element_type_334, getitem_361, convert_element_type_335, getitem_366, convert_element_type_336, getitem_371, cat_35, squeeze_148, convert_element_type_338, mean_13, convert_element_type_342, convolution_132, convert_element_type_344, convert_element_type_346, convolution_133, getitem_374, getitem_375, convert_element_type_347, convert_element_type_348, cat_36, squeeze_151, add_266, convert_element_type_351, convolution_136, squeeze_154, convert_element_type_356, getitem_384, convert_element_type_357, getitem_389, convert_element_type_358, getitem_394, convert_element_type_359, getitem_399, cat_37, squeeze_157, convert_element_type_361, mean_14, convert_element_type_365, convolution_141, convert_element_type_367, convert_element_type_369, convolution_142, getitem_402, getitem_403, convert_element_type_370, convert_element_type_371, cat_38, squeeze_160, add_282, convert_element_type_374, convolution_145, squeeze_163, convert_element_type_379, getitem_412, convert_element_type_380, getitem_417, convert_element_type_381, getitem_422, convert_element_type_382, getitem_427, cat_39, squeeze_166, convert_element_type_384, mean_15, convert_element_type_388, convolution_150, convert_element_type_390, convert_element_type_392, convolution_151, getitem_430, getitem_431, convert_element_type_393, convert_element_type_394, cat_40, squeeze_169, add_298, convert_element_type_397, convolution_154, squeeze_172, view, permute_1, le, unsqueeze_234, unsqueeze_246, unsqueeze_258, mul_508, unsqueeze_270, unsqueeze_282, unsqueeze_294, mul_548, unsqueeze_306, unsqueeze_318, unsqueeze_330, mul_588, unsqueeze_342, unsqueeze_354, unsqueeze_366, mul_628, unsqueeze_378, unsqueeze_390, unsqueeze_402, mul_668, unsqueeze_414, unsqueeze_426, unsqueeze_438, mul_708, unsqueeze_450, unsqueeze_462, unsqueeze_474, mul_748, unsqueeze_486, unsqueeze_498, unsqueeze_510, mul_788, unsqueeze_522, unsqueeze_534, unsqueeze_546, mul_828, unsqueeze_558, unsqueeze_570, unsqueeze_582, mul_868, unsqueeze_594, unsqueeze_606, unsqueeze_618, mul_908, unsqueeze_630, unsqueeze_642, unsqueeze_654, mul_948, unsqueeze_666, unsqueeze_678, unsqueeze_690, mul_988, unsqueeze_702, unsqueeze_714, unsqueeze_726, mul_1028, unsqueeze_738, unsqueeze_750, unsqueeze_762, mul_1068, unsqueeze_774, unsqueeze_786, unsqueeze_798, mul_1108, unsqueeze_810, unsqueeze_822, le_1, unsqueeze_834, unsqueeze_846, unsqueeze_858, le_3, unsqueeze_870, le_4, unsqueeze_882, unsqueeze_894, unsqueeze_906, unsqueeze_918, tangents_1, tangents_2, tangents_3, tangents_4, tangents_5, tangents_6, tangents_7, tangents_8, tangents_9, tangents_10, tangents_11, tangents_12, tangents_13, tangents_14, tangents_15, tangents_16, tangents_17, tangents_18, tangents_19, tangents_20, tangents_21, tangents_22, tangents_23, tangents_24, tangents_25, tangents_26, tangents_27, tangents_28, tangents_29, tangents_30, tangents_31, tangents_32, tangents_33, tangents_34, tangents_35, tangents_36, tangents_37, tangents_38, tangents_39, tangents_40, tangents_41, tangents_42, tangents_43, tangents_44,
    tangents_45, tangents_46, tangents_47, tangents_48, tangents_49, tangents_50, tangents_51, tangents_52, tangents_53, tangents_54, tangents_55, tangents_56, tangents_57, tangents_58, tangents_59, tangents_60, tangents_61, tangents_62, tangents_63, tangents_64, tangents_65, tangents_66, tangents_67, tangents_68, tangents_69, tangents_70, tangents_71, tangents_72, tangents_73, tangents_74, tangents_75, tangents_76, tangents_77, tangents_78, tangents_79, tangents_80, tangents_81, tangents_82, tangents_83, tangents_84, tangents_85, tangents_86, tangents_87, tangents_88, tangents_89, tangents_90, tangents_91, tangents_92, tangents_93, tangents_94, tangents_95, tangents_96, tangents_97, tangents_98, tangents_99, tangents_100, tangents_101, tangents_102, tangents_103, tangents_104, tangents_105, tangents_106, tangents_107, tangents_108, tangents_109, tangents_110, tangents_111, tangents_112, tangents_113, tangents_114, tangents_115, tangents_116, tangents_117, tangents_118, tangents_119, tangents_120, tangents_121, tangents_122, tangents_123, tangents_124, tangents_125, tangents_126, tangents_127, tangents_128, tangents_129, tangents_130, tangents_131, tangents_132, tangents_133, tangents_134, tangents_135, tangents_136, tangents_137, tangents_138, tangents_139, tangents_140, tangents_141, tangents_142, tangents_143, tangents_144, tangents_145, tangents_146, tangents_147, tangents_148, tangents_149, tangents_150, tangents_151, tangents_152, tangents_153, tangents_154, tangents_155, tangents_156, tangents_157, tangents_158, tangents_159, tangents_160, tangents_161, tangents_162, tangents_163, tangents_164, tangents_165, tangents_166, tangents_167, tangents_168, tangents_169, tangents_170, tangents_171, tangents_172, tangents_173, tangents_174, tangents_175) = args
    args.clear()
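    # Editor's note (not generated code): emptying the caller's list drops the
    # last outside references to these tensors, so each buffer can actually be
    # freed as soon as the matching `del` below runs.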
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0) # no-op to ensure context
        buf0 = empty_strided((128, 1536), (1536, 1), device='cuda', dtype=torch.float16)
        extern_kernels.mm(tangents_175, permute_1, out=buf0)
        del permute_1
        buf1 = empty_strided((1000, 1536), (1536, 1), device='cuda', dtype=torch.float16)
        extern_kernels.mm(as_strided(tangents_175, (1000, 128), (1, 1000)), view, out=buf1)
        del view
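        # Editor's note (not generated code): these two mm calls are the
        # backward of the final linear layer. buf0 = dY @ W is the gradient
        # w.r.t. the classifier input, and buf1 = dY.T @ X (via the transposed
        # as_strided view of tangents_175) is the weight gradient; the column
        # sum computed into buf4 just below is the bias gradient.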
        buf4 = empty_strided((1000, ), (1, ), device='cuda', dtype=torch.float32)
        stream0 = get_cuda_stream(0)
        triton_red_fused__to_copy_sum_0.run(tangents_175, buf4, 1000, 128, grid=grid(1000), stream=stream0)
        del tangents_175
        buf3 = empty_strided((1000, 1536), (1536, 1), device='cuda', dtype=torch.float32)
        triton_poi_fused__to_copy_1.run(buf1, buf3, 1536000, grid=grid(1536000), stream=stream0)
        del buf1
        buf5 = empty_strided((1536, ), (1, ), device='cuda', dtype=torch.float32)
        buf6 = empty_strided((1536, ), (1, ), device='cuda', dtype=torch.float32)
        buf7 = empty_strided((1536, ), (1, ), device='cuda', dtype=torch.float32)
        triton_red_fused__native_batch_norm_legit_functional_div_native_batch_norm_backward_threshold_backward_2.run(le, buf0, convolution_154, unsqueeze_234, squeeze_172, buf5, buf6, buf7, 1536, 6272, grid=grid(1536), stream=stream0)
        buf8 = empty_strided((128, 1536, 7, 7), (75264, 49, 7, 1), device='cuda', dtype=torch.float16)
        triton_poi_fused__native_batch_norm_legit_functional_convolution_backward_div_native_batch_norm_backward_threshold_backward_3.run(le, buf0, convolution_154, unsqueeze_234, buf6, squeeze_172, buf5, primals_115, buf8, 9633792, grid=grid(9633792), stream=stream0)
        del buf0
        del convolution_154
        del le
        del primals_115
        del squeeze_172
        del unsqueeze_234
        buf9 = aten.convolution_backward(buf8, add_298, convert_element_type_397, [0], [1, 1], [0, 0], [1, 1], False, [0, 0], 1, [True, True, False])
        del add_298
        del buf8
        del convert_element_type_397
        buf10 = buf9[0]
        assert_size_stride(buf10, (128, 264, 7, 7), (12936, 49, 7, 1))
        buf11 = buf9[1]
        assert_size_stride(buf11, (1536, 264, 1, 1), (264, 1, 1, 1))
        del buf9
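        # Editor's note (not generated code): the last list argument of
        # aten.convolution_backward, [True, True, False], is the output mask:
        # compute grad_input and grad_weight, skip grad_bias (these convs are
        # bias-free). The assert_size_stride calls then pin down the exact
        # layouts the surrounding Triton kernels were specialized against.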
        buf12 = empty_strided((1536, 264, 1, 1), (264, 1, 1, 1), device='cuda', dtype=torch.float32)
        triton_poi_fused__to_copy_4.run(buf11, buf12, 405504, grid=grid(405504), stream=stream0)
        del buf11
        buf13 = empty_strided((264, ), (1, ), device='cuda', dtype=torch.float32)
        buf14 = empty_strided((264, ), (1, ), device='cuda', dtype=torch.float32)
        buf15 = empty_strided((264, ), (1, ), device='cuda', dtype=torch.float32)
        triton_red_fused__native_batch_norm_legit_functional_native_batch_norm_backward_5.run(buf10, cat_40, unsqueeze_246, squeeze_169, buf13, buf14, buf15, 264, 6272, grid=grid(264), stream=stream0)
        buf16 = empty_strided((128, 264, 7, 7), (12936, 49, 7, 1), device='cuda', dtype=torch.float16)
        triton_poi_fused__native_batch_norm_legit_functional_native_batch_norm_backward_6.run(buf10, cat_40, unsqueeze_246, buf14, squeeze_169, buf13, primals_113, buf16, 1655808, grid=grid(1655808), stream=stream0)
        del cat_40
        del primals_113
        del squeeze_169
        del unsqueeze_246
        buf17 = aten.convolution_backward(as_strided(buf16, (128, 132, 7, 7), (12936, 49, 7, 1), 6468), getitem_431, convert_element_type_394, [0], [1, 1], [0, 0], [1, 1], False, [0, 0], 1, [True, True, False])
        del convert_element_type_394
        del getitem_431
        buf18 = buf17[0]
        assert_size_stride(buf18, (128, 792, 7, 7), (38808, 49, 7, 1))
        buf19 = buf17[1]
        assert_size_stride(buf19, (132, 792, 1, 1), (792, 1, 1, 1))
        del buf17
        buf20 = empty_strided((132, 792, 1, 1), (792, 1, 1, 1), device='cuda', dtype=torch.float32)
        triton_poi_fused__to_copy_7.run(buf19, buf20, 104544, grid=grid(104544), stream=stream0)
        del buf19
        buf21 = aten.convolution_backward(as_strided(buf16, (128, 132, 7, 7), (12936, 49, 7, 1)), getitem_430, convert_element_type_393, [0], [1, 1], [0, 0], [1, 1], False, [0, 0], 1, [True, True, False])
        del convert_element_type_393
        del getitem_430
        buf22 = buf21[0]
        assert_size_stride(buf22, (128, 792, 7, 7), (38808, 49, 7, 1))
        buf23 = buf21[1]
        assert_size_stride(buf23, (132, 792, 1, 1), (792, 1, 1, 1))
        del buf21
        buf24 = empty_strided((132, 792, 1, 1), (792, 1, 1, 1), device='cuda', dtype=torch.float32)
        triton_poi_fused__to_copy_7.run(buf23, buf24, 104544, grid=grid(104544), stream=stream0)
        del buf23
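        # Editor's note (not generated code): the two as_strided views of buf16
        # (storage offset 6468 = 132*49, then offset 0) address the two
        # 132-channel halves of the 264-channel gradient; this is the backward
        # of a channel-wise cat in the forward pass, one convolution_backward
        # per branch.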
        buf27 = empty_strided((128, 1584, 7, 7), (77616, 49, 7, 1), device='cuda', dtype=torch.float16)
        buf25 = as_strided(buf27, (128, 792, 7, 7), (77616, 49, 7, 1)) # alias
        triton_poi_fused_cat_8.run(buf22, buf25, 4967424, grid=grid(4967424), stream=stream0)
        del buf22
        buf26 = as_strided(buf27, (128, 792, 7, 7), (77616, 49, 7, 1), 38808) # alias
        triton_poi_fused_cat_9.run(buf18, buf26, 4967424, grid=grid(4967424), stream=stream0)
        del buf18
        buf28 = empty_strided((128, 1584, 1, 1), (1584, 1, 202752, 202752), device='cuda', dtype=torch.float16)
        buf29 = as_strided(buf28, (128, 1584, 1, 1), (1584, 1, 1, 1)); del buf28 # reuse
        triton_per_fused_mul_sigmoid_sigmoid_backward_silu_sum_10.run(buf29, buf27, convert_element_type_384, convolution_151, 202752, 49, grid=grid(202752), stream=stream0)
        del buf25
        del buf26
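        # Editor's note (not generated code): buf29 reinterprets buf28's
        # storage with contiguous strides and `del buf28 # reuse` retires the
        # old name; Inductor emits these alias/reuse pairs so peak memory stays
        # flat while what looks like a squeeze-excitation branch reduces the
        # (128, 1584, 7, 7) gradient down to per-channel (128, 1584, 1, 1).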
buf35 = empty_strided((1584, ), (1, ), device='cuda', dtype=torch.float32) | |
triton_per_fused__to_copy_convolution_backward_11.run(buf29, buf35, 1584, 128, grid=grid(1584), stream=stream0) | |
buf31 = aten.convolution_backward(buf29, convert_element_type_390, convert_element_type_392, [1584], [1, 1], [0, 0], [1, 1], False, [0, 0], 1, [True, True, False]) | |
del buf29 | |
del convert_element_type_390 | |
del convert_element_type_392 | |
buf32 = buf31[0] | |
assert_size_stride(buf32, (128, 132, 1, 1), (132, 1, 1, 1)) | |
buf33 = buf31[1] | |
assert_size_stride(buf33, (1584, 132, 1, 1), (132, 1, 1, 1)) | |
del buf31 | |
buf34 = empty_strided((1584, 132, 1, 1), (132, 1, 1, 1), device='cuda', dtype=torch.float32) | |
triton_poi_fused__to_copy_12.run(buf33, buf34, 209088, grid=grid(209088), stream=stream0) | |
del buf33 | |
buf37 = buf32; del buf32 # reuse | |
triton_poi_fused_add_clone_fill_mul_sigmoid_sub_13.run(buf37, convolution_150, 16896, grid=grid(16896), stream=stream0) | |
del convolution_150 | |
buf43 = empty_strided((132, ), (1, ), device='cuda', dtype=torch.float32) | |
triton_per_fused__to_copy_convolution_backward_14.run(buf37, buf43, 132, 128, grid=grid(132), stream=stream0) | |
buf39 = aten.convolution_backward(buf37, mean_15, convert_element_type_388, [132], [1, 1], [0, 0], [1, 1], False, [0, 0], 1, [True, True, False]) | |
del buf37 | |
del convert_element_type_388 | |
del mean_15 | |
buf40 = buf39[0] | |
assert_size_stride(buf40, (128, 1584, 1, 1), (1584, 1, 1, 1)) | |
buf41 = buf39[1] | |
assert_size_stride(buf41, (132, 1584, 1, 1), (1584, 1, 1, 1)) | |
del buf39 | |
buf42 = empty_strided((132, 1584, 1, 1), (1584, 1, 1, 1), device='cuda', dtype=torch.float32) | |
triton_poi_fused__to_copy_15.run(buf41, buf42, 209088, grid=grid(209088), stream=stream0) | |
del buf41 | |
buf45 = empty_strided((1584, ), (1, ), device='cuda', dtype=torch.float32) | |
buf46 = empty_strided((1584, ), (1, ), device='cuda', dtype=torch.float32) | |
buf48 = empty_strided((1584, ), (1, ), device='cuda', dtype=torch.float32) | |
triton_red_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_16.run(buf27, convolution_151, buf40, convert_element_type_384, cat_39, unsqueeze_258, squeeze_166, buf45, buf46, buf48, 1584, 6272, grid=grid(1584), stream=stream0) | |
buf47 = empty_strided((128, 1584, 7, 7), (77616, 49, 7, 1), device='cuda', dtype=torch.float32) | |
triton_poi_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_17.run(buf27, convolution_151, buf40, convert_element_type_384, cat_39, unsqueeze_258, buf46, squeeze_166, buf45, buf47, 9934848, grid=grid(9934848), stream=stream0) | |
del cat_39 | |
del convert_element_type_384 | |
del convolution_151 | |
del unsqueeze_258 | |
buf49 = empty_strided((128, 396, 7, 7), (19404, 49, 7, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused_convolution_backward_18.run(buf47, squeeze_166, primals_111, buf49, 2483712, grid=grid(2483712), stream=stream0) | |
buf50 = aten.convolution_backward(buf49, getitem_427, convert_element_type_382, [0], [1, 1], [4, 4], [1, 1], False, [0, 0], 396, [True, True, False]) | |
del convert_element_type_382 | |
del getitem_427 | |
buf51 = buf50[0] | |
assert_size_stride(buf51, (128, 396, 7, 7), (19404, 49, 7, 1)) | |
buf52 = buf50[1] | |
assert_size_stride(buf52, (396, 1, 9, 9), (81, 81, 9, 1)) | |
del buf50 | |
buf53 = empty_strided((396, 1, 9, 9), (81, 81, 9, 1), device='cuda', dtype=torch.float32) | |
triton_poi_fused__to_copy_19.run(buf52, buf53, 32076, grid=grid(32076), stream=stream0) | |
del buf52 | |
buf54 = buf49; del buf49 # reuse | |
triton_poi_fused_convolution_backward_20.run(buf47, squeeze_166, primals_111, buf54, 2483712, grid=grid(2483712), stream=stream0) | |
buf55 = aten.convolution_backward(buf54, getitem_422, convert_element_type_381, [0], [1, 1], [3, 3], [1, 1], False, [0, 0], 396, [True, True, False]) | |
del convert_element_type_381 | |
del getitem_422 | |
buf56 = buf55[0] | |
assert_size_stride(buf56, (128, 396, 7, 7), (19404, 49, 7, 1)) | |
buf57 = buf55[1] | |
assert_size_stride(buf57, (396, 1, 7, 7), (49, 49, 7, 1)) | |
del buf55 | |
buf58 = empty_strided((396, 1, 7, 7), (49, 49, 7, 1), device='cuda', dtype=torch.float32) | |
triton_poi_fused__to_copy_21.run(buf57, buf58, 19404, grid=grid(19404), stream=stream0) | |
del buf57 | |
buf59 = buf54; del buf54 # reuse | |
triton_poi_fused_convolution_backward_22.run(buf47, squeeze_166, primals_111, buf59, 2483712, grid=grid(2483712), stream=stream0) | |
buf60 = aten.convolution_backward(buf59, getitem_417, convert_element_type_380, [0], [1, 1], [2, 2], [1, 1], False, [0, 0], 396, [True, True, False]) | |
del convert_element_type_380 | |
del getitem_417 | |
buf61 = buf60[0] | |
assert_size_stride(buf61, (128, 396, 7, 7), (19404, 49, 7, 1)) | |
buf62 = buf60[1] | |
assert_size_stride(buf62, (396, 1, 5, 5), (25, 25, 5, 1)) | |
del buf60 | |
buf63 = empty_strided((396, 1, 5, 5), (25, 25, 5, 1), device='cuda', dtype=torch.float32) | |
triton_poi_fused__to_copy_23.run(buf62, buf63, 9900, grid=grid(9900), stream=stream0) | |
del buf62 | |
buf64 = buf59; del buf59 # reuse | |
triton_poi_fused_convolution_backward_24.run(buf47, squeeze_166, primals_111, buf64, 2483712, grid=grid(2483712), stream=stream0) | |
del primals_111 | |
del squeeze_166 | |
buf65 = aten.convolution_backward(buf64, getitem_412, convert_element_type_379, [0], [1, 1], [1, 1], [1, 1], False, [0, 0], 396, [True, True, False]) | |
del buf64 | |
del convert_element_type_379 | |
del getitem_412 | |
buf66 = buf65[0] | |
assert_size_stride(buf66, (128, 396, 7, 7), (19404, 49, 7, 1)) | |
buf67 = buf65[1] | |
assert_size_stride(buf67, (396, 1, 3, 3), (9, 9, 3, 1)) | |
del buf65 | |
buf68 = empty_strided((396, 1, 3, 3), (9, 9, 3, 1), device='cuda', dtype=torch.float32) | |
triton_poi_fused__to_copy_25.run(buf67, buf68, 3564, grid=grid(3564), stream=stream0) | |
del buf67 | |
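# The concat of the four 396-channel gradient chunks never runs a real cat:
# buf73 is one (128, 1584, 7, 7) allocation and buf69..buf72 are as_strided
# views into it at element offsets 0 / 19404 / 38808 / 58212 (one chunk is
# 396*7*7 = 19404 elements), which the triton_poi_fused_cat_* kernels then
# fill in place. A minimal sketch of the same trick, with hypothetical shapes,
# for illustration only:
#
#   part0 = torch.randn((2, 4, 4, 4), device='cuda')
#   part1 = torch.randn((2, 4, 4, 4), device='cuda')
#   base = torch.empty((2, 8, 4, 4), device='cuda')   # strides (128, 16, 4, 1)
#   lo = base.as_strided((2, 4, 4, 4), (128, 16, 4, 1), 0)
#   hi = base.as_strided((2, 4, 4, 4), (128, 16, 4, 1), 64)  # 4 channels * 16
#   lo.copy_(part0); hi.copy_(part1)
#   assert torch.equal(base, torch.cat([part0, part1], 1))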
buf73 = buf27; del buf27 # reuse | |
buf69 = as_strided(buf73, (128, 396, 7, 7), (77616, 49, 7, 1)) # alias | |
triton_poi_fused_cat_26.run(buf66, buf69, 2483712, grid=grid(2483712), stream=stream0) | |
del buf66 | |
buf70 = as_strided(buf73, (128, 396, 7, 7), (77616, 49, 7, 1), 19404) # alias | |
triton_poi_fused_cat_27.run(buf61, buf70, 2483712, grid=grid(2483712), stream=stream0) | |
del buf61 | |
buf71 = as_strided(buf73, (128, 396, 7, 7), (77616, 49, 7, 1), 38808) # alias | |
triton_poi_fused_cat_27.run(buf56, buf71, 2483712, grid=grid(2483712), stream=stream0) | |
del buf56 | |
buf72 = as_strided(buf73, (128, 396, 7, 7), (77616, 49, 7, 1), 58212) # alias | |
triton_poi_fused_cat_27.run(buf51, buf72, 2483712, grid=grid(2483712), stream=stream0) | |
buf74 = buf46; del buf46 # reuse | |
buf75 = empty_strided((1584, ), (1, ), device='cuda', dtype=torch.float32) | |
buf76 = empty_strided((1584, ), (1, ), device='cuda', dtype=torch.float32) | |
triton_red_fused__native_batch_norm_legit_functional_mul_native_batch_norm_backward_28.run(buf73, mul_508, convolution_145, unsqueeze_270, squeeze_163, buf74, buf75, buf76, 1584, 6272, grid=grid(1584), stream=stream0) | |
del buf69 | |
del buf70 | |
del buf71 | |
del buf72 | |
buf77 = buf73; del buf73 # reuse | |
triton_poi_fused__native_batch_norm_legit_functional_convolution_backward_mul_native_batch_norm_backward_29.run(buf77, mul_508, convolution_145, unsqueeze_270, buf75, squeeze_163, buf74, primals_109, 9934848, grid=grid(9934848), stream=stream0) | |
del convolution_145 | |
del mul_508 | |
del primals_109 | |
del squeeze_163 | |
del unsqueeze_270 | |
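# buf77 now carries the gradient w.r.t. the 1x1 expansion conv's output (the
# batch-norm backward above was fused with the elementwise mul on mul_508);
# the convolution_backward below pushes it to the 264-channel block input
# (add_282) and to the (1584, 264, 1, 1) expansion weight.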
buf78 = aten.convolution_backward(buf77, add_282, convert_element_type_374, [0], [1, 1], [0, 0], [1, 1], False, [0, 0], 1, [True, True, False]) | |
del add_282 | |
del convert_element_type_374 | |
buf79 = buf78[0] | |
assert_size_stride(buf79, (128, 264, 7, 7), (12936, 49, 7, 1)) | |
buf80 = buf78[1] | |
assert_size_stride(buf80, (1584, 264, 1, 1), (264, 1, 1, 1)) | |
del buf78 | |
buf81 = empty_strided((1584, 264, 1, 1), (264, 1, 1, 1), device='cuda', dtype=torch.float32) | |
triton_poi_fused__to_copy_30.run(buf80, buf81, 418176, grid=grid(418176), stream=stream0) | |
del buf80 | |
buf82 = buf14; del buf14 # reuse | |
buf83 = empty_strided((264, ), (1, ), device='cuda', dtype=torch.float32) | |
buf84 = empty_strided((264, ), (1, ), device='cuda', dtype=torch.float32) | |
triton_red_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_31.run(buf10, buf79, cat_38, unsqueeze_282, squeeze_160, buf82, buf83, buf84, 264, 6272, grid=grid(264), stream=stream0) | |
buf85 = buf16; del buf16 # reuse | |
triton_poi_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_32.run(buf10, buf79, cat_38, unsqueeze_282, buf83, squeeze_160, buf82, primals_107, buf85, 1655808, grid=grid(1655808), stream=stream0) | |
del cat_38 | |
del primals_107 | |
del squeeze_160 | |
del unsqueeze_282 | |
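# The forward pointwise projection was evidently computed as two independent
# 1x1 convs over a channel split, so the backward issues two
# convolution_backward calls fed by as_strided slices of buf85: channels
# 132:264 (storage offset 6468 = 132*7*7) first, then channels 0:132.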
buf86 = aten.convolution_backward(as_strided(buf85, (128, 132, 7, 7), (12936, 49, 7, 1), 6468), getitem_403, convert_element_type_371, [0], [1, 1], [0, 0], [1, 1], False, [0, 0], 1, [True, True, False]) | |
del convert_element_type_371 | |
del getitem_403 | |
buf87 = buf86[0] | |
assert_size_stride(buf87, (128, 792, 7, 7), (38808, 49, 7, 1)) | |
buf88 = buf86[1] | |
assert_size_stride(buf88, (132, 792, 1, 1), (792, 1, 1, 1)) | |
del buf86 | |
buf89 = empty_strided((132, 792, 1, 1), (792, 1, 1, 1), device='cuda', dtype=torch.float32) | |
triton_poi_fused__to_copy_7.run(buf88, buf89, 104544, grid=grid(104544), stream=stream0) | |
del buf88 | |
buf90 = aten.convolution_backward(as_strided(buf85, (128, 132, 7, 7), (12936, 49, 7, 1)), getitem_402, convert_element_type_370, [0], [1, 1], [0, 0], [1, 1], False, [0, 0], 1, [True, True, False]) | |
del convert_element_type_370 | |
del getitem_402 | |
buf91 = buf90[0] | |
assert_size_stride(buf91, (128, 792, 7, 7), (38808, 49, 7, 1)) | |
buf92 = buf90[1] | |
assert_size_stride(buf92, (132, 792, 1, 1), (792, 1, 1, 1)) | |
del buf90 | |
buf93 = empty_strided((132, 792, 1, 1), (792, 1, 1, 1), device='cuda', dtype=torch.float32) | |
triton_poi_fused__to_copy_7.run(buf92, buf93, 104544, grid=grid(104544), stream=stream0) | |
del buf92 | |
buf96 = buf77; del buf77 # reuse | |
buf94 = as_strided(buf96, (128, 792, 7, 7), (77616, 49, 7, 1)) # alias | |
triton_poi_fused_cat_8.run(buf91, buf94, 4967424, grid=grid(4967424), stream=stream0) | |
del buf91 | |
buf95 = as_strided(buf96, (128, 792, 7, 7), (77616, 49, 7, 1), 38808) # alias | |
triton_poi_fused_cat_9.run(buf87, buf95, 4967424, grid=grid(4967424), stream=stream0) | |
del buf87 | |
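# Squeeze-and-Excitation backward starts here: the
# triton_per_fused_mul_sigmoid_sigmoid_backward_silu_sum kernel reduces the
# 49 spatial positions of each (sample, channel) pair (202752 = 128*1584 rows)
# to get the gradient at the SE gate, with the sigmoid-backward factor
# s * (1 - s), where s = sigmoid(convolution_142), fused into the same pass.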
buf97 = as_strided(buf40, (128, 1584, 1, 1), (1584, 1, 202752, 202752)); del buf40 # reuse | |
buf98 = as_strided(buf97, (128, 1584, 1, 1), (1584, 1, 1, 1)); del buf97 # reuse | |
triton_per_fused_mul_sigmoid_sigmoid_backward_silu_sum_10.run(buf98, buf96, convert_element_type_361, convolution_142, 202752, 49, grid=grid(202752), stream=stream0) | |
del buf94 | |
del buf95 | |
buf104 = buf75; del buf75 # reuse | |
triton_per_fused__to_copy_convolution_backward_11.run(buf98, buf104, 1584, 128, grid=grid(1584), stream=stream0) | |
buf100 = aten.convolution_backward(buf98, convert_element_type_367, convert_element_type_369, [1584], [1, 1], [0, 0], [1, 1], False, [0, 0], 1, [True, True, False]) | |
del buf98 | |
del convert_element_type_367 | |
del convert_element_type_369 | |
buf101 = buf100[0] | |
assert_size_stride(buf101, (128, 132, 1, 1), (132, 1, 1, 1)) | |
buf102 = buf100[1] | |
assert_size_stride(buf102, (1584, 132, 1, 1), (132, 1, 1, 1)) | |
del buf100 | |
buf103 = empty_strided((1584, 132, 1, 1), (132, 1, 1, 1), device='cuda', dtype=torch.float32) | |
triton_poi_fused__to_copy_12.run(buf102, buf103, 209088, grid=grid(209088), stream=stream0) | |
del buf102 | |
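# triton_poi_fused_add_clone_fill_mul_sigmoid_sub_13 is consistent with the
# SiLU derivative applied in place to the SE bottleneck gradient,
#   d/dx silu(x) = sigmoid(x) * (1 + x * (1 - sigmoid(x))),
# with convolution_141 supplying the forward pre-activation x.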
buf106 = buf101; del buf101 # reuse | |
triton_poi_fused_add_clone_fill_mul_sigmoid_sub_13.run(buf106, convolution_141, 16896, grid=grid(16896), stream=stream0) | |
del convolution_141 | |
buf112 = empty_strided((132, ), (1, ), device='cuda', dtype=torch.float32) | |
triton_per_fused__to_copy_convolution_backward_14.run(buf106, buf112, 132, 128, grid=grid(132), stream=stream0) | |
buf108 = aten.convolution_backward(buf106, mean_14, convert_element_type_365, [132], [1, 1], [0, 0], [1, 1], False, [0, 0], 1, [True, True, False]) | |
del buf106 | |
del convert_element_type_365 | |
del mean_14 | |
buf109 = buf108[0] | |
assert_size_stride(buf109, (128, 1584, 1, 1), (1584, 1, 1, 1)) | |
buf110 = buf108[1] | |
assert_size_stride(buf110, (132, 1584, 1, 1), (1584, 1, 1, 1)) | |
del buf108 | |
buf111 = empty_strided((132, 1584, 1, 1), (1584, 1, 1, 1), device='cuda', dtype=torch.float32) | |
triton_poi_fused__to_copy_15.run(buf110, buf111, 209088, grid=grid(209088), stream=stream0) | |
del buf110 | |
buf114 = empty_strided((1584, ), (1, ), device='cuda', dtype=torch.float32) | |
buf115 = empty_strided((1584, ), (1, ), device='cuda', dtype=torch.float32) | |
buf117 = empty_strided((1584, ), (1, ), device='cuda', dtype=torch.float32) | |
triton_red_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_16.run(buf96, convolution_142, buf109, convert_element_type_361, cat_37, unsqueeze_294, squeeze_157, buf114, buf115, buf117, 1584, 6272, grid=grid(1584), stream=stream0) | |
buf116 = buf47; del buf47 # reuse | |
triton_poi_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_17.run(buf96, convolution_142, buf109, convert_element_type_361, cat_37, unsqueeze_294, buf115, squeeze_157, buf114, buf116, 9934848, grid=grid(9934848), stream=stream0) | |
del cat_37 | |
del convert_element_type_361 | |
del convolution_142 | |
del unsqueeze_294 | |
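# Same mixed-kernel (9/7/5/3) depthwise backward as above, repeated for the
# previous block in the stage, operating on the convolution_142 / cat_37
# generation of saved forward tensors.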
buf118 = buf51; del buf51 # reuse | |
triton_poi_fused_convolution_backward_18.run(buf116, squeeze_157, primals_105, buf118, 2483712, grid=grid(2483712), stream=stream0) | |
buf119 = aten.convolution_backward(buf118, getitem_399, convert_element_type_359, [0], [1, 1], [4, 4], [1, 1], False, [0, 0], 396, [True, True, False]) | |
del convert_element_type_359 | |
del getitem_399 | |
buf120 = buf119[0] | |
assert_size_stride(buf120, (128, 396, 7, 7), (19404, 49, 7, 1)) | |
buf121 = buf119[1] | |
assert_size_stride(buf121, (396, 1, 9, 9), (81, 81, 9, 1)) | |
del buf119 | |
buf122 = empty_strided((396, 1, 9, 9), (81, 81, 9, 1), device='cuda', dtype=torch.float32) | |
triton_poi_fused__to_copy_19.run(buf121, buf122, 32076, grid=grid(32076), stream=stream0) | |
del buf121 | |
buf123 = buf118; del buf118 # reuse | |
triton_poi_fused_convolution_backward_20.run(buf116, squeeze_157, primals_105, buf123, 2483712, grid=grid(2483712), stream=stream0) | |
buf124 = aten.convolution_backward(buf123, getitem_394, convert_element_type_358, [0], [1, 1], [3, 3], [1, 1], False, [0, 0], 396, [True, True, False]) | |
del convert_element_type_358 | |
del getitem_394 | |
buf125 = buf124[0] | |
assert_size_stride(buf125, (128, 396, 7, 7), (19404, 49, 7, 1)) | |
buf126 = buf124[1] | |
assert_size_stride(buf126, (396, 1, 7, 7), (49, 49, 7, 1)) | |
del buf124 | |
buf127 = empty_strided((396, 1, 7, 7), (49, 49, 7, 1), device='cuda', dtype=torch.float32) | |
triton_poi_fused__to_copy_21.run(buf126, buf127, 19404, grid=grid(19404), stream=stream0) | |
del buf126 | |
buf128 = buf123; del buf123 # reuse | |
triton_poi_fused_convolution_backward_22.run(buf116, squeeze_157, primals_105, buf128, 2483712, grid=grid(2483712), stream=stream0) | |
buf129 = aten.convolution_backward(buf128, getitem_389, convert_element_type_357, [0], [1, 1], [2, 2], [1, 1], False, [0, 0], 396, [True, True, False]) | |
del convert_element_type_357 | |
del getitem_389 | |
buf130 = buf129[0] | |
assert_size_stride(buf130, (128, 396, 7, 7), (19404, 49, 7, 1)) | |
buf131 = buf129[1] | |
assert_size_stride(buf131, (396, 1, 5, 5), (25, 25, 5, 1)) | |
del buf129 | |
buf132 = empty_strided((396, 1, 5, 5), (25, 25, 5, 1), device='cuda', dtype=torch.float32) | |
triton_poi_fused__to_copy_23.run(buf131, buf132, 9900, grid=grid(9900), stream=stream0) | |
del buf131 | |
buf133 = buf128; del buf128 # reuse | |
triton_poi_fused_convolution_backward_24.run(buf116, squeeze_157, primals_105, buf133, 2483712, grid=grid(2483712), stream=stream0) | |
del primals_105 | |
del squeeze_157 | |
buf134 = aten.convolution_backward(buf133, getitem_384, convert_element_type_356, [0], [1, 1], [1, 1], [1, 1], False, [0, 0], 396, [True, True, False]) | |
del buf133 | |
del convert_element_type_356 | |
del getitem_384 | |
buf135 = buf134[0] | |
assert_size_stride(buf135, (128, 396, 7, 7), (19404, 49, 7, 1)) | |
buf136 = buf134[1] | |
assert_size_stride(buf136, (396, 1, 3, 3), (9, 9, 3, 1)) | |
del buf134 | |
buf137 = empty_strided((396, 1, 3, 3), (9, 9, 3, 1), device='cuda', dtype=torch.float32) | |
triton_poi_fused__to_copy_25.run(buf136, buf137, 3564, grid=grid(3564), stream=stream0) | |
del buf136 | |
buf142 = buf96; del buf96 # reuse | |
buf138 = as_strided(buf142, (128, 396, 7, 7), (77616, 49, 7, 1)) # alias | |
triton_poi_fused_cat_26.run(buf135, buf138, 2483712, grid=grid(2483712), stream=stream0) | |
del buf135 | |
buf139 = as_strided(buf142, (128, 396, 7, 7), (77616, 49, 7, 1), 19404) # alias | |
triton_poi_fused_cat_27.run(buf130, buf139, 2483712, grid=grid(2483712), stream=stream0) | |
del buf130 | |
buf140 = as_strided(buf142, (128, 396, 7, 7), (77616, 49, 7, 1), 38808) # alias | |
triton_poi_fused_cat_27.run(buf125, buf140, 2483712, grid=grid(2483712), stream=stream0) | |
del buf125 | |
buf141 = as_strided(buf142, (128, 396, 7, 7), (77616, 49, 7, 1), 58212) # alias | |
triton_poi_fused_cat_27.run(buf120, buf141, 2483712, grid=grid(2483712), stream=stream0) | |
buf143 = buf115; del buf115 # reuse | |
buf144 = empty_strided((1584, ), (1, ), device='cuda', dtype=torch.float32) | |
buf145 = empty_strided((1584, ), (1, ), device='cuda', dtype=torch.float32) | |
triton_red_fused__native_batch_norm_legit_functional_mul_native_batch_norm_backward_28.run(buf142, mul_548, convolution_136, unsqueeze_306, squeeze_154, buf143, buf144, buf145, 1584, 6272, grid=grid(1584), stream=stream0) | |
del buf138 | |
del buf139 | |
del buf140 | |
del buf141 | |
buf146 = buf142; del buf142 # reuse | |
triton_poi_fused__native_batch_norm_legit_functional_convolution_backward_mul_native_batch_norm_backward_29.run(buf146, mul_548, convolution_136, unsqueeze_306, buf144, squeeze_154, buf143, primals_103, 9934848, grid=grid(9934848), stream=stream0) | |
del convolution_136 | |
del mul_548 | |
del primals_103 | |
del squeeze_154 | |
del unsqueeze_306 | |
buf147 = aten.convolution_backward(buf146, add_266, convert_element_type_351, [0], [1, 1], [0, 0], [1, 1], False, [0, 0], 1, [True, True, False]) | |
del add_266 | |
del convert_element_type_351 | |
buf148 = buf147[0] | |
assert_size_stride(buf148, (128, 264, 7, 7), (12936, 49, 7, 1)) | |
buf149 = buf147[1] | |
assert_size_stride(buf149, (1584, 264, 1, 1), (264, 1, 1, 1)) | |
del buf147 | |
buf150 = empty_strided((1584, 264, 1, 1), (264, 1, 1, 1), device='cuda', dtype=torch.float32) | |
triton_poi_fused__to_copy_30.run(buf149, buf150, 418176, grid=grid(418176), stream=stream0) | |
del buf149 | |
buf151 = buf83; del buf83 # reuse | |
buf152 = empty_strided((264, ), (1, ), device='cuda', dtype=torch.float32) | |
buf154 = empty_strided((264, ), (1, ), device='cuda', dtype=torch.float32) | |
triton_red_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_33.run(buf10, buf79, buf148, cat_36, unsqueeze_318, squeeze_151, buf151, buf152, buf154, 264, 6272, grid=grid(264), stream=stream0) | |
buf153 = empty_strided((128, 264, 7, 7), (12936, 49, 7, 1), device='cuda', dtype=torch.float32) | |
triton_poi_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_34.run(buf10, buf79, buf148, cat_36, unsqueeze_318, buf152, squeeze_151, buf151, primals_101, buf153, 1655808, grid=grid(1655808), stream=stream0) | |
del cat_36 | |
del primals_101 | |
del squeeze_151 | |
del unsqueeze_318 | |
buf155 = empty_strided((128, 132, 7, 7), (6468, 49, 7, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused_convolution_backward_35.run(buf153, buf155, 827904, grid=grid(827904), stream=stream0) | |
buf156 = aten.convolution_backward(buf155, getitem_375, convert_element_type_348, [0], [1, 1], [0, 0], [1, 1], False, [0, 0], 1, [True, True, False]) | |
del convert_element_type_348 | |
del getitem_375 | |
buf157 = buf156[0] | |
assert_size_stride(buf157, (128, 792, 7, 7), (38808, 49, 7, 1)) | |
buf158 = buf156[1] | |
assert_size_stride(buf158, (132, 792, 1, 1), (792, 1, 1, 1)) | |
del buf156 | |
buf159 = empty_strided((132, 792, 1, 1), (792, 1, 1, 1), device='cuda', dtype=torch.float32) | |
triton_poi_fused__to_copy_7.run(buf158, buf159, 104544, grid=grid(104544), stream=stream0) | |
del buf158 | |
buf160 = buf155; del buf155 # reuse | |
triton_poi_fused_convolution_backward_36.run(buf153, buf160, 827904, grid=grid(827904), stream=stream0) | |
del buf153 | |
buf161 = aten.convolution_backward(buf160, getitem_374, convert_element_type_347, [0], [1, 1], [0, 0], [1, 1], False, [0, 0], 1, [True, True, False]) | |
del buf160 | |
del convert_element_type_347 | |
del getitem_374 | |
buf162 = buf161[0] | |
assert_size_stride(buf162, (128, 792, 7, 7), (38808, 49, 7, 1)) | |
buf163 = buf161[1] | |
assert_size_stride(buf163, (132, 792, 1, 1), (792, 1, 1, 1)) | |
del buf161 | |
buf164 = empty_strided((132, 792, 1, 1), (792, 1, 1, 1), device='cuda', dtype=torch.float32) | |
triton_poi_fused__to_copy_7.run(buf163, buf164, 104544, grid=grid(104544), stream=stream0) | |
del buf163 | |
buf167 = buf146; del buf146 # reuse | |
buf165 = as_strided(buf167, (128, 792, 7, 7), (77616, 49, 7, 1)) # alias | |
triton_poi_fused_cat_8.run(buf162, buf165, 4967424, grid=grid(4967424), stream=stream0) | |
del buf162 | |
buf166 = as_strided(buf167, (128, 792, 7, 7), (77616, 49, 7, 1), 38808) # alias | |
triton_poi_fused_cat_9.run(buf157, buf166, 4967424, grid=grid(4967424), stream=stream0) | |
del buf157 | |
buf168 = as_strided(buf109, (128, 1584, 1, 1), (1584, 1, 202752, 202752)); del buf109 # reuse | |
buf169 = as_strided(buf168, (128, 1584, 1, 1), (1584, 1, 1, 1)); del buf168 # reuse | |
triton_per_fused_mul_sigmoid_sigmoid_backward_silu_sum_10.run(buf169, buf167, convert_element_type_338, convolution_133, 202752, 49, grid=grid(202752), stream=stream0) | |
del buf165 | |
del buf166 | |
buf175 = buf144; del buf144 # reuse | |
triton_per_fused__to_copy_convolution_backward_11.run(buf169, buf175, 1584, 128, grid=grid(1584), stream=stream0) | |
buf171 = aten.convolution_backward(buf169, convert_element_type_344, convert_element_type_346, [1584], [1, 1], [0, 0], [1, 1], False, [0, 0], 1, [True, True, False]) | |
del buf169 | |
del convert_element_type_344 | |
del convert_element_type_346 | |
buf172 = buf171[0] | |
assert_size_stride(buf172, (128, 132, 1, 1), (132, 1, 1, 1)) | |
buf173 = buf171[1] | |
assert_size_stride(buf173, (1584, 132, 1, 1), (132, 1, 1, 1)) | |
del buf171 | |
buf174 = empty_strided((1584, 132, 1, 1), (132, 1, 1, 1), device='cuda', dtype=torch.float32) | |
triton_poi_fused__to_copy_12.run(buf173, buf174, 209088, grid=grid(209088), stream=stream0) | |
del buf173 | |
buf177 = buf172; del buf172 # reuse | |
triton_poi_fused_add_clone_fill_mul_sigmoid_sub_13.run(buf177, convolution_132, 16896, grid=grid(16896), stream=stream0) | |
del convolution_132 | |
buf183 = empty_strided((132, ), (1, ), device='cuda', dtype=torch.float32) | |
triton_per_fused__to_copy_convolution_backward_14.run(buf177, buf183, 132, 128, grid=grid(132), stream=stream0) | |
buf179 = aten.convolution_backward(buf177, mean_13, convert_element_type_342, [132], [1, 1], [0, 0], [1, 1], False, [0, 0], 1, [True, True, False]) | |
del buf177 | |
del convert_element_type_342 | |
del mean_13 | |
buf180 = buf179[0] | |
assert_size_stride(buf180, (128, 1584, 1, 1), (1584, 1, 1, 1)) | |
buf181 = buf179[1] | |
assert_size_stride(buf181, (132, 1584, 1, 1), (1584, 1, 1, 1)) | |
del buf179 | |
buf182 = empty_strided((132, 1584, 1, 1), (1584, 1, 1, 1), device='cuda', dtype=torch.float32) | |
triton_poi_fused__to_copy_15.run(buf181, buf182, 209088, grid=grid(209088), stream=stream0) | |
del buf181 | |
buf185 = empty_strided((1584, ), (1, ), device='cuda', dtype=torch.float32) | |
buf186 = empty_strided((1584, ), (1, ), device='cuda', dtype=torch.float32) | |
buf188 = empty_strided((1584, ), (1, ), device='cuda', dtype=torch.float32) | |
triton_red_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_16.run(buf167, convolution_133, buf180, convert_element_type_338, cat_35, unsqueeze_330, squeeze_148, buf185, buf186, buf188, 1584, 6272, grid=grid(1584), stream=stream0) | |
buf187 = buf116; del buf116 # reuse | |
triton_poi_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_17.run(buf167, convolution_133, buf180, convert_element_type_338, cat_35, unsqueeze_330, buf186, squeeze_148, buf185, buf187, 9934848, grid=grid(9934848), stream=stream0) | |
del buf180 | |
del cat_35 | |
del convert_element_type_338 | |
del convolution_133 | |
del unsqueeze_330 | |
buf189 = buf120; del buf120 # reuse | |
triton_poi_fused_convolution_backward_18.run(buf187, squeeze_148, primals_99, buf189, 2483712, grid=grid(2483712), stream=stream0) | |
buf190 = aten.convolution_backward(buf189, getitem_371, convert_element_type_336, [0], [1, 1], [4, 4], [1, 1], False, [0, 0], 396, [True, True, False]) | |
del convert_element_type_336 | |
del getitem_371 | |
buf191 = buf190[0] | |
assert_size_stride(buf191, (128, 396, 7, 7), (19404, 49, 7, 1)) | |
buf192 = buf190[1] | |
assert_size_stride(buf192, (396, 1, 9, 9), (81, 81, 9, 1)) | |
del buf190 | |
buf193 = empty_strided((396, 1, 9, 9), (81, 81, 9, 1), device='cuda', dtype=torch.float32) | |
triton_poi_fused__to_copy_19.run(buf192, buf193, 32076, grid=grid(32076), stream=stream0) | |
del buf192 | |
buf194 = buf189; del buf189 # reuse | |
triton_poi_fused_convolution_backward_20.run(buf187, squeeze_148, primals_99, buf194, 2483712, grid=grid(2483712), stream=stream0) | |
buf195 = aten.convolution_backward(buf194, getitem_366, convert_element_type_335, [0], [1, 1], [3, 3], [1, 1], False, [0, 0], 396, [True, True, False]) | |
del convert_element_type_335 | |
del getitem_366 | |
buf196 = buf195[0] | |
assert_size_stride(buf196, (128, 396, 7, 7), (19404, 49, 7, 1)) | |
buf197 = buf195[1] | |
assert_size_stride(buf197, (396, 1, 7, 7), (49, 49, 7, 1)) | |
del buf195 | |
buf198 = empty_strided((396, 1, 7, 7), (49, 49, 7, 1), device='cuda', dtype=torch.float32) | |
triton_poi_fused__to_copy_21.run(buf197, buf198, 19404, grid=grid(19404), stream=stream0) | |
del buf197 | |
buf199 = buf194; del buf194 # reuse | |
triton_poi_fused_convolution_backward_22.run(buf187, squeeze_148, primals_99, buf199, 2483712, grid=grid(2483712), stream=stream0) | |
buf200 = aten.convolution_backward(buf199, getitem_361, convert_element_type_334, [0], [1, 1], [2, 2], [1, 1], False, [0, 0], 396, [True, True, False]) | |
del convert_element_type_334 | |
del getitem_361 | |
buf201 = buf200[0] | |
assert_size_stride(buf201, (128, 396, 7, 7), (19404, 49, 7, 1)) | |
buf202 = buf200[1] | |
assert_size_stride(buf202, (396, 1, 5, 5), (25, 25, 5, 1)) | |
del buf200 | |
buf203 = empty_strided((396, 1, 5, 5), (25, 25, 5, 1), device='cuda', dtype=torch.float32) | |
triton_poi_fused__to_copy_23.run(buf202, buf203, 9900, grid=grid(9900), stream=stream0) | |
del buf202 | |
buf204 = buf199; del buf199 # reuse | |
triton_poi_fused_convolution_backward_24.run(buf187, squeeze_148, primals_99, buf204, 2483712, grid=grid(2483712), stream=stream0) | |
del buf187 | |
del primals_99 | |
del squeeze_148 | |
buf205 = aten.convolution_backward(buf204, getitem_356, convert_element_type_333, [0], [1, 1], [1, 1], [1, 1], False, [0, 0], 396, [True, True, False]) | |
del buf204 | |
del convert_element_type_333 | |
del getitem_356 | |
buf206 = buf205[0] | |
assert_size_stride(buf206, (128, 396, 7, 7), (19404, 49, 7, 1)) | |
buf207 = buf205[1] | |
assert_size_stride(buf207, (396, 1, 3, 3), (9, 9, 3, 1)) | |
del buf205 | |
buf208 = empty_strided((396, 1, 3, 3), (9, 9, 3, 1), device='cuda', dtype=torch.float32) | |
triton_poi_fused__to_copy_25.run(buf207, buf208, 3564, grid=grid(3564), stream=stream0) | |
del buf207 | |
buf213 = buf167; del buf167 # reuse | |
buf209 = as_strided(buf213, (128, 396, 7, 7), (77616, 49, 7, 1)) # alias | |
triton_poi_fused_cat_26.run(buf206, buf209, 2483712, grid=grid(2483712), stream=stream0) | |
del buf206 | |
buf210 = as_strided(buf213, (128, 396, 7, 7), (77616, 49, 7, 1), 19404) # alias | |
triton_poi_fused_cat_27.run(buf201, buf210, 2483712, grid=grid(2483712), stream=stream0) | |
del buf201 | |
buf211 = as_strided(buf213, (128, 396, 7, 7), (77616, 49, 7, 1), 38808) # alias | |
triton_poi_fused_cat_27.run(buf196, buf211, 2483712, grid=grid(2483712), stream=stream0) | |
del buf196 | |
buf212 = as_strided(buf213, (128, 396, 7, 7), (77616, 49, 7, 1), 58212) # alias | |
triton_poi_fused_cat_27.run(buf191, buf212, 2483712, grid=grid(2483712), stream=stream0) | |
del buf191 | |
buf214 = buf186; del buf186 # reuse | |
buf215 = empty_strided((1584, ), (1, ), device='cuda', dtype=torch.float32) | |
buf216 = empty_strided((1584, ), (1, ), device='cuda', dtype=torch.float32) | |
triton_red_fused__native_batch_norm_legit_functional_mul_native_batch_norm_backward_28.run(buf213, mul_588, convolution_127, unsqueeze_342, squeeze_145, buf214, buf215, buf216, 1584, 6272, grid=grid(1584), stream=stream0) | |
del buf209 | |
del buf210 | |
del buf211 | |
del buf212 | |
buf217 = buf213; del buf213 # reuse | |
triton_poi_fused__native_batch_norm_legit_functional_convolution_backward_mul_native_batch_norm_backward_29.run(buf217, mul_588, convolution_127, unsqueeze_342, buf215, squeeze_145, buf214, primals_97, 9934848, grid=grid(9934848), stream=stream0) | |
del buf215 | |
del convolution_127 | |
del mul_588 | |
del primals_97 | |
del squeeze_145 | |
del unsqueeze_342 | |
buf218 = aten.convolution_backward(buf217, convert_element_type_327, convert_element_type_328, [0], [1, 1], [0, 0], [1, 1], False, [0, 0], 1, [True, True, False]) | |
del buf217 | |
del convert_element_type_327 | |
del convert_element_type_328 | |
buf219 = buf218[0] | |
assert_size_stride(buf219, (128, 264, 7, 7), (12936, 49, 7, 1)) | |
buf220 = buf218[1] | |
assert_size_stride(buf220, (1584, 264, 1, 1), (264, 1, 1, 1)) | |
del buf218 | |
buf221 = empty_strided((1584, 264, 1, 1), (264, 1, 1, 1), device='cuda', dtype=torch.float32) | |
triton_poi_fused__to_copy_30.run(buf220, buf221, 418176, grid=grid(418176), stream=stream0) | |
del buf220 | |
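# buf10, buf79, buf148 and buf219 are the gradients reaching the same
# 264-channel activation through the stage's residual shortcuts; the fused
# batch-norm-backward kernels below (_37 / _38) sum all four on the fly
# rather than materializing an explicit chain of adds.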
buf222 = buf152; del buf152 # reuse | |
buf223 = empty_strided((264, ), (1, ), device='cuda', dtype=torch.float32) | |
buf225 = empty_strided((264, ), (1, ), device='cuda', dtype=torch.float32) | |
triton_red_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_37.run(buf10, buf79, buf148, buf219, convolution_126, unsqueeze_354, squeeze_142, buf222, buf223, buf225, 264, 6272, grid=grid(264), stream=stream0) | |
buf226 = buf85; del buf85 # reuse | |
triton_poi_fused__native_batch_norm_legit_functional_add_convolution_backward_native_batch_norm_backward_38.run(buf10, buf79, buf148, buf219, convolution_126, unsqueeze_354, buf223, squeeze_142, buf222, primals_95, buf226, 1655808, grid=grid(1655808), stream=stream0) | |
del buf10 | |
del buf148 | |
del buf219 | |
del buf223 | |
del buf79 | |
del convolution_126 | |
del primals_95 | |
del squeeze_142 | |
del unsqueeze_354 | |
buf227 = aten.convolution_backward(buf226, mul_380, convert_element_type_325, [0], [1, 1], [0, 0], [1, 1], False, [0, 0], 1, [True, True, False]) | |
del buf226 | |
del convert_element_type_325 | |
del mul_380 | |
buf228 = buf227[0] | |
assert_size_stride(buf228, (128, 960, 7, 7), (47040, 49, 7, 1)) | |
buf229 = buf227[1] | |
assert_size_stride(buf229, (264, 960, 1, 1), (960, 1, 1, 1)) | |
del buf227 | |
buf230 = empty_strided((264, 960, 1, 1), (960, 1, 1, 1), device='cuda', dtype=torch.float32) | |
triton_poi_fused__to_copy_39.run(buf229, buf230, 253440, grid=grid(253440), stream=stream0) | |
del buf229 | |
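# Note the (960, 1, 122880, 122880) strides on buf231: the two trailing dims
# have extent 1, so their strides never affect addressing; the as_strided on
# the next line simply renormalizes them to 1 before the kernel runs.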
buf231 = empty_strided((128, 960, 1, 1), (960, 1, 122880, 122880), device='cuda', dtype=torch.float16) | |
buf232 = as_strided(buf231, (128, 960, 1, 1), (960, 1, 1, 1)); del buf231 # reuse | |
triton_per_fused_mul_sigmoid_sigmoid_backward_silu_sum_40.run(buf232, buf228, convert_element_type_316, convolution_125, 122880, 49, grid=grid(122880), stream=stream0) | |
buf238 = empty_strided((960, ), (1, ), device='cuda', dtype=torch.float32) | |
triton_per_fused__to_copy_convolution_backward_41.run(buf232, buf238, 960, 128, grid=grid(960), stream=stream0) | |
buf234 = aten.convolution_backward(buf232, convert_element_type_322, convert_element_type_324, [960], [1, 1], [0, 0], [1, 1], False, [0, 0], 1, [True, True, False]) | |
del buf232 | |
del convert_element_type_322 | |
del convert_element_type_324 | |
buf235 = buf234[0] | |
assert_size_stride(buf235, (128, 80, 1, 1), (80, 1, 1, 1)) | |
buf236 = buf234[1] | |
assert_size_stride(buf236, (960, 80, 1, 1), (80, 1, 1, 1)) | |
del buf234 | |
buf237 = empty_strided((960, 80, 1, 1), (80, 1, 1, 1), device='cuda', dtype=torch.float32) | |
triton_poi_fused__to_copy_42.run(buf236, buf237, 76800, grid=grid(76800), stream=stream0) | |
del buf236 | |
buf240 = buf235; del buf235 # reuse | |
triton_poi_fused_add_clone_fill_mul_sigmoid_sub_43.run(buf240, convolution_124, 10240, grid=grid(10240), stream=stream0) | |
del convolution_124 | |
buf246 = empty_strided((80, ), (1, ), device='cuda', dtype=torch.float32) | |
triton_per_fused__to_copy_convolution_backward_44.run(buf240, buf246, 80, 128, grid=grid(80), stream=stream0) | |
buf242 = aten.convolution_backward(buf240, mean_12, convert_element_type_320, [80], [1, 1], [0, 0], [1, 1], False, [0, 0], 1, [True, True, False]) | |
del buf240 | |
del convert_element_type_320 | |
del mean_12 | |
buf243 = buf242[0] | |
assert_size_stride(buf243, (128, 960, 1, 1), (960, 1, 1, 1)) | |
buf244 = buf242[1] | |
assert_size_stride(buf244, (80, 960, 1, 1), (960, 1, 1, 1)) | |
del buf242 | |
buf245 = empty_strided((80, 960, 1, 1), (960, 1, 1, 1), device='cuda', dtype=torch.float32) | |
triton_poi_fused__to_copy_45.run(buf244, buf245, 76800, grid=grid(76800), stream=stream0) | |
del buf244 | |
buf248 = empty_strided((960, ), (1, ), device='cuda', dtype=torch.float32) | |
buf249 = empty_strided((960, ), (1, ), device='cuda', dtype=torch.float32) | |
buf251 = empty_strided((960, ), (1, ), device='cuda', dtype=torch.float32) | |
triton_red_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_46.run(buf228, convolution_125, buf243, convert_element_type_316, cat_34, unsqueeze_366, squeeze_139, buf248, buf249, buf251, 960, 6272, grid=grid(960), stream=stream0) | |
buf250 = empty_strided((128, 960, 7, 7), (47040, 49, 7, 1), device='cuda', dtype=torch.float32) | |
triton_poi_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_47.run(buf228, convolution_125, buf243, convert_element_type_316, cat_34, unsqueeze_366, buf249, squeeze_139, buf248, buf250, 6021120, grid=grid(6021120), stream=stream0) | |
del buf228 | |
del buf243 | |
del cat_34 | |
del convert_element_type_316 | |
del convolution_125 | |
del unsqueeze_366 | |
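# Transition block: the same four-way mixed depthwise backward, now with
# stride [2, 2], so each convolution_backward takes (128, 240, 7, 7) output
# gradients and returns (128, 240, 14, 14) input gradients, the stage boundary
# where spatial resolution doubles on the way back up.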
buf252 = empty_strided((128, 240, 7, 7), (11760, 49, 7, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused_convolution_backward_48.run(buf250, squeeze_139, primals_93, buf252, 1505280, grid=grid(1505280), stream=stream0) | |
buf253 = aten.convolution_backward(buf252, getitem_345, convert_element_type_314, [0], [2, 2], [4, 4], [1, 1], False, [0, 0], 240, [True, True, False]) | |
del convert_element_type_314 | |
del getitem_345 | |
buf254 = buf253[0] | |
assert_size_stride(buf254, (128, 240, 14, 14), (47040, 196, 14, 1)) | |
buf255 = buf253[1] | |
assert_size_stride(buf255, (240, 1, 9, 9), (81, 81, 9, 1)) | |
del buf253 | |
buf256 = empty_strided((240, 1, 9, 9), (81, 81, 9, 1), device='cuda', dtype=torch.float32) | |
triton_poi_fused__to_copy_49.run(buf255, buf256, 19440, grid=grid(19440), stream=stream0) | |
del buf255 | |
buf257 = buf252; del buf252 # reuse | |
triton_poi_fused_convolution_backward_50.run(buf250, squeeze_139, primals_93, buf257, 1505280, grid=grid(1505280), stream=stream0) | |
buf258 = aten.convolution_backward(buf257, getitem_340, convert_element_type_313, [0], [2, 2], [3, 3], [1, 1], False, [0, 0], 240, [True, True, False]) | |
del convert_element_type_313 | |
del getitem_340 | |
buf259 = buf258[0] | |
assert_size_stride(buf259, (128, 240, 14, 14), (47040, 196, 14, 1)) | |
buf260 = buf258[1] | |
assert_size_stride(buf260, (240, 1, 7, 7), (49, 49, 7, 1)) | |
del buf258 | |
buf261 = empty_strided((240, 1, 7, 7), (49, 49, 7, 1), device='cuda', dtype=torch.float32) | |
triton_poi_fused__to_copy_51.run(buf260, buf261, 11760, grid=grid(11760), stream=stream0) | |
del buf260 | |
buf262 = buf257; del buf257 # reuse | |
triton_poi_fused_convolution_backward_52.run(buf250, squeeze_139, primals_93, buf262, 1505280, grid=grid(1505280), stream=stream0) | |
buf263 = aten.convolution_backward(buf262, getitem_335, convert_element_type_312, [0], [2, 2], [2, 2], [1, 1], False, [0, 0], 240, [True, True, False]) | |
del convert_element_type_312 | |
del getitem_335 | |
buf264 = buf263[0] | |
assert_size_stride(buf264, (128, 240, 14, 14), (47040, 196, 14, 1)) | |
buf265 = buf263[1] | |
assert_size_stride(buf265, (240, 1, 5, 5), (25, 25, 5, 1)) | |
del buf263 | |
buf266 = empty_strided((240, 1, 5, 5), (25, 25, 5, 1), device='cuda', dtype=torch.float32) | |
triton_poi_fused__to_copy_53.run(buf265, buf266, 6000, grid=grid(6000), stream=stream0) | |
del buf265 | |
buf267 = buf262; del buf262 # reuse | |
triton_poi_fused_convolution_backward_54.run(buf250, squeeze_139, primals_93, buf267, 1505280, grid=grid(1505280), stream=stream0) | |
del buf250 | |
del primals_93 | |
del squeeze_139 | |
buf268 = aten.convolution_backward(buf267, getitem_330, convert_element_type_311, [0], [2, 2], [1, 1], [1, 1], False, [0, 0], 240, [True, True, False]) | |
del buf267 | |
del convert_element_type_311 | |
del getitem_330 | |
buf269 = buf268[0] | |
assert_size_stride(buf269, (128, 240, 14, 14), (47040, 196, 14, 1)) | |
buf270 = buf268[1] | |
assert_size_stride(buf270, (240, 1, 3, 3), (9, 9, 3, 1)) | |
del buf268 | |
buf271 = empty_strided((240, 1, 3, 3), (9, 9, 3, 1), device='cuda', dtype=torch.float32) | |
triton_poi_fused__to_copy_55.run(buf270, buf271, 2160, grid=grid(2160), stream=stream0) | |
del buf270 | |
buf276 = empty_strided((128, 960, 14, 14), (188160, 196, 14, 1), device='cuda', dtype=torch.float16) | |
buf272 = as_strided(buf276, (128, 240, 14, 14), (188160, 196, 14, 1)) # alias | |
triton_poi_fused_cat_56.run(buf269, buf272, 6021120, grid=grid(6021120), stream=stream0) | |
del buf269 | |
buf273 = as_strided(buf276, (128, 240, 14, 14), (188160, 196, 14, 1), 47040) # alias | |
triton_poi_fused_cat_56.run(buf264, buf273, 6021120, grid=grid(6021120), stream=stream0) | |
del buf264 | |
buf274 = as_strided(buf276, (128, 240, 14, 14), (188160, 196, 14, 1), 94080) # alias | |
triton_poi_fused_cat_56.run(buf259, buf274, 6021120, grid=grid(6021120), stream=stream0) | |
del buf259 | |
buf275 = as_strided(buf276, (128, 240, 14, 14), (188160, 196, 14, 1), 141120) # alias | |
triton_poi_fused_cat_56.run(buf254, buf275, 6021120, grid=grid(6021120), stream=stream0) | |
del buf254 | |
buf277 = buf249; del buf249 # reuse | |
buf278 = empty_strided((960, ), (1, ), device='cuda', dtype=torch.float32) | |
buf279 = empty_strided((960, ), (1, ), device='cuda', dtype=torch.float32) | |
triton_red_fused__native_batch_norm_legit_functional_mul_native_batch_norm_backward_57.run(buf276, mul_628, convolution_119, unsqueeze_378, squeeze_136, buf277, buf278, buf279, 960, 25088, grid=grid(960), stream=stream0) | |
del buf272 | |
del buf273 | |
del buf274 | |
del buf275 | |
buf280 = buf276; del buf276 # reuse | |
triton_poi_fused__native_batch_norm_legit_functional_convolution_backward_mul_native_batch_norm_backward_58.run(buf280, mul_628, convolution_119, unsqueeze_378, buf278, squeeze_136, buf277, primals_91, 24084480, grid=grid(24084480), stream=stream0) | |
del buf278 | |
del convolution_119 | |
del mul_628 | |
del primals_91 | |
del squeeze_136 | |
del unsqueeze_378 | |
buf281 = aten.convolution_backward(buf280, add_235, convert_element_type_306, [0], [1, 1], [0, 0], [1, 1], False, [0, 0], 1, [True, True, False]) | |
del add_235 | |
del buf280 | |
del convert_element_type_306 | |
buf282 = buf281[0] | |
assert_size_stride(buf282, (128, 160, 14, 14), (31360, 196, 14, 1)) | |
buf283 = buf281[1] | |
assert_size_stride(buf283, (960, 160, 1, 1), (160, 1, 1, 1)) | |
del buf281 | |
buf284 = empty_strided((960, 160, 1, 1), (160, 1, 1, 1), device='cuda', dtype=torch.float32) | |
triton_poi_fused__to_copy_59.run(buf283, buf284, 153600, grid=grid(153600), stream=stream0) | |
del buf283 | |
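# For this 160-channel batch norm the per-channel reduction spans
# 128*14*14 = 25088 elements, so it is split: a first kernel emits (160, 4)
# partial sums (grid 640, 6272 elements each), and the small _61/_62 kernels
# fold the four partials into the final statistics. Presumably this keeps each
# Triton reduction short enough to parallelize well; the 7x7 stages above got
# away with a single-stage reduction over 6272 elements per channel.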
buf285 = empty_strided((160, 4), (1, 160), device='cuda', dtype=torch.float32) | |
buf287 = empty_strided((160, 4), (1, 160), device='cuda', dtype=torch.float32) | |
triton_red_fused__native_batch_norm_legit_functional_native_batch_norm_backward_60.run(buf282, cat_33, unsqueeze_390, buf285, buf287, 640, 6272, grid=grid(640), stream=stream0) | |
buf286 = empty_strided((160, ), (1, ), device='cuda', dtype=torch.float32) | |
triton_per_fused_native_batch_norm_backward_61.run(buf285, buf286, 160, 4, grid=grid(160), stream=stream0) | |
buf288 = empty_strided((160, ), (1, ), device='cuda', dtype=torch.float32) | |
buf289 = empty_strided((160, ), (1, ), device='cuda', dtype=torch.float32) | |
triton_per_fused__native_batch_norm_legit_functional_native_batch_norm_backward_62.run(buf287, squeeze_133, buf288, buf289, 160, 4, grid=grid(160), stream=stream0) | |
buf290 = empty_strided((128, 160, 14, 14), (31360, 196, 14, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__native_batch_norm_legit_functional_native_batch_norm_backward_63.run(buf282, cat_33, unsqueeze_390, buf288, squeeze_133, buf286, primals_89, buf290, 4014080, grid=grid(4014080), stream=stream0) | |
del cat_33 | |
del primals_89 | |
del squeeze_133 | |
del unsqueeze_390 | |
buf291 = aten.convolution_backward(as_strided(buf290, (128, 80, 14, 14), (31360, 196, 14, 1), 15680), getitem_321, convert_element_type_303, [0], [1, 1], [0, 0], [1, 1], False, [0, 0], 1, [True, True, False]) | |
del convert_element_type_303 | |
del getitem_321 | |
buf292 = buf291[0] | |
assert_size_stride(buf292, (128, 240, 14, 14), (47040, 196, 14, 1)) | |
buf293 = buf291[1] | |
assert_size_stride(buf293, (80, 240, 1, 1), (240, 1, 1, 1)) | |
del buf291 | |
buf294 = empty_strided((80, 240, 1, 1), (240, 1, 1, 1), device='cuda', dtype=torch.float32) | |
triton_poi_fused__to_copy_64.run(buf293, buf294, 19200, grid=grid(19200), stream=stream0) | |
del buf293 | |
buf295 = aten.convolution_backward(as_strided(buf290, (128, 80, 14, 14), (31360, 196, 14, 1)), getitem_320, convert_element_type_302, [0], [1, 1], [0, 0], [1, 1], False, [0, 0], 1, [True, True, False]) | |
del convert_element_type_302 | |
del getitem_320 | |
buf296 = buf295[0] | |
assert_size_stride(buf296, (128, 240, 14, 14), (47040, 196, 14, 1)) | |
buf297 = buf295[1] | |
assert_size_stride(buf297, (80, 240, 1, 1), (240, 1, 1, 1)) | |
del buf295 | |
buf298 = empty_strided((80, 240, 1, 1), (240, 1, 1, 1), device='cuda', dtype=torch.float32) | |
triton_poi_fused__to_copy_64.run(buf297, buf298, 19200, grid=grid(19200), stream=stream0) | |
del buf297 | |
buf301 = empty_strided((128, 480, 14, 14), (94080, 196, 14, 1), device='cuda', dtype=torch.float16) | |
buf299 = as_strided(buf301, (128, 240, 14, 14), (94080, 196, 14, 1)) # alias | |
triton_poi_fused_cat_65.run(buf296, buf299, 6021120, grid=grid(6021120), stream=stream0) | |
del buf296 | |
buf300 = as_strided(buf301, (128, 240, 14, 14), (94080, 196, 14, 1), 47040) # alias | |
triton_poi_fused_cat_65.run(buf292, buf300, 6021120, grid=grid(6021120), stream=stream0) | |
del buf292 | |
buf302 = empty_strided((128, 480, 1, 1), (480, 1, 61440, 61440), device='cuda', dtype=torch.float16) | |
buf303 = as_strided(buf302, (128, 480, 1, 1), (480, 1, 1, 1)); del buf302 # reuse | |
triton_per_fused_mul_sigmoid_sigmoid_backward_silu_sum_66.run(buf303, buf301, convert_element_type_293, convolution_116, 61440, 196, grid=grid(61440), stream=stream0) | |
del buf299 | |
del buf300 | |
buf309 = empty_strided((480, ), (1, ), device='cuda', dtype=torch.float32) | |
triton_per_fused__to_copy_convolution_backward_67.run(buf303, buf309, 480, 128, grid=grid(480), stream=stream0) | |
buf305 = aten.convolution_backward(buf303, convert_element_type_299, convert_element_type_301, [480], [1, 1], [0, 0], [1, 1], False, [0, 0], 1, [True, True, False]) | |
del buf303 | |
del convert_element_type_299 | |
del convert_element_type_301 | |
buf306 = buf305[0] | |
assert_size_stride(buf306, (128, 80, 1, 1), (80, 1, 1, 1)) | |
buf307 = buf305[1] | |
assert_size_stride(buf307, (480, 80, 1, 1), (80, 1, 1, 1)) | |
del buf305 | |
buf308 = empty_strided((480, 80, 1, 1), (80, 1, 1, 1), device='cuda', dtype=torch.float32) | |
triton_poi_fused__to_copy_68.run(buf307, buf308, 38400, grid=grid(38400), stream=stream0) | |
del buf307 | |
buf311 = buf306; del buf306 # reuse | |
triton_poi_fused_add_clone_fill_mul_sigmoid_sub_43.run(buf311, convolution_115, 10240, grid=grid(10240), stream=stream0) | |
del convolution_115 | |
buf317 = empty_strided((80, ), (1, ), device='cuda', dtype=torch.float32) | |
triton_per_fused__to_copy_convolution_backward_44.run(buf311, buf317, 80, 128, grid=grid(80), stream=stream0) | |
buf313 = aten.convolution_backward(buf311, mean_11, convert_element_type_297, [80], [1, 1], [0, 0], [1, 1], False, [0, 0], 1, [True, True, False]) | |
del buf311 | |
del convert_element_type_297 | |
del mean_11 | |
buf314 = buf313[0] | |
assert_size_stride(buf314, (128, 480, 1, 1), (480, 1, 1, 1)) | |
buf315 = buf313[1] | |
assert_size_stride(buf315, (80, 480, 1, 1), (480, 1, 1, 1)) | |
del buf313 | |
buf316 = empty_strided((80, 480, 1, 1), (480, 1, 1, 1), device='cuda', dtype=torch.float32) | |
triton_poi_fused__to_copy_69.run(buf315, buf316, 38400, grid=grid(38400), stream=stream0) | |
del buf315 | |
buf319 = empty_strided((480, ), (1, ), device='cuda', dtype=torch.float32) | |
buf320 = empty_strided((480, ), (1, ), device='cuda', dtype=torch.float32) | |
buf322 = empty_strided((480, ), (1, ), device='cuda', dtype=torch.float32) | |
triton_red_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_70.run(buf301, convolution_116, buf314, convert_element_type_293, cat_32, unsqueeze_402, squeeze_130, buf319, buf320, buf322, 480, 25088, grid=grid(480), stream=stream0) | |
buf321 = empty_strided((128, 480, 14, 14), (94080, 196, 14, 1), device='cuda', dtype=torch.float32) | |
triton_poi_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_71.run(buf301, convolution_116, buf314, convert_element_type_293, cat_32, unsqueeze_402, buf320, squeeze_130, buf319, buf321, 12042240, grid=grid(12042240), stream=stream0) | |
del cat_32 | |
del convert_element_type_293 | |
del convolution_116 | |
del unsqueeze_402 | |
buf323 = empty_strided((128, 120, 14, 14), (23520, 196, 14, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused_convolution_backward_72.run(buf321, squeeze_130, primals_87, buf323, 3010560, grid=grid(3010560), stream=stream0) | |
buf324 = aten.convolution_backward(buf323, getitem_317, convert_element_type_291, [0], [1, 1], [4, 4], [1, 1], False, [0, 0], 120, [True, True, False]) | |
del convert_element_type_291 | |
del getitem_317 | |
buf325 = buf324[0] | |
assert_size_stride(buf325, (128, 120, 14, 14), (23520, 196, 14, 1)) | |
buf326 = buf324[1] | |
assert_size_stride(buf326, (120, 1, 9, 9), (81, 81, 9, 1)) | |
del buf324 | |
buf327 = empty_strided((120, 1, 9, 9), (81, 81, 9, 1), device='cuda', dtype=torch.float32) | |
triton_poi_fused__to_copy_73.run(buf326, buf327, 9720, grid=grid(9720), stream=stream0) | |
del buf326 | |
buf328 = buf323; del buf323 # reuse | |
triton_poi_fused_convolution_backward_74.run(buf321, squeeze_130, primals_87, buf328, 3010560, grid=grid(3010560), stream=stream0) | |
buf329 = aten.convolution_backward(buf328, getitem_312, convert_element_type_290, [0], [1, 1], [3, 3], [1, 1], False, [0, 0], 120, [True, True, False]) | |
del convert_element_type_290 | |
del getitem_312 | |
buf330 = buf329[0] | |
assert_size_stride(buf330, (128, 120, 14, 14), (23520, 196, 14, 1)) | |
buf331 = buf329[1] | |
assert_size_stride(buf331, (120, 1, 7, 7), (49, 49, 7, 1)) | |
del buf329 | |
buf332 = empty_strided((120, 1, 7, 7), (49, 49, 7, 1), device='cuda', dtype=torch.float32) | |
triton_poi_fused__to_copy_75.run(buf331, buf332, 5880, grid=grid(5880), stream=stream0) | |
del buf331 | |
buf333 = buf328; del buf328 # reuse | |
triton_poi_fused_convolution_backward_76.run(buf321, squeeze_130, primals_87, buf333, 3010560, grid=grid(3010560), stream=stream0) | |
buf334 = aten.convolution_backward(buf333, getitem_307, convert_element_type_289, [0], [1, 1], [2, 2], [1, 1], False, [0, 0], 120, [True, True, False]) | |
del convert_element_type_289 | |
del getitem_307 | |
buf335 = buf334[0] | |
assert_size_stride(buf335, (128, 120, 14, 14), (23520, 196, 14, 1)) | |
buf336 = buf334[1] | |
assert_size_stride(buf336, (120, 1, 5, 5), (25, 25, 5, 1)) | |
del buf334 | |
buf337 = empty_strided((120, 1, 5, 5), (25, 25, 5, 1), device='cuda', dtype=torch.float32) | |
triton_poi_fused__to_copy_77.run(buf336, buf337, 3000, grid=grid(3000), stream=stream0) | |
del buf336 | |
buf338 = buf333; del buf333 # reuse | |
triton_poi_fused_convolution_backward_78.run(buf321, squeeze_130, primals_87, buf338, 3010560, grid=grid(3010560), stream=stream0) | |
del primals_87 | |
del squeeze_130 | |
buf339 = aten.convolution_backward(buf338, getitem_302, convert_element_type_288, [0], [1, 1], [1, 1], [1, 1], False, [0, 0], 120, [True, True, False]) | |
del buf338 | |
del convert_element_type_288 | |
del getitem_302 | |
buf340 = buf339[0] | |
assert_size_stride(buf340, (128, 120, 14, 14), (23520, 196, 14, 1)) | |
buf341 = buf339[1] | |
assert_size_stride(buf341, (120, 1, 3, 3), (9, 9, 3, 1)) | |
del buf339 | |
buf342 = empty_strided((120, 1, 3, 3), (9, 9, 3, 1), device='cuda', dtype=torch.float32) | |
triton_poi_fused__to_copy_79.run(buf341, buf342, 1080, grid=grid(1080), stream=stream0) | |
del buf341 | |
buf347 = buf301; del buf301 # reuse | |
buf343 = as_strided(buf347, (128, 120, 14, 14), (94080, 196, 14, 1)) # alias | |
triton_poi_fused_cat_80.run(buf340, buf343, 3010560, grid=grid(3010560), stream=stream0) | |
del buf340 | |
buf344 = as_strided(buf347, (128, 120, 14, 14), (94080, 196, 14, 1), 23520) # alias | |
triton_poi_fused_cat_80.run(buf335, buf344, 3010560, grid=grid(3010560), stream=stream0) | |
del buf335 | |
buf345 = as_strided(buf347, (128, 120, 14, 14), (94080, 196, 14, 1), 47040) # alias | |
triton_poi_fused_cat_80.run(buf330, buf345, 3010560, grid=grid(3010560), stream=stream0) | |
del buf330 | |
buf346 = as_strided(buf347, (128, 120, 14, 14), (94080, 196, 14, 1), 70560) # alias | |
triton_poi_fused_cat_80.run(buf325, buf346, 3010560, grid=grid(3010560), stream=stream0) | |
buf348 = buf320; del buf320 # reuse | |
buf349 = empty_strided((480, ), (1, ), device='cuda', dtype=torch.float32) | |
buf350 = empty_strided((480, ), (1, ), device='cuda', dtype=torch.float32) | |
triton_red_fused__native_batch_norm_legit_functional_mul_native_batch_norm_backward_81.run(buf347, mul_668, cat_31, unsqueeze_414, squeeze_127, buf348, buf349, buf350, 480, 25088, grid=grid(480), stream=stream0) | |
del buf343 | |
del buf344 | |
del buf345 | |
del buf346 | |
buf351 = buf347; del buf347 # reuse | |
triton_poi_fused__native_batch_norm_legit_functional_mul_native_batch_norm_backward_82.run(buf351, mul_668, cat_31, unsqueeze_414, buf349, squeeze_127, buf348, primals_85, 12042240, grid=grid(12042240), stream=stream0) | |
del cat_31 | |
del mul_668 | |
del primals_85 | |
del squeeze_127 | |
del unsqueeze_414 | |
buf352 = aten.convolution_backward(as_strided(buf351, (128, 240, 14, 14), (94080, 196, 14, 1), 47040), getitem_295, convert_element_type_283, [0], [1, 1], [0, 0], [1, 1], False, [0, 0], 1, [True, True, False]) | |
del convert_element_type_283 | |
del getitem_295 | |
buf353 = buf352[0] | |
assert_size_stride(buf353, (128, 80, 14, 14), (15680, 196, 14, 1)) | |
buf354 = buf352[1] | |
assert_size_stride(buf354, (240, 80, 1, 1), (80, 1, 1, 1)) | |
del buf352 | |
buf355 = empty_strided((240, 80, 1, 1), (80, 1, 1, 1), device='cuda', dtype=torch.float32) | |
triton_poi_fused__to_copy_83.run(buf354, buf355, 19200, grid=grid(19200), stream=stream0) | |
del buf354 | |
buf356 = aten.convolution_backward(as_strided(buf351, (128, 240, 14, 14), (94080, 196, 14, 1)), getitem_294, convert_element_type_282, [0], [1, 1], [0, 0], [1, 1], False, [0, 0], 1, [True, True, False]) | |
del convert_element_type_282 | |
del getitem_294 | |
buf357 = buf356[0] | |
assert_size_stride(buf357, (128, 80, 14, 14), (15680, 196, 14, 1)) | |
buf358 = buf356[1] | |
assert_size_stride(buf358, (240, 80, 1, 1), (80, 1, 1, 1)) | |
del buf356 | |
buf359 = empty_strided((240, 80, 1, 1), (80, 1, 1, 1), device='cuda', dtype=torch.float32) | |
triton_poi_fused__to_copy_83.run(buf358, buf359, 19200, grid=grid(19200), stream=stream0) | |
del buf358 | |
buf362 = buf290; del buf290 # reuse | |
buf360 = as_strided(buf362, (128, 80, 14, 14), (31360, 196, 14, 1)) # alias | |
triton_poi_fused_cat_84.run(buf357, buf360, 2007040, grid=grid(2007040), stream=stream0) | |
del buf357 | |
buf361 = as_strided(buf362, (128, 80, 14, 14), (31360, 196, 14, 1), 15680) # alias | |
triton_poi_fused_cat_84.run(buf353, buf361, 2007040, grid=grid(2007040), stream=stream0) | |
del buf353 | |
buf363 = buf287; del buf287 # reuse | |
buf365 = buf285; del buf285 # reuse | |
triton_red_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_85.run(buf282, buf362, cat_30, unsqueeze_426, buf363, buf365, 640, 6272, grid=grid(640), stream=stream0) | |
del buf360 | |
del buf361 | |
buf364 = buf288; del buf288 # reuse | |
triton_per_fused_native_batch_norm_backward_61.run(buf363, buf364, 160, 4, grid=grid(160), stream=stream0) | |
buf366 = empty_strided((160, ), (1, ), device='cuda', dtype=torch.float32) | |
buf367 = empty_strided((160, ), (1, ), device='cuda', dtype=torch.float32) | |
triton_per_fused__native_batch_norm_legit_functional_native_batch_norm_backward_62.run(buf365, squeeze_124, buf366, buf367, 160, 4, grid=grid(160), stream=stream0) | |
buf368 = empty_strided((128, 160, 14, 14), (31360, 196, 14, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_86.run(buf282, buf362, cat_30, unsqueeze_426, buf366, squeeze_124, buf364, primals_83, buf368, 4014080, grid=grid(4014080), stream=stream0) | |
del cat_30 | |
del primals_83 | |
del squeeze_124 | |
del unsqueeze_426 | |
buf369 = aten.convolution_backward(as_strided(buf368, (128, 80, 14, 14), (31360, 196, 14, 1), 15680), getitem_291, convert_element_type_279, [0], [1, 1], [0, 0], [1, 1], False, [0, 0], 1, [True, True, False]) | |
del convert_element_type_279 | |
del getitem_291 | |
buf370 = buf369[0] | |
assert_size_stride(buf370, (128, 240, 14, 14), (47040, 196, 14, 1)) | |
buf371 = buf369[1] | |
assert_size_stride(buf371, (80, 240, 1, 1), (240, 1, 1, 1)) | |
del buf369 | |
buf372 = empty_strided((80, 240, 1, 1), (240, 1, 1, 1), device='cuda', dtype=torch.float32) | |
triton_poi_fused__to_copy_64.run(buf371, buf372, 19200, grid=grid(19200), stream=stream0) | |
del buf371 | |
buf373 = aten.convolution_backward(as_strided(buf368, (128, 80, 14, 14), (31360, 196, 14, 1)), getitem_290, convert_element_type_278, [0], [1, 1], [0, 0], [1, 1], False, [0, 0], 1, [True, True, False]) | |
del convert_element_type_278 | |
del getitem_290 | |
buf374 = buf373[0] | |
assert_size_stride(buf374, (128, 240, 14, 14), (47040, 196, 14, 1)) | |
buf375 = buf373[1] | |
assert_size_stride(buf375, (80, 240, 1, 1), (240, 1, 1, 1)) | |
del buf373 | |
buf376 = empty_strided((80, 240, 1, 1), (240, 1, 1, 1), device='cuda', dtype=torch.float32) | |
triton_poi_fused__to_copy_64.run(buf375, buf376, 19200, grid=grid(19200), stream=stream0) | |
del buf375 | |
buf379 = buf351; del buf351 # reuse | |
buf377 = as_strided(buf379, (128, 240, 14, 14), (94080, 196, 14, 1)) # alias | |
triton_poi_fused_cat_65.run(buf374, buf377, 6021120, grid=grid(6021120), stream=stream0) | |
del buf374 | |
buf378 = as_strided(buf379, (128, 240, 14, 14), (94080, 196, 14, 1), 47040) # alias | |
triton_poi_fused_cat_65.run(buf370, buf378, 6021120, grid=grid(6021120), stream=stream0) | |
del buf370 | |
buf380 = as_strided(buf314, (128, 480, 1, 1), (480, 1, 61440, 61440)); del buf314 # reuse | |
buf381 = as_strided(buf380, (128, 480, 1, 1), (480, 1, 1, 1)); del buf380 # reuse | |
triton_per_fused_mul_sigmoid_sigmoid_backward_silu_sum_66.run(buf381, buf379, convert_element_type_269, convolution_106, 61440, 196, grid=grid(61440), stream=stream0) | |
del buf377 | |
del buf378 | |
buf387 = buf349; del buf349 # reuse | |
triton_per_fused__to_copy_convolution_backward_67.run(buf381, buf387, 480, 128, grid=grid(480), stream=stream0) | |
buf383 = aten.convolution_backward(buf381, convert_element_type_275, convert_element_type_277, [480], [1, 1], [0, 0], [1, 1], False, [0, 0], 1, [True, True, False]) | |
del buf381 | |
del convert_element_type_275 | |
del convert_element_type_277 | |
buf384 = buf383[0] | |
assert_size_stride(buf384, (128, 80, 1, 1), (80, 1, 1, 1)) | |
buf385 = buf383[1] | |
assert_size_stride(buf385, (480, 80, 1, 1), (80, 1, 1, 1)) | |
del buf383 | |
buf386 = empty_strided((480, 80, 1, 1), (80, 1, 1, 1), device='cuda', dtype=torch.float32) | |
triton_poi_fused__to_copy_68.run(buf385, buf386, 38400, grid=grid(38400), stream=stream0) | |
del buf385 | |
buf389 = buf384; del buf384 # reuse | |
triton_poi_fused_add_clone_fill_mul_sigmoid_sub_43.run(buf389, convolution_105, 10240, grid=grid(10240), stream=stream0) | |
del convolution_105 | |
buf395 = empty_strided((80, ), (1, ), device='cuda', dtype=torch.float32) | |
triton_per_fused__to_copy_convolution_backward_44.run(buf389, buf395, 80, 128, grid=grid(80), stream=stream0) | |
buf391 = aten.convolution_backward(buf389, mean_10, convert_element_type_273, [80], [1, 1], [0, 0], [1, 1], False, [0, 0], 1, [True, True, False]) | |
del buf389 | |
del convert_element_type_273 | |
del mean_10 | |
buf392 = buf391[0] | |
assert_size_stride(buf392, (128, 480, 1, 1), (480, 1, 1, 1)) | |
buf393 = buf391[1] | |
assert_size_stride(buf393, (80, 480, 1, 1), (480, 1, 1, 1)) | |
del buf391 | |
buf394 = empty_strided((80, 480, 1, 1), (480, 1, 1, 1), device='cuda', dtype=torch.float32) | |
triton_poi_fused__to_copy_69.run(buf393, buf394, 38400, grid=grid(38400), stream=stream0) | |
del buf393 | |
buf397 = empty_strided((480, ), (1, ), device='cuda', dtype=torch.float32) | |
buf398 = empty_strided((480, ), (1, ), device='cuda', dtype=torch.float32) | |
buf400 = empty_strided((480, ), (1, ), device='cuda', dtype=torch.float32) | |
triton_red_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_70.run(buf379, convolution_106, buf392, convert_element_type_269, cat_29, unsqueeze_438, squeeze_121, buf397, buf398, buf400, 480, 25088, grid=grid(480), stream=stream0) | |
buf399 = buf321; del buf321 # reuse | |
triton_poi_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_71.run(buf379, convolution_106, buf392, convert_element_type_269, cat_29, unsqueeze_438, buf398, squeeze_121, buf397, buf399, 12042240, grid=grid(12042240), stream=stream0) | |
del cat_29 | |
del convert_element_type_269 | |
del convolution_106 | |
del unsqueeze_438 | |
buf401 = buf325; del buf325 # reuse | |
triton_poi_fused_convolution_backward_72.run(buf399, squeeze_121, primals_81, buf401, 3010560, grid=grid(3010560), stream=stream0) | |
buf402 = aten.convolution_backward(buf401, getitem_287, convert_element_type_267, [0], [1, 1], [4, 4], [1, 1], False, [0, 0], 120, [True, True, False]) | |
del convert_element_type_267 | |
del getitem_287 | |
buf403 = buf402[0] | |
assert_size_stride(buf403, (128, 120, 14, 14), (23520, 196, 14, 1)) | |
buf404 = buf402[1] | |
assert_size_stride(buf404, (120, 1, 9, 9), (81, 81, 9, 1)) | |
del buf402 | |
buf405 = empty_strided((120, 1, 9, 9), (81, 81, 9, 1), device='cuda', dtype=torch.float32) | |
triton_poi_fused__to_copy_73.run(buf404, buf405, 9720, grid=grid(9720), stream=stream0) | |
del buf404 | |
buf406 = buf401; del buf401 # reuse | |
triton_poi_fused_convolution_backward_74.run(buf399, squeeze_121, primals_81, buf406, 3010560, grid=grid(3010560), stream=stream0) | |
buf407 = aten.convolution_backward(buf406, getitem_282, convert_element_type_266, [0], [1, 1], [3, 3], [1, 1], False, [0, 0], 120, [True, True, False]) | |
del convert_element_type_266 | |
del getitem_282 | |
buf408 = buf407[0] | |
assert_size_stride(buf408, (128, 120, 14, 14), (23520, 196, 14, 1)) | |
buf409 = buf407[1] | |
assert_size_stride(buf409, (120, 1, 7, 7), (49, 49, 7, 1)) | |
del buf407 | |
buf410 = empty_strided((120, 1, 7, 7), (49, 49, 7, 1), device='cuda', dtype=torch.float32) | |
triton_poi_fused__to_copy_75.run(buf409, buf410, 5880, grid=grid(5880), stream=stream0) | |
del buf409 | |
buf411 = buf406; del buf406 # reuse | |
triton_poi_fused_convolution_backward_76.run(buf399, squeeze_121, primals_81, buf411, 3010560, grid=grid(3010560), stream=stream0) | |
buf412 = aten.convolution_backward(buf411, getitem_277, convert_element_type_265, [0], [1, 1], [2, 2], [1, 1], False, [0, 0], 120, [True, True, False]) | |
del convert_element_type_265 | |
del getitem_277 | |
buf413 = buf412[0] | |
assert_size_stride(buf413, (128, 120, 14, 14), (23520, 196, 14, 1)) | |
buf414 = buf412[1] | |
assert_size_stride(buf414, (120, 1, 5, 5), (25, 25, 5, 1)) | |
del buf412 | |
buf415 = empty_strided((120, 1, 5, 5), (25, 25, 5, 1), device='cuda', dtype=torch.float32) | |
triton_poi_fused__to_copy_77.run(buf414, buf415, 3000, grid=grid(3000), stream=stream0) | |
del buf414 | |
buf416 = buf411; del buf411 # reuse | |
triton_poi_fused_convolution_backward_78.run(buf399, squeeze_121, primals_81, buf416, 3010560, grid=grid(3010560), stream=stream0) | |
del primals_81 | |
del squeeze_121 | |
buf417 = aten.convolution_backward(buf416, getitem_272, convert_element_type_264, [0], [1, 1], [1, 1], [1, 1], False, [0, 0], 120, [True, True, False]) | |
del buf416 | |
del convert_element_type_264 | |
del getitem_272 | |
buf418 = buf417[0] | |
assert_size_stride(buf418, (128, 120, 14, 14), (23520, 196, 14, 1)) | |
buf419 = buf417[1] | |
assert_size_stride(buf419, (120, 1, 3, 3), (9, 9, 3, 1)) | |
del buf417 | |
buf420 = empty_strided((120, 1, 3, 3), (9, 9, 3, 1), device='cuda', dtype=torch.float32) | |
triton_poi_fused__to_copy_79.run(buf419, buf420, 1080, grid=grid(1080), stream=stream0) | |
del buf419 | |
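# The four 120-channel grad_input chunks are reassembled into a single | |
# (128, 480, 14, 14) gradient without a dedicated concat kernel: each as_strided | |
# call aliases buf425 at channel offsets 0/23520/47040/70560, and | |
# triton_poi_fused_cat_80 copies each chunk into its slot in place. | |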
buf425 = buf379; del buf379 # reuse | |
buf421 = as_strided(buf425, (128, 120, 14, 14), (94080, 196, 14, 1)) # alias | |
triton_poi_fused_cat_80.run(buf418, buf421, 3010560, grid=grid(3010560), stream=stream0) | |
del buf418 | |
buf422 = as_strided(buf425, (128, 120, 14, 14), (94080, 196, 14, 1), 23520) # alias | |
triton_poi_fused_cat_80.run(buf413, buf422, 3010560, grid=grid(3010560), stream=stream0) | |
del buf413 | |
buf423 = as_strided(buf425, (128, 120, 14, 14), (94080, 196, 14, 1), 47040) # alias | |
triton_poi_fused_cat_80.run(buf408, buf423, 3010560, grid=grid(3010560), stream=stream0) | |
del buf408 | |
buf424 = as_strided(buf425, (128, 120, 14, 14), (94080, 196, 14, 1), 70560) # alias | |
triton_poi_fused_cat_80.run(buf403, buf424, 3010560, grid=grid(3010560), stream=stream0) | |
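# Batch-norm backward over the concatenated activation: the reduction kernel | |
# computes the per-channel statistics (buf426/buf427/buf428, one value per 480 | |
# channels over 25088 = 128*14*14 elements), then the pointwise kernel forms the | |
# input gradient in place, with buf429 reusing buf425's storage. | |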
buf426 = buf398; del buf398 # reuse | |
buf427 = empty_strided((480, ), (1, ), device='cuda', dtype=torch.float32) | |
buf428 = empty_strided((480, ), (1, ), device='cuda', dtype=torch.float32) | |
triton_red_fused__native_batch_norm_legit_functional_mul_native_batch_norm_backward_81.run(buf425, mul_708, cat_28, unsqueeze_450, squeeze_118, buf426, buf427, buf428, 480, 25088, grid=grid(480), stream=stream0) | |
del buf421 | |
del buf422 | |
del buf423 | |
del buf424 | |
buf429 = buf425; del buf425 # reuse | |
triton_poi_fused__native_batch_norm_legit_functional_mul_native_batch_norm_backward_82.run(buf429, mul_708, cat_28, unsqueeze_450, buf427, squeeze_118, buf426, primals_79, 12042240, grid=grid(12042240), stream=stream0) | |
del cat_28 | |
del mul_708 | |
del primals_79 | |
del squeeze_118 | |
del unsqueeze_450 | |
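# The 480-channel gradient is split back into two 240-channel halves via | |
# as_strided views (channel offsets 47040 and 0); each half goes through the | |
# backward of what looks like a 1x1 expansion conv (weight (240, 80, 1, 1), | |
# i.e. 80 in / 240 out per half), yielding two 80-channel grad_inputs. | |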
buf430 = aten.convolution_backward(as_strided(buf429, (128, 240, 14, 14), (94080, 196, 14, 1), 47040), getitem_265, convert_element_type_259, [0], [1, 1], [0, 0], [1, 1], False, [0, 0], 1, [True, True, False]) | |
del convert_element_type_259 | |
del getitem_265 | |
buf431 = buf430[0] | |
assert_size_stride(buf431, (128, 80, 14, 14), (15680, 196, 14, 1)) | |
buf432 = buf430[1] | |
assert_size_stride(buf432, (240, 80, 1, 1), (80, 1, 1, 1)) | |
del buf430 | |
buf433 = empty_strided((240, 80, 1, 1), (80, 1, 1, 1), device='cuda', dtype=torch.float32) | |
triton_poi_fused__to_copy_83.run(buf432, buf433, 19200, grid=grid(19200), stream=stream0) | |
del buf432 | |
buf434 = aten.convolution_backward(as_strided(buf429, (128, 240, 14, 14), (94080, 196, 14, 1)), getitem_264, convert_element_type_258, [0], [1, 1], [0, 0], [1, 1], False, [0, 0], 1, [True, True, False]) | |
del convert_element_type_258 | |
del getitem_264 | |
buf435 = buf434[0] | |
assert_size_stride(buf435, (128, 80, 14, 14), (15680, 196, 14, 1)) | |
buf436 = buf434[1] | |
assert_size_stride(buf436, (240, 80, 1, 1), (80, 1, 1, 1)) | |
del buf434 | |
buf437 = empty_strided((240, 80, 1, 1), (80, 1, 1, 1), device='cuda', dtype=torch.float32) | |
triton_poi_fused__to_copy_83.run(buf436, buf437, 19200, grid=grid(19200), stream=stream0) | |
del buf436 | |
buf440 = buf368; del buf368 # reuse | |
buf438 = as_strided(buf440, (128, 80, 14, 14), (31360, 196, 14, 1)) # alias | |
triton_poi_fused_cat_84.run(buf435, buf438, 2007040, grid=grid(2007040), stream=stream0) | |
del buf435 | |
buf439 = as_strided(buf440, (128, 80, 14, 14), (31360, 196, 14, 1), 15680) # alias | |
triton_poi_fused_cat_84.run(buf431, buf439, 2007040, grid=grid(2007040), stream=stream0) | |
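# buf282, buf362 and buf440 are summed inside the reduction kernel below, which | |
# looks like the accumulated gradient where several skip connections meet a | |
# 160-channel batch norm; buf441/buf443 hold 4 partial sums per channel | |
# (640 = 160*4) that the two _per_ kernels then collapse to the final (160,) stats. | |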
buf441 = buf365; del buf365 # reuse | |
buf443 = buf363; del buf363 # reuse | |
triton_red_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_87.run(buf282, buf362, buf440, cat_27, unsqueeze_462, buf441, buf443, 640, 6272, grid=grid(640), stream=stream0) | |
del buf438 | |
del buf439 | |
buf442 = buf366; del buf366 # reuse | |
triton_per_fused_native_batch_norm_backward_61.run(buf441, buf442, 160, 4, grid=grid(160), stream=stream0) | |
buf444 = empty_strided((160, ), (1, ), device='cuda', dtype=torch.float32) | |
buf446 = empty_strided((160, ), (1, ), device='cuda', dtype=torch.float32) | |
triton_per_fused__native_batch_norm_legit_functional_native_batch_norm_backward_62.run(buf443, squeeze_115, buf444, buf446, 160, 4, grid=grid(160), stream=stream0) | |
buf445 = empty_strided((128, 160, 14, 14), (31360, 196, 14, 1), device='cuda', dtype=torch.float32) | |
triton_poi_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_88.run(buf282, buf362, buf440, cat_27, unsqueeze_462, buf444, squeeze_115, buf442, primals_77, buf445, 4014080, grid=grid(4014080), stream=stream0) | |
del cat_27 | |
del primals_77 | |
del squeeze_115 | |
del unsqueeze_462 | |
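# buf445's 160 channels are split into two 80-channel grad_output halves | |
# (2007040 = 128*80*14*14) and fed through the backward of the two 1x1 projection | |
# convs (weight (80, 240, 1, 1), i.e. 240 in / 80 out per half); the resulting | |
# 240-channel grad_inputs are concatenated into buf459 right after. | |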
buf447 = buf431; del buf431 # reuse | |
triton_poi_fused_convolution_backward_89.run(buf445, buf447, 2007040, grid=grid(2007040), stream=stream0) | |
buf448 = aten.convolution_backward(buf447, getitem_261, convert_element_type_255, [0], [1, 1], [0, 0], [1, 1], False, [0, 0], 1, [True, True, False]) | |
del convert_element_type_255 | |
del getitem_261 | |
buf449 = buf448[0] | |
assert_size_stride(buf449, (128, 240, 14, 14), (47040, 196, 14, 1)) | |
buf450 = buf448[1] | |
assert_size_stride(buf450, (80, 240, 1, 1), (240, 1, 1, 1)) | |
del buf448 | |
buf451 = empty_strided((80, 240, 1, 1), (240, 1, 1, 1), device='cuda', dtype=torch.float32) | |
triton_poi_fused__to_copy_64.run(buf450, buf451, 19200, grid=grid(19200), stream=stream0) | |
del buf450 | |
buf452 = buf447; del buf447 # reuse | |
triton_poi_fused_convolution_backward_90.run(buf445, buf452, 2007040, grid=grid(2007040), stream=stream0) | |
del buf445 | |
buf453 = aten.convolution_backward(buf452, getitem_260, convert_element_type_254, [0], [1, 1], [0, 0], [1, 1], False, [0, 0], 1, [True, True, False]) | |
del buf452 | |
del convert_element_type_254 | |
del getitem_260 | |
buf454 = buf453[0] | |
assert_size_stride(buf454, (128, 240, 14, 14), (47040, 196, 14, 1)) | |
buf455 = buf453[1] | |
assert_size_stride(buf455, (80, 240, 1, 1), (240, 1, 1, 1)) | |
del buf453 | |
buf456 = empty_strided((80, 240, 1, 1), (240, 1, 1, 1), device='cuda', dtype=torch.float32) | |
triton_poi_fused__to_copy_64.run(buf455, buf456, 19200, grid=grid(19200), stream=stream0) | |
del buf455 | |
buf459 = buf429; del buf429 # reuse | |
buf457 = as_strided(buf459, (128, 240, 14, 14), (94080, 196, 14, 1)) # alias | |
triton_poi_fused_cat_65.run(buf454, buf457, 6021120, grid=grid(6021120), stream=stream0) | |
del buf454 | |
buf458 = as_strided(buf459, (128, 240, 14, 14), (94080, 196, 14, 1), 47040) # alias | |
triton_poi_fused_cat_65.run(buf449, buf458, 6021120, grid=grid(6021120), stream=stream0) | |
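# Squeeze-and-excitation backward: the fused _per_ kernel collapses the 14x14 | |
# spatial gradient into a (128, 480, 1, 1) tensor (61440 = 128*480) while applying | |
# the sigmoid/SiLU backward; the result then flows through the two 1x1 SE convs | |
# (apparently 480 -> 80 squeeze and 80 -> 480 excite, traversed in reverse) and | |
# the mean_9 backward. | |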
buf460 = as_strided(buf392, (128, 480, 1, 1), (480, 1, 61440, 61440)); del buf392 # reuse | |
buf461 = as_strided(buf460, (128, 480, 1, 1), (480, 1, 1, 1)); del buf460 # reuse | |
triton_per_fused_mul_sigmoid_sigmoid_backward_silu_sum_66.run(buf461, buf459, convert_element_type_245, convolution_96, 61440, 196, grid=grid(61440), stream=stream0) | |
del buf457 | |
del buf458 | |
buf467 = buf427; del buf427 # reuse | |
triton_per_fused__to_copy_convolution_backward_67.run(buf461, buf467, 480, 128, grid=grid(480), stream=stream0) | |
buf463 = aten.convolution_backward(buf461, convert_element_type_251, convert_element_type_253, [480], [1, 1], [0, 0], [1, 1], False, [0, 0], 1, [True, True, False]) | |
del buf461 | |
del convert_element_type_251 | |
del convert_element_type_253 | |
buf464 = buf463[0] | |
assert_size_stride(buf464, (128, 80, 1, 1), (80, 1, 1, 1)) | |
buf465 = buf463[1] | |
assert_size_stride(buf465, (480, 80, 1, 1), (80, 1, 1, 1)) | |
del buf463 | |
buf466 = empty_strided((480, 80, 1, 1), (80, 1, 1, 1), device='cuda', dtype=torch.float32) | |
triton_poi_fused__to_copy_68.run(buf465, buf466, 38400, grid=grid(38400), stream=stream0) | |
del buf465 | |
buf469 = buf464; del buf464 # reuse | |
triton_poi_fused_add_clone_fill_mul_sigmoid_sub_43.run(buf469, convolution_95, 10240, grid=grid(10240), stream=stream0) | |
del convolution_95 | |
buf475 = empty_strided((80, ), (1, ), device='cuda', dtype=torch.float32) | |
triton_per_fused__to_copy_convolution_backward_44.run(buf469, buf475, 80, 128, grid=grid(80), stream=stream0) | |
buf471 = aten.convolution_backward(buf469, mean_9, convert_element_type_249, [80], [1, 1], [0, 0], [1, 1], False, [0, 0], 1, [True, True, False]) | |
del buf469 | |
del convert_element_type_249 | |
del mean_9 | |
buf472 = buf471[0] | |
assert_size_stride(buf472, (128, 480, 1, 1), (480, 1, 1, 1)) | |
buf473 = buf471[1] | |
assert_size_stride(buf473, (80, 480, 1, 1), (480, 1, 1, 1)) | |
del buf471 | |
buf474 = empty_strided((80, 480, 1, 1), (480, 1, 1, 1), device='cuda', dtype=torch.float32) | |
triton_poi_fused__to_copy_69.run(buf473, buf474, 38400, grid=grid(38400), stream=stream0) | |
del buf473 | |
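# From here the same backward pattern repeats for what appears to be the previous | |
# block of identical structure: BN backward over cat_26, four mixed-kernel | |
# depthwise conv backwards, re-concatenation, BN backward over cat_25, and the | |
# split 1x1 expansion convs. | |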
buf477 = empty_strided((480, ), (1, ), device='cuda', dtype=torch.float32) | |
buf478 = empty_strided((480, ), (1, ), device='cuda', dtype=torch.float32) | |
buf480 = empty_strided((480, ), (1, ), device='cuda', dtype=torch.float32) | |
triton_red_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_70.run(buf459, convolution_96, buf472, convert_element_type_245, cat_26, unsqueeze_474, squeeze_112, buf477, buf478, buf480, 480, 25088, grid=grid(480), stream=stream0) | |
buf479 = buf399; del buf399 # reuse | |
triton_poi_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_71.run(buf459, convolution_96, buf472, convert_element_type_245, cat_26, unsqueeze_474, buf478, squeeze_112, buf477, buf479, 12042240, grid=grid(12042240), stream=stream0) | |
del buf472 | |
del cat_26 | |
del convert_element_type_245 | |
del convolution_96 | |
del unsqueeze_474 | |
buf481 = buf403; del buf403 # reuse | |
triton_poi_fused_convolution_backward_72.run(buf479, squeeze_112, primals_75, buf481, 3010560, grid=grid(3010560), stream=stream0) | |
buf482 = aten.convolution_backward(buf481, getitem_257, convert_element_type_243, [0], [1, 1], [4, 4], [1, 1], False, [0, 0], 120, [True, True, False]) | |
del convert_element_type_243 | |
del getitem_257 | |
buf483 = buf482[0] | |
assert_size_stride(buf483, (128, 120, 14, 14), (23520, 196, 14, 1)) | |
buf484 = buf482[1] | |
assert_size_stride(buf484, (120, 1, 9, 9), (81, 81, 9, 1)) | |
del buf482 | |
buf485 = empty_strided((120, 1, 9, 9), (81, 81, 9, 1), device='cuda', dtype=torch.float32) | |
triton_poi_fused__to_copy_73.run(buf484, buf485, 9720, grid=grid(9720), stream=stream0) | |
del buf484 | |
buf486 = buf481; del buf481 # reuse | |
triton_poi_fused_convolution_backward_74.run(buf479, squeeze_112, primals_75, buf486, 3010560, grid=grid(3010560), stream=stream0) | |
buf487 = aten.convolution_backward(buf486, getitem_252, convert_element_type_242, [0], [1, 1], [3, 3], [1, 1], False, [0, 0], 120, [True, True, False]) | |
del convert_element_type_242 | |
del getitem_252 | |
buf488 = buf487[0] | |
assert_size_stride(buf488, (128, 120, 14, 14), (23520, 196, 14, 1)) | |
buf489 = buf487[1] | |
assert_size_stride(buf489, (120, 1, 7, 7), (49, 49, 7, 1)) | |
del buf487 | |
buf490 = empty_strided((120, 1, 7, 7), (49, 49, 7, 1), device='cuda', dtype=torch.float32) | |
triton_poi_fused__to_copy_75.run(buf489, buf490, 5880, grid=grid(5880), stream=stream0) | |
del buf489 | |
buf491 = buf486; del buf486 # reuse | |
triton_poi_fused_convolution_backward_76.run(buf479, squeeze_112, primals_75, buf491, 3010560, grid=grid(3010560), stream=stream0) | |
buf492 = aten.convolution_backward(buf491, getitem_247, convert_element_type_241, [0], [1, 1], [2, 2], [1, 1], False, [0, 0], 120, [True, True, False]) | |
del convert_element_type_241 | |
del getitem_247 | |
buf493 = buf492[0] | |
assert_size_stride(buf493, (128, 120, 14, 14), (23520, 196, 14, 1)) | |
buf494 = buf492[1] | |
assert_size_stride(buf494, (120, 1, 5, 5), (25, 25, 5, 1)) | |
del buf492 | |
buf495 = empty_strided((120, 1, 5, 5), (25, 25, 5, 1), device='cuda', dtype=torch.float32) | |
triton_poi_fused__to_copy_77.run(buf494, buf495, 3000, grid=grid(3000), stream=stream0) | |
del buf494 | |
buf496 = buf491; del buf491 # reuse | |
triton_poi_fused_convolution_backward_78.run(buf479, squeeze_112, primals_75, buf496, 3010560, grid=grid(3010560), stream=stream0) | |
del buf479 | |
del primals_75 | |
del squeeze_112 | |
buf497 = aten.convolution_backward(buf496, getitem_242, convert_element_type_240, [0], [1, 1], [1, 1], [1, 1], False, [0, 0], 120, [True, True, False]) | |
del buf496 | |
del convert_element_type_240 | |
del getitem_242 | |
buf498 = buf497[0] | |
assert_size_stride(buf498, (128, 120, 14, 14), (23520, 196, 14, 1)) | |
buf499 = buf497[1] | |
assert_size_stride(buf499, (120, 1, 3, 3), (9, 9, 3, 1)) | |
del buf497 | |
buf500 = empty_strided((120, 1, 3, 3), (9, 9, 3, 1), device='cuda', dtype=torch.float32) | |
triton_poi_fused__to_copy_79.run(buf499, buf500, 1080, grid=grid(1080), stream=stream0) | |
del buf499 | |
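# Second reassembly of four 120-channel chunks into a 480-channel gradient, | |
# again writing through channel-offset aliases of buf505. | |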
buf505 = buf459; del buf459 # reuse | |
buf501 = as_strided(buf505, (128, 120, 14, 14), (94080, 196, 14, 1)) # alias | |
triton_poi_fused_cat_80.run(buf498, buf501, 3010560, grid=grid(3010560), stream=stream0) | |
del buf498 | |
buf502 = as_strided(buf505, (128, 120, 14, 14), (94080, 196, 14, 1), 23520) # alias | |
triton_poi_fused_cat_80.run(buf493, buf502, 3010560, grid=grid(3010560), stream=stream0) | |
del buf493 | |
buf503 = as_strided(buf505, (128, 120, 14, 14), (94080, 196, 14, 1), 47040) # alias | |
triton_poi_fused_cat_80.run(buf488, buf503, 3010560, grid=grid(3010560), stream=stream0) | |
del buf488 | |
buf504 = as_strided(buf505, (128, 120, 14, 14), (94080, 196, 14, 1), 70560) # alias | |
triton_poi_fused_cat_80.run(buf483, buf504, 3010560, grid=grid(3010560), stream=stream0) | |
del buf483 | |
buf506 = buf478; del buf478 # reuse | |
buf507 = empty_strided((480, ), (1, ), device='cuda', dtype=torch.float32) | |
buf508 = empty_strided((480, ), (1, ), device='cuda', dtype=torch.float32) | |
triton_red_fused__native_batch_norm_legit_functional_mul_native_batch_norm_backward_81.run(buf505, mul_748, cat_25, unsqueeze_486, squeeze_109, buf506, buf507, buf508, 480, 25088, grid=grid(480), stream=stream0) | |
del buf501 | |
del buf502 | |
del buf503 | |
del buf504 | |
buf509 = buf505; del buf505 # reuse | |
triton_poi_fused__native_batch_norm_legit_functional_mul_native_batch_norm_backward_82.run(buf509, mul_748, cat_25, unsqueeze_486, buf507, squeeze_109, buf506, primals_73, 12042240, grid=grid(12042240), stream=stream0) | |
del cat_25 | |
del mul_748 | |
del primals_73 | |
del squeeze_109 | |
del unsqueeze_486 | |
buf510 = aten.convolution_backward(as_strided(buf509, (128, 240, 14, 14), (94080, 196, 14, 1), 47040), getitem_235, convert_element_type_235, [0], [1, 1], [0, 0], [1, 1], False, [0, 0], 1, [True, True, False]) | |
del convert_element_type_235 | |
del getitem_235 | |
buf511 = buf510[0] | |
assert_size_stride(buf511, (128, 80, 14, 14), (15680, 196, 14, 1)) | |
buf512 = buf510[1] | |
assert_size_stride(buf512, (240, 80, 1, 1), (80, 1, 1, 1)) | |
del buf510 | |
buf513 = empty_strided((240, 80, 1, 1), (80, 1, 1, 1), device='cuda', dtype=torch.float32) | |
triton_poi_fused__to_copy_83.run(buf512, buf513, 19200, grid=grid(19200), stream=stream0) | |
del buf512 | |
buf514 = aten.convolution_backward(as_strided(buf509, (128, 240, 14, 14), (94080, 196, 14, 1)), getitem_234, convert_element_type_234, [0], [1, 1], [0, 0], [1, 1], False, [0, 0], 1, [True, True, False]) | |
del buf509 | |
del convert_element_type_234 | |
del getitem_234 | |
buf515 = buf514[0] | |
assert_size_stride(buf515, (128, 80, 14, 14), (15680, 196, 14, 1)) | |
buf516 = buf514[1] | |
assert_size_stride(buf516, (240, 80, 1, 1), (80, 1, 1, 1)) | |
del buf514 | |
buf517 = empty_strided((240, 80, 1, 1), (80, 1, 1, 1), device='cuda', dtype=torch.float32) | |
triton_poi_fused__to_copy_83.run(buf516, buf517, 19200, grid=grid(19200), stream=stream0) | |
del buf516 | |
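# As before, the two 80-channel grad_input halves are concatenated through | |
# aliased views, this time into a freshly allocated (128, 160, 14, 14) fp16 buffer. | |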
buf520 = empty_strided((128, 160, 14, 14), (31360, 196, 14, 1), device='cuda', dtype=torch.float16) | |
buf518 = as_strided(buf520, (128, 80, 14, 14), (31360, 196, 14, 1)) # alias | |
triton_poi_fused_cat_84.run(buf515, buf518, 2007040, grid=grid(2007040), stream=stream0) | |
del buf515 | |
buf519 = as_strided(buf520, (128, 80, 14, 14), (31360, 196, 14, 1), 15680) # alias | |
# (the gist is truncated at this point; the rest of the generated wrapper is not shown) | |