Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Select an option

  • Save shunting314/48efc83b12ec3ead950052e4a0220b10 to your computer and use it in GitHub Desktop.

Select an option

Save shunting314/48efc83b12ec3ead950052e4a0220b10 to your computer and use it in GitHub Desktop.
This file has been truncated, but you can view the full file.
# --- TorchInductor-generated wrapper preamble ---
# NOTE(review): this file is machine-generated output from torch._inductor
# (saved from a gist); it is not meant to be hand-edited.
from ctypes import c_void_p, c_long
import torch
import math
import random
import os
import tempfile
from torch._inductor.utils import maybe_profile
from torch import empty_strided, as_strided, device
from torch._inductor.codecache import AsyncCompile
from torch._inductor.select_algorithm import extern_kernels
# Shorthand for calling ATen ops directly from generated code.
aten = torch.ops.aten
# Runtime guard helper that validates tensor sizes/strides.
assert_size_stride = torch._C._dynamo.guards.assert_size_stride
# Compiles the Triton kernel source strings below asynchronously/lazily.
async_compile = AsyncCompile()
import triton
import triton.language as tl
from torch._inductor.triton_heuristics import grid, start_graph, end_graph
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
# kernel path: /tmp/torchinductor_shunting/qd/cqdbopslmbarnc23s33ybyghfqf2olis2okrueqdtwzk66npkoci.py
# Original ATen: aten._to_copy, aten.sum
# aten._to_copy => convert_element_type_403
# aten.sum => sum_1
triton_red_fused__to_copy_sum_0 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@reduction(
size_hints=[1024, 128],
reduction_hint=ReductionHint.OUTER,
filename=__file__,
meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32', 3: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 3), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__to_copy_sum_0(in_ptr0, out_ptr1, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
xnumel = 1000
rnumel = 128
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
xmask = xindex < xnumel
rbase = tl.arange(0, RBLOCK)[None, :]
x0 = xindex
_tmp1 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
for roffset in range(0, rnumel, RBLOCK):
rindex = roffset + rbase
rmask = rindex < rnumel
r1 = rindex
tmp0 = tl.load(in_ptr0 + (x0 + (1000*r1)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
_tmp1 = tl.where(rmask & xmask, _tmp1 + tmp0, _tmp1)
tmp1 = tl.sum(_tmp1, 1)[:, None]
tmp2 = tmp1.to(tl.float32)
tl.store(out_ptr1 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp2, xmask)
def get_args():
arg_0 = rand_strided((128, 1000), (1000, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((1000,), (1,), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_red_fused__to_copy_sum_0.run(*args, 1000, 128, grid=grid(1000), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_red_fused__to_copy_sum_0.benchmark_all_configs(*args, 1000, 128, grid=grid(1000))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/v4/cv4lm4gdpfxwjwbfwkrkjjepglxb33ctw5byvwc2ujd3ynyqgfob.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_402
triton_poi_fused__to_copy_1 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[2097152], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_1(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 1536000
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), None).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, None)
def get_args():
arg_0 = rand_strided((1000, 1536), (1536, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((1000, 1536), (1536, 1), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_1.run(*args, 1536000, grid=grid(1536000), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_1.benchmark_all_configs(*args, 1536000, grid=grid(1536000))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/gw/cgw4eilm5jaevvb4jlwkfuyolkwuloinsrcuxsoiruu5pucw2qoo.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.div, aten.native_batch_norm_backward, aten.threshold_backward
# aten._native_batch_norm_legit_functional => convert_element_type_398
# aten.div => div
# aten.native_batch_norm_backward => convert_element_type_404, mul_470, mul_478, sub_58, sum_2, sum_3
# aten.threshold_backward => scalar_tensor, where
triton_red_fused__native_batch_norm_legit_functional_div_native_batch_norm_backward_threshold_backward_2 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@reduction(
size_hints=[2048, 8192],
reduction_hint=ReductionHint.INNER,
filename=__file__,
meta={'signature': {0: '*i1', 1: '*fp16', 2: '*fp16', 3: '*fp32', 4: '*fp32', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: 'i32', 9: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_div_native_batch_norm_backward_threshold_backward_2(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, out_ptr0, out_ptr1, out_ptr2, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
xnumel = 1536
rnumel = 6272
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
xmask = xindex < xnumel
rbase = tl.arange(0, RBLOCK)[None, :]
x0 = xindex
_tmp7 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
tmp10 = tl.load(in_ptr3 + (x0), xmask)
_tmp13 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
for roffset in range(0, rnumel, RBLOCK):
rindex = roffset + rbase
rmask = rindex < rnumel
r1 = rindex % 49
r2 = (rindex // 49)
tmp0 = tl.load(in_ptr0 + (r1 + (49*x0) + (75264*r2)), rmask & xmask, eviction_policy='evict_last')
tmp2 = tl.load(in_ptr1 + (x0 + (1536*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp8 = tl.load(in_ptr2 + (r1 + (49*x0) + (75264*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp1 = 0.0
tmp3 = 49.0
tmp4 = tmp2 / tmp3
tmp5 = tl.where(tmp0, tmp1, tmp4)
tmp6 = tmp5.to(tl.float32)
_tmp7 = tl.where(rmask & xmask, _tmp7 + tmp6, _tmp7)
tmp9 = tmp8.to(tl.float32)
tmp11 = tmp9 - tmp10
tmp12 = tmp6 * tmp11
_tmp13 = tl.where(rmask & xmask, _tmp13 + tmp12, _tmp13)
tmp7 = tl.sum(_tmp7, 1)[:, None]
tl.store(out_ptr0 + x0, tmp7, xmask)
tmp13 = tl.sum(_tmp13, 1)[:, None]
tl.store(out_ptr1 + x0, tmp13, xmask)
tmp14 = tl.load(in_ptr4 + (x0), xmask)
tmp15 = tmp13 * tmp14
tl.store(out_ptr2 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp15, xmask)
def get_args():
arg_0 = rand_strided((128, 1536, 7, 7), (75264, 49, 7, 1), device='cuda:0', dtype=torch.bool)
arg_1 = rand_strided((128, 1536), (1536, 1), device='cuda:0', dtype=torch.float16)
arg_2 = rand_strided((128, 1536, 7, 7), (75264, 49, 7, 1), device='cuda:0', dtype=torch.float16)
arg_3 = rand_strided((1, 1536, 1, 1), (1536, 1, 1, 1), device='cuda:0', dtype=torch.float32)
arg_4 = rand_strided((1536,), (1,), device='cuda:0', dtype=torch.float32)
arg_5 = rand_strided((1536,), (1,), device='cuda:0', dtype=torch.float32)
arg_6 = rand_strided((1536,), (1,), device='cuda:0', dtype=torch.float32)
arg_7 = rand_strided((1536,), (1,), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_red_fused__native_batch_norm_legit_functional_div_native_batch_norm_backward_threshold_backward_2.run(*args, 1536, 6272, grid=grid(1536), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_red_fused__native_batch_norm_legit_functional_div_native_batch_norm_backward_threshold_backward_2.benchmark_all_configs(*args, 1536, 6272, grid=grid(1536))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/kf/ckfaohmrzufzdx2fsmml5navfg2efvvxodj5sf5pujefshnggoot.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.convolution_backward, aten.div, aten.native_batch_norm_backward, aten.threshold_backward
# aten._native_batch_norm_legit_functional => convert_element_type_398
# aten.convolution_backward => convolution_backward
# aten.div => div
# aten.native_batch_norm_backward => convert_element_type_404, convert_element_type_406, mul_476, mul_477, sub_58, sub_60, sub_61
# aten.threshold_backward => scalar_tensor, where
triton_poi_fused__native_batch_norm_legit_functional_convolution_backward_div_native_batch_norm_backward_threshold_backward_3 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[16777216], filename=__file__, meta={'signature': {0: '*i1', 1: '*fp16', 2: '*fp16', 3: '*fp32', 4: '*fp32', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: '*fp16', 9: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9), equal_to_1=())]})
@triton.jit
def triton_poi_fused__native_batch_norm_legit_functional_convolution_backward_div_native_batch_norm_backward_threshold_backward_3(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, in_ptr6, in_ptr7, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 9633792
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x3 = xindex
x4 = (xindex // 49)
x1 = (xindex // 49) % 1536
tmp0 = tl.load(in_ptr0 + (x3), None)
tmp2 = tl.load(in_ptr1 + (x4), None).to(tl.float32)
tmp7 = tl.load(in_ptr2 + (x3), None).to(tl.float32)
tmp9 = tl.load(in_ptr3 + (x1), None)
tmp11 = tl.load(in_ptr4 + (x1), None)
tmp14 = tl.load(in_ptr5 + (x1), None)
tmp19 = tl.load(in_ptr6 + (x1), None)
tmp22 = tl.load(in_ptr7 + (x1), None)
tmp1 = 0.0
tmp3 = 49.0
tmp4 = tmp2 / tmp3
tmp5 = tl.where(tmp0, tmp1, tmp4)
tmp6 = tmp5.to(tl.float32)
tmp8 = tmp7.to(tl.float32)
tmp10 = tmp8 - tmp9
tmp12 = 0.00015943877551020407
tmp13 = tmp11 * tmp12
tmp15 = tmp14 * tmp14
tmp16 = tmp13 * tmp15
tmp17 = tmp10 * tmp16
tmp18 = tmp6 - tmp17
tmp20 = tmp19 * tmp12
tmp21 = tmp18 - tmp20
tmp23 = tmp14 * tmp22
tmp24 = tmp21 * tmp23
tmp25 = tmp24.to(tl.float32)
tl.store(out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp25, None)
def get_args():
arg_0 = rand_strided((128, 1536, 7, 7), (75264, 49, 7, 1), device='cuda:0', dtype=torch.bool)
arg_1 = rand_strided((128, 1536), (1536, 1), device='cuda:0', dtype=torch.float16)
arg_2 = rand_strided((128, 1536, 7, 7), (75264, 49, 7, 1), device='cuda:0', dtype=torch.float16)
arg_3 = rand_strided((1, 1536, 1, 1), (1536, 1, 1, 1), device='cuda:0', dtype=torch.float32)
arg_4 = rand_strided((1536,), (1,), device='cuda:0', dtype=torch.float32)
arg_5 = rand_strided((1536,), (1,), device='cuda:0', dtype=torch.float32)
arg_6 = rand_strided((1536,), (1,), device='cuda:0', dtype=torch.float32)
arg_7 = rand_strided((1536,), (1,), device='cuda:0', dtype=torch.float32)
arg_8 = rand_strided((128, 1536, 7, 7), (75264, 49, 7, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7, arg_8,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__native_batch_norm_legit_functional_convolution_backward_div_native_batch_norm_backward_threshold_backward_3.run(*args, 9633792, grid=grid(9633792), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__native_batch_norm_legit_functional_convolution_backward_div_native_batch_norm_backward_threshold_backward_3.benchmark_all_configs(*args, 9633792, grid=grid(9633792))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/on/consb5x4mndq63g7yk4eommoi2zfnpkemgbkcrodivgrodwe4iek.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_407
triton_poi_fused__to_copy_4 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[524288], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_4(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 405504
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), None).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, None)
def get_args():
arg_0 = rand_strided((1536, 264, 1, 1), (264, 1, 1, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((1536, 264, 1, 1), (264, 1, 1, 1), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_4.run(*args, 405504, grid=grid(405504), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_4.benchmark_all_configs(*args, 405504, grid=grid(405504))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/aw/cawn5o3lch25tbuhghexsvhkzdksvz4ofqrpjmzgs2y6ldb5vypm.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.native_batch_norm_backward
# aten._native_batch_norm_legit_functional => convert_element_type_395
# aten.native_batch_norm_backward => convert_element_type_408, mul_479, mul_487, sub_62, sum_4, sum_5
triton_red_fused__native_batch_norm_legit_functional_native_batch_norm_backward_5 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@reduction(
size_hints=[512, 8192],
reduction_hint=ReductionHint.INNER,
filename=__file__,
meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp32', 6: '*fp32', 7: 'i32', 8: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 8), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_native_batch_norm_backward_5(in_ptr0, in_ptr1, in_ptr2, in_ptr3, out_ptr0, out_ptr1, out_ptr2, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
xnumel = 264
rnumel = 6272
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
xmask = xindex < xnumel
rbase = tl.arange(0, RBLOCK)[None, :]
x0 = xindex
_tmp2 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
tmp5 = tl.load(in_ptr2 + (x0), xmask)
_tmp8 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
for roffset in range(0, rnumel, RBLOCK):
rindex = roffset + rbase
rmask = rindex < rnumel
r1 = rindex % 49
r2 = (rindex // 49)
tmp0 = tl.load(in_ptr0 + (r1 + (49*x0) + (12936*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp3 = tl.load(in_ptr1 + (r1 + (49*x0) + (12936*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
_tmp2 = tl.where(rmask & xmask, _tmp2 + tmp1, _tmp2)
tmp4 = tmp3.to(tl.float32)
tmp6 = tmp4 - tmp5
tmp7 = tmp1 * tmp6
_tmp8 = tl.where(rmask & xmask, _tmp8 + tmp7, _tmp8)
tmp2 = tl.sum(_tmp2, 1)[:, None]
tl.store(out_ptr0 + x0, tmp2, xmask)
tmp8 = tl.sum(_tmp8, 1)[:, None]
tl.store(out_ptr1 + x0, tmp8, xmask)
tmp9 = tl.load(in_ptr3 + (x0), xmask)
tmp10 = tmp8 * tmp9
tl.store(out_ptr2 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp10, xmask)
def get_args():
arg_0 = rand_strided((128, 264, 7, 7), (12936, 49, 7, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 264, 7, 7), (12936, 49, 7, 1), device='cuda:0', dtype=torch.float16)
arg_2 = rand_strided((1, 264, 1, 1), (264, 1, 1, 1), device='cuda:0', dtype=torch.float32)
arg_3 = rand_strided((264,), (1,), device='cuda:0', dtype=torch.float32)
arg_4 = rand_strided((264,), (1,), device='cuda:0', dtype=torch.float32)
arg_5 = rand_strided((264,), (1,), device='cuda:0', dtype=torch.float32)
arg_6 = rand_strided((264,), (1,), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_red_fused__native_batch_norm_legit_functional_native_batch_norm_backward_5.run(*args, 264, 6272, grid=grid(264), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_red_fused__native_batch_norm_legit_functional_native_batch_norm_backward_5.benchmark_all_configs(*args, 264, 6272, grid=grid(264))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/rk/crkzi7f3ev6knrr3kmx5lxzwin43giwhtaru4yx4tnkfj3r3ngqj.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.native_batch_norm_backward
# aten._native_batch_norm_legit_functional => convert_element_type_395
# aten.native_batch_norm_backward => convert_element_type_408, convert_element_type_410, mul_485, mul_486, sub_62, sub_64, sub_65
triton_poi_fused__native_batch_norm_legit_functional_native_batch_norm_backward_6 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[2097152], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp32', 6: '*fp32', 7: '*fp16', 8: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8), equal_to_1=())]})
@triton.jit
def triton_poi_fused__native_batch_norm_legit_functional_native_batch_norm_backward_6(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, in_ptr6, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 1655808
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x3 = xindex
x1 = (xindex // 49) % 264
tmp0 = tl.load(in_ptr0 + (x3), xmask).to(tl.float32)
tmp2 = tl.load(in_ptr1 + (x3), xmask).to(tl.float32)
tmp4 = tl.load(in_ptr2 + (x1), xmask)
tmp6 = tl.load(in_ptr3 + (x1), xmask)
tmp9 = tl.load(in_ptr4 + (x1), xmask)
tmp14 = tl.load(in_ptr5 + (x1), xmask)
tmp17 = tl.load(in_ptr6 + (x1), xmask)
tmp1 = tmp0.to(tl.float32)
tmp3 = tmp2.to(tl.float32)
tmp5 = tmp3 - tmp4
tmp7 = 0.00015943877551020407
tmp8 = tmp6 * tmp7
tmp10 = tmp9 * tmp9
tmp11 = tmp8 * tmp10
tmp12 = tmp5 * tmp11
tmp13 = tmp1 - tmp12
tmp15 = tmp14 * tmp7
tmp16 = tmp13 - tmp15
tmp18 = tmp9 * tmp17
tmp19 = tmp16 * tmp18
tmp20 = tmp19.to(tl.float32)
tl.store(out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp20, xmask)
def get_args():
arg_0 = rand_strided((128, 264, 7, 7), (12936, 49, 7, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 264, 7, 7), (12936, 49, 7, 1), device='cuda:0', dtype=torch.float16)
arg_2 = rand_strided((1, 264, 1, 1), (264, 1, 1, 1), device='cuda:0', dtype=torch.float32)
arg_3 = rand_strided((264,), (1,), device='cuda:0', dtype=torch.float32)
arg_4 = rand_strided((264,), (1,), device='cuda:0', dtype=torch.float32)
arg_5 = rand_strided((264,), (1,), device='cuda:0', dtype=torch.float32)
arg_6 = rand_strided((264,), (1,), device='cuda:0', dtype=torch.float32)
arg_7 = rand_strided((128, 264, 7, 7), (12936, 49, 7, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__native_batch_norm_legit_functional_native_batch_norm_backward_6.run(*args, 1655808, grid=grid(1655808), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__native_batch_norm_legit_functional_native_batch_norm_backward_6.benchmark_all_configs(*args, 1655808, grid=grid(1655808))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/sf/csf7df3ryvp5mqalyhiqr2oqixspuwgupmdvdghhbdv4hoavgvib.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_411
triton_poi_fused__to_copy_7 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[131072], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_7(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 104544
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((132, 792, 1, 1), (792, 1, 1, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((132, 792, 1, 1), (792, 1, 1, 1), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_7.run(*args, 104544, grid=grid(104544), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_7.benchmark_all_configs(*args, 104544, grid=grid(104544))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/ph/cphopjqs73kagdjs5yonh2gxou75xcoyv4vg5cyr3nkxcgkuz4lp.py
# Original ATen: aten.cat
# aten.cat => cat_41
triton_poi_fused_cat_8 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[8388608], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_cat_8(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 4967424
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x2 = xindex
x0 = xindex % 38808
x1 = (xindex // 38808)
tmp0 = tl.load(in_ptr0 + (x2), xmask).to(tl.float32)
tl.store(out_ptr0 + (x0 + (77616*x1) + tl.zeros([XBLOCK], tl.int32)), tmp0, xmask)
def get_args():
arg_0 = rand_strided((128, 792, 7, 7), (38808, 49, 7, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 792, 7, 7), (77616, 49, 7, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_cat_8.run(*args, 4967424, grid=grid(4967424), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_cat_8.benchmark_all_configs(*args, 4967424, grid=grid(4967424))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/cd/ccdy3uu6zwicm5gp4nqvsu7scphal4x56e6djga5sd2lge4kiyge.py
# Original ATen: aten.cat
# aten.cat => cat_41
triton_poi_fused_cat_9 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[8388608], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_cat_9(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 4967424
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x2 = xindex
x0 = xindex % 38808
x1 = (xindex // 38808)
tmp0 = tl.load(in_ptr0 + (x2), xmask).to(tl.float32)
tl.store(out_ptr0 + (x0 + (77616*x1) + tl.zeros([XBLOCK], tl.int32)), tmp0, xmask)
def get_args():
arg_0 = rand_strided((128, 792, 7, 7), (38808, 49, 7, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 792, 7, 7), (77616, 49, 7, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_cat_9.run(*args, 4967424, grid=grid(4967424), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_cat_9.benchmark_all_configs(*args, 4967424, grid=grid(4967424))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/nt/cnt4hzkf7uueins7in5idjrq44bsdsgn6e67gfrc7gdtwfyotkdg.py
# Original ATen: aten.mul, aten.sigmoid, aten.sigmoid_backward, aten.silu, aten.sum
# aten.mul => mul_488
# aten.sigmoid => sigmoid_63
# aten.sigmoid_backward => convert_element_type_413, convert_element_type_414, convert_element_type_415, mul_490, mul_491, sub_66
# aten.silu => convert_element_type_385, convert_element_type_386, mul_453, sigmoid_61
# aten.sum => sum_6
triton_per_fused_mul_sigmoid_sigmoid_backward_silu_sum_10 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import persistent_reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@persistent_reduction(
size_hints=[262144, 64],
reduction_hint=ReductionHint.INNER,
filename=__file__,
meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp16', 4: 'i32', 5: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4), equal_to_1=())]}
)
@triton.jit
def triton_per_fused_mul_sigmoid_sigmoid_backward_silu_sum_10(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, xnumel, rnumel, XBLOCK : tl.constexpr):
xnumel = 202752
rnumel = 49
RBLOCK: tl.constexpr = 64
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
xmask = xindex < xnumel
rindex = tl.arange(0, RBLOCK)[None, :]
rmask = rindex < rnumel
r1 = rindex
x0 = xindex
tmp0 = tl.load(in_ptr0 + (r1 + (49*x0)), rmask, other=0).to(tl.float32)
tmp1 = tl.load(in_ptr1 + (r1 + (49*x0)), rmask, other=0).to(tl.float32)
tmp11 = tl.load(in_ptr2 + (x0), None).to(tl.float32)
tmp2 = tmp1.to(tl.float32)
tmp3 = tl.sigmoid(tmp2)
tmp4 = tmp2 * tmp3
tmp5 = tmp4.to(tl.float32)
tmp6 = tmp0 * tmp5
tmp8 = tl.where(rmask, tmp6, 0)
tmp9 = tl.sum(tmp8, 1)[:, None]
tmp10 = tmp9.to(tl.float32)
tmp12 = tl.sigmoid(tmp11)
tmp13 = tmp12.to(tl.float32)
tmp14 = 1.0
tmp15 = tmp14 - tmp13
tmp16 = tmp13 * tmp15
tmp17 = tmp10 * tmp16
tmp18 = tmp17.to(tl.float32)
tl.store(in_out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp18, None)
def get_args():
arg_0 = rand_strided((128, 1584, 1, 1), (1584, 1, 1, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 1584, 7, 7), (77616, 49, 7, 1), device='cuda:0', dtype=torch.float16)
arg_2 = rand_strided((128, 1584, 7, 7), (77616, 49, 7, 1), device='cuda:0', dtype=torch.float16)
arg_3 = rand_strided((128, 1584, 1, 1), (1584, 1, 1, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1, arg_2, arg_3,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_per_fused_mul_sigmoid_sigmoid_backward_silu_sum_10.run(*args, 202752, 49, grid=grid(202752), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_per_fused_mul_sigmoid_sigmoid_backward_silu_sum_10.benchmark_all_configs(*args, 202752, 49, grid=grid(202752))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/wr/cwr445l7k4eo6gcjw7l27usycvmf5jtqzbzk4btgcg2wqemciphj.py
# Original ATen: aten._to_copy, aten.convolution_backward
# aten._to_copy => convert_element_type_417
# aten.convolution_backward => sum_7
triton_per_fused__to_copy_convolution_backward_11 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import persistent_reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@persistent_reduction(
size_hints=[2048, 128],
reduction_hint=ReductionHint.INNER,
filename=__file__,
meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32', 3: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3), equal_to_1=())]}
)
@triton.jit
def triton_per_fused__to_copy_convolution_backward_11(in_ptr0, out_ptr1, xnumel, rnumel, XBLOCK : tl.constexpr):
xnumel = 1584
rnumel = 128
RBLOCK: tl.constexpr = 128
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
xmask = xindex < xnumel
rindex = tl.arange(0, RBLOCK)[None, :]
rmask = rindex < rnumel
r1 = rindex
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0 + (1584*r1)), rmask & xmask, other=0).to(tl.float32)
tmp2 = tl.where(rmask & xmask, tmp0, 0)
tmp3 = tl.sum(tmp2, 1)[:, None]
tmp4 = tmp3.to(tl.float32)
tl.store(out_ptr1 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp4, xmask)
def get_args():
arg_0 = rand_strided((128, 1584, 1, 1), (1584, 1, 1, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((1584,), (1,), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_per_fused__to_copy_convolution_backward_11.run(*args, 1584, 128, grid=grid(1584), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_per_fused__to_copy_convolution_backward_11.benchmark_all_configs(*args, 1584, 128, grid=grid(1584))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/vy/cvylenxoapd6e7xfcegwowbaxf7tmjjits4rifowpwoo5xytchjc.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_416
triton_poi_fused__to_copy_12 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[262144], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_12(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 209088
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((1584, 132, 1, 1), (132, 1, 1, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((1584, 132, 1, 1), (132, 1, 1, 1), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_12.run(*args, 209088, grid=grid(209088), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_12.benchmark_all_configs(*args, 209088, grid=grid(209088))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/ku/ckublkguiu5gw5vxkulgbwk62mp5fhr6syry2wrk56p3zdpzupzp.py
# Original ATen: aten.add, aten.clone, aten.fill, aten.mul, aten.sigmoid, aten.sub
# aten.add => add_304
# aten.clone => clone_47
# aten.fill => full_like
# aten.mul => mul_492, mul_493, mul_494
# aten.sigmoid => sigmoid_64
# aten.sub => sub_67
triton_poi_fused_add_clone_fill_mul_sigmoid_sub_13 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[32768], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_add_clone_fill_mul_sigmoid_sub_13(in_out_ptr0, in_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 16896
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_out_ptr0 + (x0), xmask).to(tl.float32)
tmp1 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
tmp2 = tl.sigmoid(tmp1)
tmp3 = 1.0
tmp4 = tmp3 - tmp2
tmp5 = tmp1 * tmp4
tmp6 = tmp5 + tmp3
tmp7 = tmp2 * tmp6
tmp8 = tmp0 * tmp7
tl.store(in_out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp8, xmask)
def get_args():
arg_0 = rand_strided((128, 132, 1, 1), (132, 1, 1, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 132, 1, 1), (132, 1, 1, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_add_clone_fill_mul_sigmoid_sub_13.run(*args, 16896, grid=grid(16896), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_add_clone_fill_mul_sigmoid_sub_13.benchmark_all_configs(*args, 16896, grid=grid(16896))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/4b/c4bhcaxi2hruo5hil4xfrkdmmucojrb5nsitzxo2oy2a6qcdg3ti.py
# Original ATen: aten._to_copy, aten.convolution_backward
# aten._to_copy => convert_element_type_419
# aten.convolution_backward => sum_8
triton_per_fused__to_copy_convolution_backward_14 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import persistent_reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@persistent_reduction(
size_hints=[256, 128],
reduction_hint=ReductionHint.INNER,
filename=__file__,
meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32', 3: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 3), equal_to_1=())]}
)
@triton.jit
def triton_per_fused__to_copy_convolution_backward_14(in_ptr0, out_ptr1, xnumel, rnumel, XBLOCK : tl.constexpr):
xnumel = 132
rnumel = 128
RBLOCK: tl.constexpr = 128
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
xmask = xindex < xnumel
rindex = tl.arange(0, RBLOCK)[None, :]
rmask = rindex < rnumel
r1 = rindex
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0 + (132*r1)), rmask & xmask, other=0).to(tl.float32)
tmp2 = tl.where(rmask & xmask, tmp0, 0)
tmp3 = tl.sum(tmp2, 1)[:, None]
tmp4 = tmp3.to(tl.float32)
tl.store(out_ptr1 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp4, xmask)
def get_args():
arg_0 = rand_strided((128, 132, 1, 1), (132, 1, 1, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((132,), (1,), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_per_fused__to_copy_convolution_backward_14.run(*args, 132, 128, grid=grid(132), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_per_fused__to_copy_convolution_backward_14.benchmark_all_configs(*args, 132, 128, grid=grid(132))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/mo/cmoc7y64jtpxuncjb73voe54cmkijbltrpgf5p7stvr4mny7avka.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_418
triton_poi_fused__to_copy_15 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[262144], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_15(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 209088
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((132, 1584, 1, 1), (1584, 1, 1, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((132, 1584, 1, 1), (1584, 1, 1, 1), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_15.run(*args, 209088, grid=grid(209088), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_15.benchmark_all_configs(*args, 209088, grid=grid(209088))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/et/cetgnuzvhwgtkuxuh7tdewyrx6qabbxsldzd2qnxluqsqq7zwwfh.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.add, aten.clone, aten.div, aten.fill, aten.mul, aten.native_batch_norm_backward, aten.sigmoid, aten.sub
# aten._native_batch_norm_legit_functional => convert_element_type_383
# aten.add => add_305, add_306
# aten.clone => clone_46
# aten.div => div_1
# aten.fill => full_like_1
# aten.mul => mul_489, mul_495, mul_496, mul_497
# aten.native_batch_norm_backward => convert_element_type_420, mul_498, mul_506, sub_69, sum_10, sum_9
# aten.sigmoid => sigmoid_63, sigmoid_65
# aten.sub => sub_68
triton_red_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_16 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@reduction(
size_hints=[2048, 8192],
reduction_hint=ReductionHint.INNER,
filename=__file__,
meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp16', 4: '*fp16', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: '*fp32', 9: '*fp32', 10: 'i32', 11: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_16(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, in_ptr6, out_ptr0, out_ptr1, out_ptr2, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
xnumel = 1584
rnumel = 6272
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
xmask = xindex < xnumel
rbase = tl.arange(0, RBLOCK)[None, :]
x0 = xindex
_tmp17 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
tmp20 = tl.load(in_ptr5 + (x0), xmask)
_tmp23 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
for roffset in range(0, rnumel, RBLOCK):
rindex = roffset + rbase
rmask = rindex < rnumel
r1 = rindex % 49
r2 = (rindex // 49)
tmp0 = tl.load(in_ptr0 + (r1 + (49*x0) + (77616*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp1 = tl.load(in_ptr1 + (x0 + (1584*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp4 = tl.load(in_ptr2 + (x0 + (1584*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp8 = tl.load(in_ptr3 + (r1 + (49*x0) + (77616*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp18 = tl.load(in_ptr4 + (r1 + (49*x0) + (77616*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp2 = tl.sigmoid(tmp1)
tmp3 = tmp0 * tmp2
tmp5 = 49.0
tmp6 = tmp4 / tmp5
tmp7 = tmp3 + tmp6
tmp9 = tl.sigmoid(tmp8)
tmp10 = 1.0
tmp11 = tmp10 - tmp9
tmp12 = tmp8 * tmp11
tmp13 = tmp12 + tmp10
tmp14 = tmp9 * tmp13
tmp15 = tmp7 * tmp14
tmp16 = tmp15.to(tl.float32)
_tmp17 = tl.where(rmask & xmask, _tmp17 + tmp16, _tmp17)
tmp19 = tmp18.to(tl.float32)
tmp21 = tmp19 - tmp20
tmp22 = tmp16 * tmp21
_tmp23 = tl.where(rmask & xmask, _tmp23 + tmp22, _tmp23)
tmp17 = tl.sum(_tmp17, 1)[:, None]
tl.store(out_ptr0 + x0, tmp17, xmask)
tmp23 = tl.sum(_tmp23, 1)[:, None]
tl.store(out_ptr1 + x0, tmp23, xmask)
tmp24 = tl.load(in_ptr6 + (x0), xmask)
tmp25 = tmp23 * tmp24
tl.store(out_ptr2 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp25, xmask)
def get_args():
arg_0 = rand_strided((128, 1584, 7, 7), (77616, 49, 7, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 1584, 1, 1), (1584, 1, 1, 1), device='cuda:0', dtype=torch.float16)
arg_2 = rand_strided((128, 1584, 1, 1), (1584, 1, 1, 1), device='cuda:0', dtype=torch.float16)
arg_3 = rand_strided((128, 1584, 7, 7), (77616, 49, 7, 1), device='cuda:0', dtype=torch.float16)
arg_4 = rand_strided((128, 1584, 7, 7), (77616, 49, 7, 1), device='cuda:0', dtype=torch.float16)
arg_5 = rand_strided((1, 1584, 1, 1), (1584, 1, 1, 1), device='cuda:0', dtype=torch.float32)
arg_6 = rand_strided((1584,), (1,), device='cuda:0', dtype=torch.float32)
arg_7 = rand_strided((1584,), (1,), device='cuda:0', dtype=torch.float32)
arg_8 = rand_strided((1584,), (1,), device='cuda:0', dtype=torch.float32)
arg_9 = rand_strided((1584,), (1,), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7, arg_8, arg_9,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_red_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_16.run(*args, 1584, 6272, grid=grid(1584), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_red_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_16.benchmark_all_configs(*args, 1584, 6272, grid=grid(1584))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/vq/cvqzgmyak74w6zf55f32ognvdbwijbc4xvxna4rdk6flp32c4rx3.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.add, aten.clone, aten.div, aten.fill, aten.mul, aten.native_batch_norm_backward, aten.sigmoid, aten.sub
# aten._native_batch_norm_legit_functional => convert_element_type_383
# aten.add => add_305, add_306
# aten.clone => clone_46
# aten.div => div_1
# aten.fill => full_like_1
# aten.mul => mul_489, mul_495, mul_496, mul_497
# aten.native_batch_norm_backward => convert_element_type_420, mul_504, sub_69, sub_71, sub_72
# aten.sigmoid => sigmoid_63, sigmoid_65
# aten.sub => sub_68
triton_poi_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_17 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[16777216], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp16', 4: '*fp16', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: '*fp32', 9: '*fp32', 10: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10), equal_to_1=())]})
@triton.jit
def triton_poi_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_17(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, in_ptr6, in_ptr7, in_ptr8, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 9934848
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x3 = xindex
x4 = (xindex // 49)
x1 = (xindex // 49) % 1584
tmp0 = tl.load(in_ptr0 + (x3), None).to(tl.float32)
tmp1 = tl.load(in_ptr1 + (x4), None).to(tl.float32)
tmp4 = tl.load(in_ptr2 + (x4), None).to(tl.float32)
tmp8 = tl.load(in_ptr3 + (x3), None).to(tl.float32)
tmp17 = tl.load(in_ptr4 + (x3), None).to(tl.float32)
tmp19 = tl.load(in_ptr5 + (x1), None)
tmp21 = tl.load(in_ptr6 + (x1), None)
tmp24 = tl.load(in_ptr7 + (x1), None)
tmp29 = tl.load(in_ptr8 + (x1), None)
tmp2 = tl.sigmoid(tmp1)
tmp3 = tmp0 * tmp2
tmp5 = 49.0
tmp6 = tmp4 / tmp5
tmp7 = tmp3 + tmp6
tmp9 = tl.sigmoid(tmp8)
tmp10 = 1.0
tmp11 = tmp10 - tmp9
tmp12 = tmp8 * tmp11
tmp13 = tmp12 + tmp10
tmp14 = tmp9 * tmp13
tmp15 = tmp7 * tmp14
tmp16 = tmp15.to(tl.float32)
tmp18 = tmp17.to(tl.float32)
tmp20 = tmp18 - tmp19
tmp22 = 0.00015943877551020407
tmp23 = tmp21 * tmp22
tmp25 = tmp24 * tmp24
tmp26 = tmp23 * tmp25
tmp27 = tmp20 * tmp26
tmp28 = tmp16 - tmp27
tmp30 = tmp29 * tmp22
tmp31 = tmp28 - tmp30
tl.store(out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp31, None)
def get_args():
arg_0 = rand_strided((128, 1584, 7, 7), (77616, 49, 7, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 1584, 1, 1), (1584, 1, 1, 1), device='cuda:0', dtype=torch.float16)
arg_2 = rand_strided((128, 1584, 1, 1), (1584, 1, 1, 1), device='cuda:0', dtype=torch.float16)
arg_3 = rand_strided((128, 1584, 7, 7), (77616, 49, 7, 1), device='cuda:0', dtype=torch.float16)
arg_4 = rand_strided((128, 1584, 7, 7), (77616, 49, 7, 1), device='cuda:0', dtype=torch.float16)
arg_5 = rand_strided((1, 1584, 1, 1), (1584, 1, 1, 1), device='cuda:0', dtype=torch.float32)
arg_6 = rand_strided((1584,), (1,), device='cuda:0', dtype=torch.float32)
arg_7 = rand_strided((1584,), (1,), device='cuda:0', dtype=torch.float32)
arg_8 = rand_strided((1584,), (1,), device='cuda:0', dtype=torch.float32)
arg_9 = rand_strided((128, 1584, 7, 7), (77616, 49, 7, 1), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7, arg_8, arg_9,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_17.run(*args, 9934848, grid=grid(9934848), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_17.benchmark_all_configs(*args, 9934848, grid=grid(9934848))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/4k/c4k5hrio7ng5ql4zfsckty7aijjdguidt3u7uu2czayedkp6afcd.py
# Original ATen: aten.convolution_backward
# aten.convolution_backward => convolution_backward_5
triton_poi_fused_convolution_backward_18 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp32', 2: '*fp32', 3: '*fp16', 4: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4), equal_to_1=())]})
@triton.jit
def triton_poi_fused_convolution_backward_18(in_ptr0, in_ptr1, in_ptr2, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 2483712
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x2 = (xindex // 19404)
x3 = xindex % 19404
x1 = (xindex // 49) % 396
x4 = xindex
tmp0 = tl.load(in_ptr0 + (58212 + x3 + (77616*x2)), xmask)
tmp1 = tl.load(in_ptr1 + (1188 + x1), xmask)
tmp2 = tl.load(in_ptr2 + (1188 + x1), xmask)
tmp3 = tmp1 * tmp2
tmp4 = tmp0 * tmp3
tmp5 = tmp4.to(tl.float32)
tl.store(out_ptr0 + (x4 + tl.zeros([XBLOCK], tl.int32)), tmp5, xmask)
def get_args():
arg_0 = rand_strided((128, 1584, 7, 7), (77616, 49, 7, 1), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((1584,), (1,), device='cuda:0', dtype=torch.float32)
arg_2 = rand_strided((1584,), (1,), device='cuda:0', dtype=torch.float32)
arg_3 = rand_strided((128, 396, 7, 7), (19404, 49, 7, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1, arg_2, arg_3,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_convolution_backward_18.run(*args, 2483712, grid=grid(2483712), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_convolution_backward_18.benchmark_all_configs(*args, 2483712, grid=grid(2483712))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/le/clevrgmaajodoycxcxjtmbwkkhfu2zyqxjmzkxois2uwwej5n47c.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_423
triton_poi_fused__to_copy_19 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[32768], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_19(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 32076
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((396, 1, 9, 9), (81, 81, 9, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((396, 1, 9, 9), (81, 81, 9, 1), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_19.run(*args, 32076, grid=grid(32076), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_19.benchmark_all_configs(*args, 32076, grid=grid(32076))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/yx/cyxchtoyxooc3k4ah4othvbsdd77dfkbeykdlyzawbwqv4zeq227.py
# Original ATen: aten.convolution_backward
# aten.convolution_backward => convolution_backward_6
triton_poi_fused_convolution_backward_20 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp32', 2: '*fp32', 3: '*fp16', 4: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4), equal_to_1=())]})
@triton.jit
def triton_poi_fused_convolution_backward_20(in_ptr0, in_ptr1, in_ptr2, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 2483712
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x2 = (xindex // 19404)
x3 = xindex % 19404
x1 = (xindex // 49) % 396
x4 = xindex
tmp0 = tl.load(in_ptr0 + (38808 + x3 + (77616*x2)), xmask)
tmp1 = tl.load(in_ptr1 + (792 + x1), xmask)
tmp2 = tl.load(in_ptr2 + (792 + x1), xmask)
tmp3 = tmp1 * tmp2
tmp4 = tmp0 * tmp3
tmp5 = tmp4.to(tl.float32)
tl.store(out_ptr0 + (x4 + tl.zeros([XBLOCK], tl.int32)), tmp5, xmask)
def get_args():
arg_0 = rand_strided((128, 1584, 7, 7), (77616, 49, 7, 1), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((1584,), (1,), device='cuda:0', dtype=torch.float32)
arg_2 = rand_strided((1584,), (1,), device='cuda:0', dtype=torch.float32)
arg_3 = rand_strided((128, 396, 7, 7), (19404, 49, 7, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1, arg_2, arg_3,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_convolution_backward_20.run(*args, 2483712, grid=grid(2483712), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_convolution_backward_20.benchmark_all_configs(*args, 2483712, grid=grid(2483712))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/eb/cebicyjdxyev2ctnv4eqoljt3ynagd7ac2xafkomzznzw2akhxue.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_424
triton_poi_fused__to_copy_21 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[32768], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_21(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 19404
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((396, 1, 7, 7), (49, 49, 7, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((396, 1, 7, 7), (49, 49, 7, 1), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_21.run(*args, 19404, grid=grid(19404), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_21.benchmark_all_configs(*args, 19404, grid=grid(19404))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/n6/cn6zlzicefkme4u5z2u7vgzu7zs36znck62asy3a53dhtgwtnfey.py
# Original ATen: aten.convolution_backward
# aten.convolution_backward => convolution_backward_7
triton_poi_fused_convolution_backward_22 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp32', 2: '*fp32', 3: '*fp16', 4: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4), equal_to_1=())]})
@triton.jit
def triton_poi_fused_convolution_backward_22(in_ptr0, in_ptr1, in_ptr2, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 2483712
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x2 = (xindex // 19404)
x3 = xindex % 19404
x1 = (xindex // 49) % 396
x4 = xindex
tmp0 = tl.load(in_ptr0 + (19404 + x3 + (77616*x2)), xmask)
tmp1 = tl.load(in_ptr1 + (396 + x1), xmask)
tmp2 = tl.load(in_ptr2 + (396 + x1), xmask)
tmp3 = tmp1 * tmp2
tmp4 = tmp0 * tmp3
tmp5 = tmp4.to(tl.float32)
tl.store(out_ptr0 + (x4 + tl.zeros([XBLOCK], tl.int32)), tmp5, xmask)
def get_args():
arg_0 = rand_strided((128, 1584, 7, 7), (77616, 49, 7, 1), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((1584,), (1,), device='cuda:0', dtype=torch.float32)
arg_2 = rand_strided((1584,), (1,), device='cuda:0', dtype=torch.float32)
arg_3 = rand_strided((128, 396, 7, 7), (19404, 49, 7, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1, arg_2, arg_3,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_convolution_backward_22.run(*args, 2483712, grid=grid(2483712), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_convolution_backward_22.benchmark_all_configs(*args, 2483712, grid=grid(2483712))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/5j/c5jiqoftvlcj3ob2vawgml2t4mfu74vouj3eu6ozelrenm5a7frc.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_425
triton_poi_fused__to_copy_23 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[16384], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_23(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 9900
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((396, 1, 5, 5), (25, 25, 5, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((396, 1, 5, 5), (25, 25, 5, 1), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_23.run(*args, 9900, grid=grid(9900), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_23.benchmark_all_configs(*args, 9900, grid=grid(9900))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/wd/cwd5qp57lsltp4fq4manegrstgskzimmaukeha4dh4fnqhtwaydf.py
# Original ATen: aten.convolution_backward
# aten.convolution_backward => convolution_backward_8
triton_poi_fused_convolution_backward_24 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp32', 2: '*fp32', 3: '*fp16', 4: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4), equal_to_1=())]})
@triton.jit
def triton_poi_fused_convolution_backward_24(in_ptr0, in_ptr1, in_ptr2, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 2483712
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x2 = (xindex // 19404)
x3 = xindex % 19404
x1 = (xindex // 49) % 396
x4 = xindex
tmp0 = tl.load(in_ptr0 + (x3 + (77616*x2)), xmask)
tmp1 = tl.load(in_ptr1 + (x1), xmask)
tmp2 = tl.load(in_ptr2 + (x1), xmask)
tmp3 = tmp1 * tmp2
tmp4 = tmp0 * tmp3
tmp5 = tmp4.to(tl.float32)
tl.store(out_ptr0 + (x4 + tl.zeros([XBLOCK], tl.int32)), tmp5, xmask)
def get_args():
arg_0 = rand_strided((128, 1584, 7, 7), (77616, 49, 7, 1), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((1584,), (1,), device='cuda:0', dtype=torch.float32)
arg_2 = rand_strided((1584,), (1,), device='cuda:0', dtype=torch.float32)
arg_3 = rand_strided((128, 396, 7, 7), (19404, 49, 7, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1, arg_2, arg_3,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_convolution_backward_24.run(*args, 2483712, grid=grid(2483712), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_convolution_backward_24.benchmark_all_configs(*args, 2483712, grid=grid(2483712))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/wc/cwc7i4csdkpcwn5beyvhsemwhtbq7gxbezkowlgoa2xhw4y7krz2.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_426
triton_poi_fused__to_copy_25 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4096], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_25(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 3564
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((396, 1, 3, 3), (9, 9, 3, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((396, 1, 3, 3), (9, 9, 3, 1), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_25.run(*args, 3564, grid=grid(3564), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_25.benchmark_all_configs(*args, 3564, grid=grid(3564))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/cp/ccpi5jja3x5efmayn24o6k4nqs6v2zc5jff75of6woeln6emtny2.py
# Original ATen: aten.cat
# aten.cat => cat_42
triton_poi_fused_cat_26 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_cat_26(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 2483712
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x2 = xindex
x0 = xindex % 19404
x1 = (xindex // 19404)
tmp0 = tl.load(in_ptr0 + (x2), xmask).to(tl.float32)
tl.store(out_ptr0 + (x0 + (77616*x1) + tl.zeros([XBLOCK], tl.int32)), tmp0, xmask)
def get_args():
arg_0 = rand_strided((128, 396, 7, 7), (19404, 49, 7, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 396, 7, 7), (77616, 49, 7, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_cat_26.run(*args, 2483712, grid=grid(2483712), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_cat_26.benchmark_all_configs(*args, 2483712, grid=grid(2483712))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/rg/crgc5ihkw5zltfd3tfchnahu2yx2wga4mambpixjsb7mmeynkmrh.py
# Original ATen: aten.cat
# aten.cat => cat_42
triton_poi_fused_cat_27 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_cat_27(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 2483712
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x2 = xindex
x0 = xindex % 19404
x1 = (xindex // 19404)
tmp0 = tl.load(in_ptr0 + (x2), xmask).to(tl.float32)
tl.store(out_ptr0 + (x0 + (77616*x1) + tl.zeros([XBLOCK], tl.int32)), tmp0, xmask)
def get_args():
arg_0 = rand_strided((128, 396, 7, 7), (19404, 49, 7, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 396, 7, 7), (77616, 49, 7, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_cat_27.run(*args, 2483712, grid=grid(2483712), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_cat_27.benchmark_all_configs(*args, 2483712, grid=grid(2483712))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/hq/chqi5nwugaohgw3qon7t3fytiqnvkhkemtghder4rphksh2dt7uc.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.mul, aten.native_batch_norm_backward
# aten._native_batch_norm_legit_functional => convert_element_type_375
# aten.mul => mul_509
# aten.native_batch_norm_backward => convert_element_type_427, mul_510, mul_518, sub_74, sum_11, sum_12
triton_red_fused__native_batch_norm_legit_functional_mul_native_batch_norm_backward_28 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@reduction(
size_hints=[2048, 8192],
reduction_hint=ReductionHint.INNER,
filename=__file__,
meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp32', 4: '*fp32', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: 'i32', 9: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_mul_native_batch_norm_backward_28(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, out_ptr0, out_ptr1, out_ptr2, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
xnumel = 1584
rnumel = 6272
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
xmask = xindex < xnumel
rbase = tl.arange(0, RBLOCK)[None, :]
x0 = xindex
_tmp4 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
tmp7 = tl.load(in_ptr3 + (x0), xmask)
_tmp10 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
for roffset in range(0, rnumel, RBLOCK):
rindex = roffset + rbase
rmask = rindex < rnumel
r1 = rindex % 49
r2 = (rindex // 49)
tmp0 = tl.load(in_ptr0 + (r1 + (49*x0) + (77616*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp1 = tl.load(in_ptr1 + (r1 + (49*x0) + (77616*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp5 = tl.load(in_ptr2 + (r1 + (49*x0) + (77616*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp2 = tmp0 * tmp1
tmp3 = tmp2.to(tl.float32)
_tmp4 = tl.where(rmask & xmask, _tmp4 + tmp3, _tmp4)
tmp6 = tmp5.to(tl.float32)
tmp8 = tmp6 - tmp7
tmp9 = tmp3 * tmp8
_tmp10 = tl.where(rmask & xmask, _tmp10 + tmp9, _tmp10)
tmp4 = tl.sum(_tmp4, 1)[:, None]
tl.store(out_ptr0 + x0, tmp4, xmask)
tmp10 = tl.sum(_tmp10, 1)[:, None]
tl.store(out_ptr1 + x0, tmp10, xmask)
tmp11 = tl.load(in_ptr4 + (x0), xmask)
tmp12 = tmp10 * tmp11
tl.store(out_ptr2 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp12, xmask)
def get_args():
arg_0 = rand_strided((128, 1584, 7, 7), (77616, 49, 7, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 1584, 7, 7), (77616, 49, 7, 1), device='cuda:0', dtype=torch.float16)
arg_2 = rand_strided((128, 1584, 7, 7), (77616, 49, 7, 1), device='cuda:0', dtype=torch.float16)
arg_3 = rand_strided((1, 1584, 1, 1), (1584, 1, 1, 1), device='cuda:0', dtype=torch.float32)
arg_4 = rand_strided((1584,), (1,), device='cuda:0', dtype=torch.float32)
arg_5 = rand_strided((1584,), (1,), device='cuda:0', dtype=torch.float32)
arg_6 = rand_strided((1584,), (1,), device='cuda:0', dtype=torch.float32)
arg_7 = rand_strided((1584,), (1,), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_red_fused__native_batch_norm_legit_functional_mul_native_batch_norm_backward_28.run(*args, 1584, 6272, grid=grid(1584), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_red_fused__native_batch_norm_legit_functional_mul_native_batch_norm_backward_28.benchmark_all_configs(*args, 1584, 6272, grid=grid(1584))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/4i/c4iqo4zwyswlhwfnswymdycvvcfufifa735bkh3k5fjuqlgdvt3q.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.convolution_backward, aten.mul, aten.native_batch_norm_backward
# aten._native_batch_norm_legit_functional => convert_element_type_375
# aten.convolution_backward => convolution_backward_9
# aten.mul => mul_509
# aten.native_batch_norm_backward => convert_element_type_427, convert_element_type_429, mul_516, mul_517, sub_74, sub_76, sub_77
triton_poi_fused__native_batch_norm_legit_functional_convolution_backward_mul_native_batch_norm_backward_29 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[16777216], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp32', 4: '*fp32', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8), equal_to_1=())]})
@triton.jit
def triton_poi_fused__native_batch_norm_legit_functional_convolution_backward_mul_native_batch_norm_backward_29(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, in_ptr6, xnumel, XBLOCK : tl.constexpr):
xnumel = 9934848
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x3 = xindex
x1 = (xindex // 49) % 1584
tmp0 = tl.load(in_out_ptr0 + (x3), None).to(tl.float32)
tmp1 = tl.load(in_ptr0 + (x3), None).to(tl.float32)
tmp4 = tl.load(in_ptr1 + (x3), None).to(tl.float32)
tmp6 = tl.load(in_ptr2 + (x1), None)
tmp8 = tl.load(in_ptr3 + (x1), None)
tmp11 = tl.load(in_ptr4 + (x1), None)
tmp16 = tl.load(in_ptr5 + (x1), None)
tmp19 = tl.load(in_ptr6 + (x1), None)
tmp2 = tmp0 * tmp1
tmp3 = tmp2.to(tl.float32)
tmp5 = tmp4.to(tl.float32)
tmp7 = tmp5 - tmp6
tmp9 = 0.00015943877551020407
tmp10 = tmp8 * tmp9
tmp12 = tmp11 * tmp11
tmp13 = tmp10 * tmp12
tmp14 = tmp7 * tmp13
tmp15 = tmp3 - tmp14
tmp17 = tmp16 * tmp9
tmp18 = tmp15 - tmp17
tmp20 = tmp11 * tmp19
tmp21 = tmp18 * tmp20
tmp22 = tmp21.to(tl.float32)
tl.store(in_out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp22, None)
def get_args():
arg_0 = rand_strided((128, 1584, 7, 7), (77616, 49, 7, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 1584, 7, 7), (77616, 49, 7, 1), device='cuda:0', dtype=torch.float16)
arg_2 = rand_strided((128, 1584, 7, 7), (77616, 49, 7, 1), device='cuda:0', dtype=torch.float16)
arg_3 = rand_strided((1, 1584, 1, 1), (1584, 1, 1, 1), device='cuda:0', dtype=torch.float32)
arg_4 = rand_strided((1584,), (1,), device='cuda:0', dtype=torch.float32)
arg_5 = rand_strided((1584,), (1,), device='cuda:0', dtype=torch.float32)
arg_6 = rand_strided((1584,), (1,), device='cuda:0', dtype=torch.float32)
arg_7 = rand_strided((1584,), (1,), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__native_batch_norm_legit_functional_convolution_backward_mul_native_batch_norm_backward_29.run(*args, 9934848, grid=grid(9934848), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__native_batch_norm_legit_functional_convolution_backward_mul_native_batch_norm_backward_29.benchmark_all_configs(*args, 9934848, grid=grid(9934848))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/np/cnpskkb6q6odcd2dqw7z7r4e7p4cgfh5g7s5kyt2zmqwzsyn7azj.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_430
triton_poi_fused__to_copy_30 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[524288], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_30(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 418176
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((1584, 264, 1, 1), (264, 1, 1, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((1584, 264, 1, 1), (264, 1, 1, 1), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_30.run(*args, 418176, grid=grid(418176), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_30.benchmark_all_configs(*args, 418176, grid=grid(418176))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/ut/cuta2ldfadubtnwekrao3mdwks6gtwerw2rxvha7uaqnloaeqvfv.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.add, aten.native_batch_norm_backward
# aten._native_batch_norm_legit_functional => convert_element_type_372
# aten.add => add_308
# aten.native_batch_norm_backward => convert_element_type_431, mul_519, mul_527, sub_78, sum_13, sum_14
triton_red_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_31 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@reduction(
size_hints=[512, 8192],
reduction_hint=ReductionHint.INNER,
filename=__file__,
meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp32', 4: '*fp32', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: 'i32', 9: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 9), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_31(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, out_ptr0, out_ptr1, out_ptr2, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
xnumel = 264
rnumel = 6272
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
xmask = xindex < xnumel
rbase = tl.arange(0, RBLOCK)[None, :]
x0 = xindex
_tmp4 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
tmp7 = tl.load(in_ptr3 + (x0), xmask)
_tmp10 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
for roffset in range(0, rnumel, RBLOCK):
rindex = roffset + rbase
rmask = rindex < rnumel
r1 = rindex % 49
r2 = (rindex // 49)
tmp0 = tl.load(in_ptr0 + (r1 + (49*x0) + (12936*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp1 = tl.load(in_ptr1 + (r1 + (49*x0) + (12936*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp5 = tl.load(in_ptr2 + (r1 + (49*x0) + (12936*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp2 = tmp0 + tmp1
tmp3 = tmp2.to(tl.float32)
_tmp4 = tl.where(rmask & xmask, _tmp4 + tmp3, _tmp4)
tmp6 = tmp5.to(tl.float32)
tmp8 = tmp6 - tmp7
tmp9 = tmp3 * tmp8
_tmp10 = tl.where(rmask & xmask, _tmp10 + tmp9, _tmp10)
tmp4 = tl.sum(_tmp4, 1)[:, None]
tl.store(out_ptr0 + x0, tmp4, xmask)
tmp10 = tl.sum(_tmp10, 1)[:, None]
tl.store(out_ptr1 + x0, tmp10, xmask)
tmp11 = tl.load(in_ptr4 + (x0), xmask)
tmp12 = tmp10 * tmp11
tl.store(out_ptr2 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp12, xmask)
def get_args():
arg_0 = rand_strided((128, 264, 7, 7), (12936, 49, 7, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 264, 7, 7), (12936, 49, 7, 1), device='cuda:0', dtype=torch.float16)
arg_2 = rand_strided((128, 264, 7, 7), (12936, 49, 7, 1), device='cuda:0', dtype=torch.float16)
arg_3 = rand_strided((1, 264, 1, 1), (264, 1, 1, 1), device='cuda:0', dtype=torch.float32)
arg_4 = rand_strided((264,), (1,), device='cuda:0', dtype=torch.float32)
arg_5 = rand_strided((264,), (1,), device='cuda:0', dtype=torch.float32)
arg_6 = rand_strided((264,), (1,), device='cuda:0', dtype=torch.float32)
arg_7 = rand_strided((264,), (1,), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_red_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_31.run(*args, 264, 6272, grid=grid(264), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_red_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_31.benchmark_all_configs(*args, 264, 6272, grid=grid(264))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/a3/ca33o5ukggucnj2hrtjksvzkzfzfe3v4qnwzz5ycytjwaauqcvop.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.add, aten.native_batch_norm_backward
# aten._native_batch_norm_legit_functional => convert_element_type_372
# aten.add => add_308
# aten.native_batch_norm_backward => convert_element_type_431, convert_element_type_433, mul_525, mul_526, sub_78, sub_80, sub_81
triton_poi_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_32 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[2097152], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp32', 4: '*fp32', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: '*fp16', 9: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9), equal_to_1=())]})
@triton.jit
def triton_poi_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_32(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, in_ptr6, in_ptr7, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 1655808
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x3 = xindex
x1 = (xindex // 49) % 264
tmp0 = tl.load(in_ptr0 + (x3), xmask).to(tl.float32)
tmp1 = tl.load(in_ptr1 + (x3), xmask).to(tl.float32)
tmp4 = tl.load(in_ptr2 + (x3), xmask).to(tl.float32)
tmp6 = tl.load(in_ptr3 + (x1), xmask)
tmp8 = tl.load(in_ptr4 + (x1), xmask)
tmp11 = tl.load(in_ptr5 + (x1), xmask)
tmp16 = tl.load(in_ptr6 + (x1), xmask)
tmp19 = tl.load(in_ptr7 + (x1), xmask)
tmp2 = tmp0 + tmp1
tmp3 = tmp2.to(tl.float32)
tmp5 = tmp4.to(tl.float32)
tmp7 = tmp5 - tmp6
tmp9 = 0.00015943877551020407
tmp10 = tmp8 * tmp9
tmp12 = tmp11 * tmp11
tmp13 = tmp10 * tmp12
tmp14 = tmp7 * tmp13
tmp15 = tmp3 - tmp14
tmp17 = tmp16 * tmp9
tmp18 = tmp15 - tmp17
tmp20 = tmp11 * tmp19
tmp21 = tmp18 * tmp20
tmp22 = tmp21.to(tl.float32)
tl.store(out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp22, xmask)
def get_args():
arg_0 = rand_strided((128, 264, 7, 7), (12936, 49, 7, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 264, 7, 7), (12936, 49, 7, 1), device='cuda:0', dtype=torch.float16)
arg_2 = rand_strided((128, 264, 7, 7), (12936, 49, 7, 1), device='cuda:0', dtype=torch.float16)
arg_3 = rand_strided((1, 264, 1, 1), (264, 1, 1, 1), device='cuda:0', dtype=torch.float32)
arg_4 = rand_strided((264,), (1,), device='cuda:0', dtype=torch.float32)
arg_5 = rand_strided((264,), (1,), device='cuda:0', dtype=torch.float32)
arg_6 = rand_strided((264,), (1,), device='cuda:0', dtype=torch.float32)
arg_7 = rand_strided((264,), (1,), device='cuda:0', dtype=torch.float32)
arg_8 = rand_strided((128, 264, 7, 7), (12936, 49, 7, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7, arg_8,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_32.run(*args, 1655808, grid=grid(1655808), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_32.benchmark_all_configs(*args, 1655808, grid=grid(1655808))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/hd/chdicakkg6zydfy4ll5rit2mnugrbccllk6kssfzzb7i55y4shxu.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.add, aten.native_batch_norm_backward
# aten._native_batch_norm_legit_functional => convert_element_type_349
# aten.add => add_308, add_313
# aten.native_batch_norm_backward => convert_element_type_454, mul_559, mul_567, sub_94, sum_22, sum_23
triton_red_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_33 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@reduction(
size_hints=[512, 8192],
reduction_hint=ReductionHint.INNER,
filename=__file__,
meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp16', 4: '*fp32', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: '*fp32', 9: 'i32', 10: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8, 10), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_33(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, out_ptr0, out_ptr1, out_ptr2, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
xnumel = 264
rnumel = 6272
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
xmask = xindex < xnumel
rbase = tl.arange(0, RBLOCK)[None, :]
x0 = xindex
_tmp6 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
tmp9 = tl.load(in_ptr4 + (x0), xmask)
_tmp12 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
for roffset in range(0, rnumel, RBLOCK):
rindex = roffset + rbase
rmask = rindex < rnumel
r1 = rindex % 49
r2 = (rindex // 49)
tmp0 = tl.load(in_ptr0 + (r1 + (49*x0) + (12936*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp1 = tl.load(in_ptr1 + (r1 + (49*x0) + (12936*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp3 = tl.load(in_ptr2 + (r1 + (49*x0) + (12936*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp7 = tl.load(in_ptr3 + (r1 + (49*x0) + (12936*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp2 = tmp0 + tmp1
tmp4 = tmp2 + tmp3
tmp5 = tmp4.to(tl.float32)
_tmp6 = tl.where(rmask & xmask, _tmp6 + tmp5, _tmp6)
tmp8 = tmp7.to(tl.float32)
tmp10 = tmp8 - tmp9
tmp11 = tmp5 * tmp10
_tmp12 = tl.where(rmask & xmask, _tmp12 + tmp11, _tmp12)
tmp6 = tl.sum(_tmp6, 1)[:, None]
tl.store(out_ptr0 + x0, tmp6, xmask)
tmp12 = tl.sum(_tmp12, 1)[:, None]
tl.store(out_ptr1 + x0, tmp12, xmask)
tmp13 = tl.load(in_ptr5 + (x0), xmask)
tmp14 = tmp12 * tmp13
tl.store(out_ptr2 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp14, xmask)
def get_args():
arg_0 = rand_strided((128, 264, 7, 7), (12936, 49, 7, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 264, 7, 7), (12936, 49, 7, 1), device='cuda:0', dtype=torch.float16)
arg_2 = rand_strided((128, 264, 7, 7), (12936, 49, 7, 1), device='cuda:0', dtype=torch.float16)
arg_3 = rand_strided((128, 264, 7, 7), (12936, 49, 7, 1), device='cuda:0', dtype=torch.float16)
arg_4 = rand_strided((1, 264, 1, 1), (264, 1, 1, 1), device='cuda:0', dtype=torch.float32)
arg_5 = rand_strided((264,), (1,), device='cuda:0', dtype=torch.float32)
arg_6 = rand_strided((264,), (1,), device='cuda:0', dtype=torch.float32)
arg_7 = rand_strided((264,), (1,), device='cuda:0', dtype=torch.float32)
arg_8 = rand_strided((264,), (1,), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7, arg_8,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_red_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_33.run(*args, 264, 6272, grid=grid(264), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_red_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_33.benchmark_all_configs(*args, 264, 6272, grid=grid(264))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/bi/cbij3cjdqcet3nxl6uy6e6kdaxljedzlmbuwgif6yfmwghh7x42z.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.add, aten.native_batch_norm_backward
# aten._native_batch_norm_legit_functional => convert_element_type_349
# aten.add => add_308, add_313
# aten.native_batch_norm_backward => convert_element_type_454, mul_565, mul_566, sub_94, sub_96, sub_97
triton_poi_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_34 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[2097152], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp16', 4: '*fp32', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: '*fp32', 9: '*fp32', 10: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10), equal_to_1=())]})
@triton.jit
def triton_poi_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_34(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, in_ptr6, in_ptr7, in_ptr8, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 1655808
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x3 = xindex
x1 = (xindex // 49) % 264
tmp0 = tl.load(in_ptr0 + (x3), xmask).to(tl.float32)
tmp1 = tl.load(in_ptr1 + (x3), xmask).to(tl.float32)
tmp3 = tl.load(in_ptr2 + (x3), xmask).to(tl.float32)
tmp6 = tl.load(in_ptr3 + (x3), xmask).to(tl.float32)
tmp8 = tl.load(in_ptr4 + (x1), xmask)
tmp10 = tl.load(in_ptr5 + (x1), xmask)
tmp13 = tl.load(in_ptr6 + (x1), xmask)
tmp18 = tl.load(in_ptr7 + (x1), xmask)
tmp21 = tl.load(in_ptr8 + (x1), xmask)
tmp2 = tmp0 + tmp1
tmp4 = tmp2 + tmp3
tmp5 = tmp4.to(tl.float32)
tmp7 = tmp6.to(tl.float32)
tmp9 = tmp7 - tmp8
tmp11 = 0.00015943877551020407
tmp12 = tmp10 * tmp11
tmp14 = tmp13 * tmp13
tmp15 = tmp12 * tmp14
tmp16 = tmp9 * tmp15
tmp17 = tmp5 - tmp16
tmp19 = tmp18 * tmp11
tmp20 = tmp17 - tmp19
tmp22 = tmp13 * tmp21
tmp23 = tmp20 * tmp22
tl.store(out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp23, xmask)
def get_args():
arg_0 = rand_strided((128, 264, 7, 7), (12936, 49, 7, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 264, 7, 7), (12936, 49, 7, 1), device='cuda:0', dtype=torch.float16)
arg_2 = rand_strided((128, 264, 7, 7), (12936, 49, 7, 1), device='cuda:0', dtype=torch.float16)
arg_3 = rand_strided((128, 264, 7, 7), (12936, 49, 7, 1), device='cuda:0', dtype=torch.float16)
arg_4 = rand_strided((1, 264, 1, 1), (264, 1, 1, 1), device='cuda:0', dtype=torch.float32)
arg_5 = rand_strided((264,), (1,), device='cuda:0', dtype=torch.float32)
arg_6 = rand_strided((264,), (1,), device='cuda:0', dtype=torch.float32)
arg_7 = rand_strided((264,), (1,), device='cuda:0', dtype=torch.float32)
arg_8 = rand_strided((264,), (1,), device='cuda:0', dtype=torch.float32)
arg_9 = rand_strided((128, 264, 7, 7), (12936, 49, 7, 1), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7, arg_8, arg_9,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_34.run(*args, 1655808, grid=grid(1655808), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_34.benchmark_all_configs(*args, 1655808, grid=grid(1655808))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/bp/cbpqapqwkj3u7lbjggnc6sqbtdedbchks5ecsczowcbbvpcuekir.py
# Original ATen: aten.convolution_backward
# aten.convolution_backward => convolution_backward_19
triton_poi_fused_convolution_backward_35 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[1048576], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_convolution_backward_35(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 827904
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex % 6468
x1 = (xindex // 6468)
x2 = xindex
tmp0 = tl.load(in_ptr0 + (6468 + x0 + (12936*x1)), xmask)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((128, 264, 7, 7), (12936, 49, 7, 1), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((128, 132, 7, 7), (6468, 49, 7, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_convolution_backward_35.run(*args, 827904, grid=grid(827904), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_convolution_backward_35.benchmark_all_configs(*args, 827904, grid=grid(827904))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/vz/cvzmxblv4iaxi55wdmzkqdehpwn2a2c5kowaaewxyzttkqwmv7jg.py
# Original ATen: aten.convolution_backward
# aten.convolution_backward => convolution_backward_20
triton_poi_fused_convolution_backward_36 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[1048576], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_convolution_backward_36(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 827904
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex % 6468
x1 = (xindex // 6468)
x2 = xindex
tmp0 = tl.load(in_ptr0 + (x0 + (12936*x1)), xmask)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((128, 264, 7, 7), (12936, 49, 7, 1), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((128, 132, 7, 7), (6468, 49, 7, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_convolution_backward_36.run(*args, 827904, grid=grid(827904), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_convolution_backward_36.benchmark_all_configs(*args, 827904, grid=grid(827904))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/sb/csb7dsvsxbdzlofqw5ldfragxm7igefprtjmu6wizsp5shcyxiyp.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.add, aten.native_batch_norm_backward
# aten._native_batch_norm_legit_functional => convert_element_type_326
# aten.add => add_308, add_313, add_318
# aten.native_batch_norm_backward => convert_element_type_477, mul_599, mul_607, sub_110, sum_31, sum_32
triton_red_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_37 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@reduction(
size_hints=[512, 8192],
reduction_hint=ReductionHint.INNER,
filename=__file__,
meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp16', 4: '*fp16', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: '*fp32', 9: '*fp32', 10: 'i32', 11: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 11), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_37(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, in_ptr6, out_ptr0, out_ptr1, out_ptr2, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
xnumel = 264
rnumel = 6272
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
xmask = xindex < xnumel
rbase = tl.arange(0, RBLOCK)[None, :]
x0 = xindex
_tmp8 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
tmp11 = tl.load(in_ptr5 + (x0), xmask)
_tmp14 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
for roffset in range(0, rnumel, RBLOCK):
rindex = roffset + rbase
rmask = rindex < rnumel
r1 = rindex % 49
r2 = (rindex // 49)
tmp0 = tl.load(in_ptr0 + (r1 + (49*x0) + (12936*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp1 = tl.load(in_ptr1 + (r1 + (49*x0) + (12936*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp3 = tl.load(in_ptr2 + (r1 + (49*x0) + (12936*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp5 = tl.load(in_ptr3 + (r1 + (49*x0) + (12936*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp9 = tl.load(in_ptr4 + (r1 + (49*x0) + (12936*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp2 = tmp0 + tmp1
tmp4 = tmp2 + tmp3
tmp6 = tmp4 + tmp5
tmp7 = tmp6.to(tl.float32)
_tmp8 = tl.where(rmask & xmask, _tmp8 + tmp7, _tmp8)
tmp10 = tmp9.to(tl.float32)
tmp12 = tmp10 - tmp11
tmp13 = tmp7 * tmp12
_tmp14 = tl.where(rmask & xmask, _tmp14 + tmp13, _tmp14)
tmp8 = tl.sum(_tmp8, 1)[:, None]
tl.store(out_ptr0 + x0, tmp8, xmask)
tmp14 = tl.sum(_tmp14, 1)[:, None]
tl.store(out_ptr1 + x0, tmp14, xmask)
tmp15 = tl.load(in_ptr6 + (x0), xmask)
tmp16 = tmp14 * tmp15
tl.store(out_ptr2 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp16, xmask)
def get_args():
arg_0 = rand_strided((128, 264, 7, 7), (12936, 49, 7, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 264, 7, 7), (12936, 49, 7, 1), device='cuda:0', dtype=torch.float16)
arg_2 = rand_strided((128, 264, 7, 7), (12936, 49, 7, 1), device='cuda:0', dtype=torch.float16)
arg_3 = rand_strided((128, 264, 7, 7), (12936, 49, 7, 1), device='cuda:0', dtype=torch.float16)
arg_4 = rand_strided((128, 264, 7, 7), (12936, 49, 7, 1), device='cuda:0', dtype=torch.float16)
arg_5 = rand_strided((1, 264, 1, 1), (264, 1, 1, 1), device='cuda:0', dtype=torch.float32)
arg_6 = rand_strided((264,), (1,), device='cuda:0', dtype=torch.float32)
arg_7 = rand_strided((264,), (1,), device='cuda:0', dtype=torch.float32)
arg_8 = rand_strided((264,), (1,), device='cuda:0', dtype=torch.float32)
arg_9 = rand_strided((264,), (1,), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7, arg_8, arg_9,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_red_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_37.run(*args, 264, 6272, grid=grid(264), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_red_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_37.benchmark_all_configs(*args, 264, 6272, grid=grid(264))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/gx/cgx3bvwbhn4322e66zkvf7xmr6ch3vg75ylfjn7yzts64xagqr2l.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.add, aten.convolution_backward, aten.native_batch_norm_backward
# aten._native_batch_norm_legit_functional => convert_element_type_326
# aten.add => add_308, add_313, add_318
# aten.convolution_backward => convolution_backward_28
# aten.native_batch_norm_backward => convert_element_type_477, convert_element_type_479, mul_605, mul_606, sub_110, sub_112, sub_113
triton_poi_fused__native_batch_norm_legit_functional_add_convolution_backward_native_batch_norm_backward_38 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[2097152], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp16', 4: '*fp16', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: '*fp32', 9: '*fp32', 10: '*fp16', 11: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11), equal_to_1=())]})
@triton.jit
def triton_poi_fused__native_batch_norm_legit_functional_add_convolution_backward_native_batch_norm_backward_38(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, in_ptr6, in_ptr7, in_ptr8, in_ptr9, out_ptr1, xnumel, XBLOCK : tl.constexpr):
xnumel = 1655808
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x3 = xindex
x1 = (xindex // 49) % 264
tmp0 = tl.load(in_ptr0 + (x3), xmask).to(tl.float32)
tmp1 = tl.load(in_ptr1 + (x3), xmask).to(tl.float32)
tmp3 = tl.load(in_ptr2 + (x3), xmask).to(tl.float32)
tmp5 = tl.load(in_ptr3 + (x3), xmask).to(tl.float32)
tmp8 = tl.load(in_ptr4 + (x3), xmask).to(tl.float32)
tmp10 = tl.load(in_ptr5 + (x1), xmask)
tmp12 = tl.load(in_ptr6 + (x1), xmask)
tmp15 = tl.load(in_ptr7 + (x1), xmask)
tmp20 = tl.load(in_ptr8 + (x1), xmask)
tmp23 = tl.load(in_ptr9 + (x1), xmask)
tmp2 = tmp0 + tmp1
tmp4 = tmp2 + tmp3
tmp6 = tmp4 + tmp5
tmp7 = tmp6.to(tl.float32)
tmp9 = tmp8.to(tl.float32)
tmp11 = tmp9 - tmp10
tmp13 = 0.00015943877551020407
tmp14 = tmp12 * tmp13
tmp16 = tmp15 * tmp15
tmp17 = tmp14 * tmp16
tmp18 = tmp11 * tmp17
tmp19 = tmp7 - tmp18
tmp21 = tmp20 * tmp13
tmp22 = tmp19 - tmp21
tmp24 = tmp15 * tmp23
tmp25 = tmp22 * tmp24
tmp26 = tmp25.to(tl.float32)
tl.store(out_ptr1 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp26, xmask)
def get_args():
arg_0 = rand_strided((128, 264, 7, 7), (12936, 49, 7, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 264, 7, 7), (12936, 49, 7, 1), device='cuda:0', dtype=torch.float16)
arg_2 = rand_strided((128, 264, 7, 7), (12936, 49, 7, 1), device='cuda:0', dtype=torch.float16)
arg_3 = rand_strided((128, 264, 7, 7), (12936, 49, 7, 1), device='cuda:0', dtype=torch.float16)
arg_4 = rand_strided((128, 264, 7, 7), (12936, 49, 7, 1), device='cuda:0', dtype=torch.float16)
arg_5 = rand_strided((1, 264, 1, 1), (264, 1, 1, 1), device='cuda:0', dtype=torch.float32)
arg_6 = rand_strided((264,), (1,), device='cuda:0', dtype=torch.float32)
arg_7 = rand_strided((264,), (1,), device='cuda:0', dtype=torch.float32)
arg_8 = rand_strided((264,), (1,), device='cuda:0', dtype=torch.float32)
arg_9 = rand_strided((264,), (1,), device='cuda:0', dtype=torch.float32)
arg_10 = rand_strided((128, 264, 7, 7), (12936, 49, 7, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7, arg_8, arg_9, arg_10,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__native_batch_norm_legit_functional_add_convolution_backward_native_batch_norm_backward_38.run(*args, 1655808, grid=grid(1655808), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__native_batch_norm_legit_functional_add_convolution_backward_native_batch_norm_backward_38.benchmark_all_configs(*args, 1655808, grid=grid(1655808))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/hv/chvxalrqvfx3gumbgl7gg6d4lhqqg37lbadisxr7scsc3tjuvlp6.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_480
triton_poi_fused__to_copy_39 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[262144], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_39(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 253440
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((264, 960, 1, 1), (960, 1, 1, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((264, 960, 1, 1), (960, 1, 1, 1), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_39.run(*args, 253440, grid=grid(253440), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_39.benchmark_all_configs(*args, 253440, grid=grid(253440))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/4c/c4cxprfywuviqaunnoosyvctdmc2wg7xuk2qc4eawthvjfi2nh6c.py
# Original ATen: aten.mul, aten.sigmoid, aten.sigmoid_backward, aten.silu, aten.sum
# aten.mul => mul_608
# aten.sigmoid => sigmoid_51
# aten.sigmoid_backward => convert_element_type_481, convert_element_type_482, convert_element_type_483, mul_610, mul_611, sub_114
# aten.silu => convert_element_type_317, convert_element_type_318, mul_378, sigmoid_49
# aten.sum => sum_33
triton_per_fused_mul_sigmoid_sigmoid_backward_silu_sum_40 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import persistent_reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@persistent_reduction(
size_hints=[131072, 64],
reduction_hint=ReductionHint.INNER,
filename=__file__,
meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp16', 4: 'i32', 5: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4), equal_to_1=())]}
)
@triton.jit
def triton_per_fused_mul_sigmoid_sigmoid_backward_silu_sum_40(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, xnumel, rnumel, XBLOCK : tl.constexpr):
xnumel = 122880
rnumel = 49
RBLOCK: tl.constexpr = 64
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
xmask = xindex < xnumel
rindex = tl.arange(0, RBLOCK)[None, :]
rmask = rindex < rnumel
r1 = rindex
x0 = xindex
tmp0 = tl.load(in_ptr0 + (r1 + (49*x0)), rmask, other=0).to(tl.float32)
tmp1 = tl.load(in_ptr1 + (r1 + (49*x0)), rmask, other=0).to(tl.float32)
tmp11 = tl.load(in_ptr2 + (x0), None).to(tl.float32)
tmp2 = tmp1.to(tl.float32)
tmp3 = tl.sigmoid(tmp2)
tmp4 = tmp2 * tmp3
tmp5 = tmp4.to(tl.float32)
tmp6 = tmp0 * tmp5
tmp8 = tl.where(rmask, tmp6, 0)
tmp9 = tl.sum(tmp8, 1)[:, None]
tmp10 = tmp9.to(tl.float32)
tmp12 = tl.sigmoid(tmp11)
tmp13 = tmp12.to(tl.float32)
tmp14 = 1.0
tmp15 = tmp14 - tmp13
tmp16 = tmp13 * tmp15
tmp17 = tmp10 * tmp16
tmp18 = tmp17.to(tl.float32)
tl.store(in_out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp18, None)
def get_args():
arg_0 = rand_strided((128, 960, 1, 1), (960, 1, 1, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 960, 7, 7), (47040, 49, 7, 1), device='cuda:0', dtype=torch.float16)
arg_2 = rand_strided((128, 960, 7, 7), (47040, 49, 7, 1), device='cuda:0', dtype=torch.float16)
arg_3 = rand_strided((128, 960, 1, 1), (960, 1, 1, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1, arg_2, arg_3,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_per_fused_mul_sigmoid_sigmoid_backward_silu_sum_40.run(*args, 122880, 49, grid=grid(122880), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_per_fused_mul_sigmoid_sigmoid_backward_silu_sum_40.benchmark_all_configs(*args, 122880, 49, grid=grid(122880))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/c3/cc3ozjdbl56ribso2ymcx7ozxswqoswegte5cm6ndwts2vdg2nx7.py
# Original ATen: aten._to_copy, aten.convolution_backward
# aten._to_copy => convert_element_type_485
# aten.convolution_backward => sum_34
triton_per_fused__to_copy_convolution_backward_41 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import persistent_reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@persistent_reduction(
size_hints=[1024, 128],
reduction_hint=ReductionHint.INNER,
filename=__file__,
meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32', 3: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3), equal_to_1=())]}
)
@triton.jit
def triton_per_fused__to_copy_convolution_backward_41(in_ptr0, out_ptr1, xnumel, rnumel, XBLOCK : tl.constexpr):
xnumel = 960
rnumel = 128
RBLOCK: tl.constexpr = 128
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
xmask = xindex < xnumel
rindex = tl.arange(0, RBLOCK)[None, :]
rmask = rindex < rnumel
r1 = rindex
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0 + (960*r1)), rmask & xmask, other=0).to(tl.float32)
tmp2 = tl.where(rmask & xmask, tmp0, 0)
tmp3 = tl.sum(tmp2, 1)[:, None]
tmp4 = tmp3.to(tl.float32)
tl.store(out_ptr1 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp4, xmask)
def get_args():
arg_0 = rand_strided((128, 960, 1, 1), (960, 1, 1, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((960,), (1,), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_per_fused__to_copy_convolution_backward_41.run(*args, 960, 128, grid=grid(960), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_per_fused__to_copy_convolution_backward_41.benchmark_all_configs(*args, 960, 128, grid=grid(960))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/gr/cgrr3v6svqr3e2yhrucjdveyox77wsfeqkauijcny3rj2xckclad.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_484
triton_poi_fused__to_copy_42 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[131072], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_42(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 76800
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((960, 80, 1, 1), (80, 1, 1, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((960, 80, 1, 1), (80, 1, 1, 1), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_42.run(*args, 76800, grid=grid(76800), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_42.benchmark_all_configs(*args, 76800, grid=grid(76800))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/ec/cecvpjtjrr7ig7pxbkmaghimjhrgacci4o2zw2nv422yxcdjktnf.py
# Original ATen: aten.add, aten.clone, aten.fill, aten.mul, aten.sigmoid, aten.sub
# aten.add => add_319
# aten.clone => clone_38
# aten.fill => full_like_9
# aten.mul => mul_612, mul_613, mul_614
# aten.sigmoid => sigmoid_73
# aten.sub => sub_115
triton_poi_fused_add_clone_fill_mul_sigmoid_sub_43 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[16384], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_add_clone_fill_mul_sigmoid_sub_43(in_out_ptr0, in_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 10240
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_out_ptr0 + (x0), None).to(tl.float32)
tmp1 = tl.load(in_ptr0 + (x0), None).to(tl.float32)
tmp2 = tl.sigmoid(tmp1)
tmp3 = 1.0
tmp4 = tmp3 - tmp2
tmp5 = tmp1 * tmp4
tmp6 = tmp5 + tmp3
tmp7 = tmp2 * tmp6
tmp8 = tmp0 * tmp7
tl.store(in_out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp8, None)
def get_args():
arg_0 = rand_strided((128, 80, 1, 1), (80, 1, 1, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 80, 1, 1), (80, 1, 1, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_add_clone_fill_mul_sigmoid_sub_43.run(*args, 10240, grid=grid(10240), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_add_clone_fill_mul_sigmoid_sub_43.benchmark_all_configs(*args, 10240, grid=grid(10240))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/tf/ctfzhkol6r3oykhxrse2zzxlztwo7vdgfbyprm7vgxoac6npm7jy.py
# Original ATen: aten._to_copy, aten.convolution_backward
# aten._to_copy => convert_element_type_487
# aten.convolution_backward => sum_35
triton_per_fused__to_copy_convolution_backward_44 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import persistent_reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@persistent_reduction(
size_hints=[128, 128],
reduction_hint=ReductionHint.INNER,
filename=__file__,
meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32', 3: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3), equal_to_1=())]}
)
@triton.jit
def triton_per_fused__to_copy_convolution_backward_44(in_ptr0, out_ptr1, xnumel, rnumel, XBLOCK : tl.constexpr):
xnumel = 80
rnumel = 128
RBLOCK: tl.constexpr = 128
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
xmask = xindex < xnumel
rindex = tl.arange(0, RBLOCK)[None, :]
rmask = rindex < rnumel
r1 = rindex
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0 + (80*r1)), rmask & xmask, other=0).to(tl.float32)
tmp2 = tl.where(rmask & xmask, tmp0, 0)
tmp3 = tl.sum(tmp2, 1)[:, None]
tmp4 = tmp3.to(tl.float32)
tl.store(out_ptr1 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp4, xmask)
def get_args():
arg_0 = rand_strided((128, 80, 1, 1), (80, 1, 1, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((80,), (1,), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_per_fused__to_copy_convolution_backward_44.run(*args, 80, 128, grid=grid(80), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_per_fused__to_copy_convolution_backward_44.benchmark_all_configs(*args, 80, 128, grid=grid(80))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/pa/cpatc2nbbyd25gudjvjrjc54kh5oivkwx63acex4sswslv6ylzvi.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_486
triton_poi_fused__to_copy_45 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[131072], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_45(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 76800
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((80, 960, 1, 1), (960, 1, 1, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((80, 960, 1, 1), (960, 1, 1, 1), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_45.run(*args, 76800, grid=grid(76800), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_45.benchmark_all_configs(*args, 76800, grid=grid(76800))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/t2/ct2iy46nsnqymwwljjboc6oz6xdfqwrjdsjacocdhzt74zuufoj7.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.add, aten.clone, aten.div, aten.fill, aten.mul, aten.native_batch_norm_backward, aten.sigmoid, aten.sub
# aten._native_batch_norm_legit_functional => convert_element_type_315
# aten.add => add_320, add_321
# aten.clone => clone_37
# aten.div => div_4
# aten.fill => full_like_10
# aten.mul => mul_609, mul_615, mul_616, mul_617
# aten.native_batch_norm_backward => convert_element_type_488, mul_618, mul_626, sub_117, sum_36, sum_37
# aten.sigmoid => sigmoid_51, sigmoid_74
# aten.sub => sub_116
triton_red_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_46 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@reduction(
size_hints=[1024, 8192],
reduction_hint=ReductionHint.INNER,
filename=__file__,
meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp16', 4: '*fp16', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: '*fp32', 9: '*fp32', 10: 'i32', 11: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_46(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, in_ptr6, out_ptr0, out_ptr1, out_ptr2, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
xnumel = 960
rnumel = 6272
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
xmask = xindex < xnumel
rbase = tl.arange(0, RBLOCK)[None, :]
x0 = xindex
_tmp17 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
tmp20 = tl.load(in_ptr5 + (x0), xmask)
_tmp23 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
for roffset in range(0, rnumel, RBLOCK):
rindex = roffset + rbase
rmask = rindex < rnumel
r1 = rindex % 49
r2 = (rindex // 49)
tmp0 = tl.load(in_ptr0 + (r1 + (49*x0) + (47040*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp1 = tl.load(in_ptr1 + (x0 + (960*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp4 = tl.load(in_ptr2 + (x0 + (960*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp8 = tl.load(in_ptr3 + (r1 + (49*x0) + (47040*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp18 = tl.load(in_ptr4 + (r1 + (49*x0) + (47040*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp2 = tl.sigmoid(tmp1)
tmp3 = tmp0 * tmp2
tmp5 = 49.0
tmp6 = tmp4 / tmp5
tmp7 = tmp3 + tmp6
tmp9 = tl.sigmoid(tmp8)
tmp10 = 1.0
tmp11 = tmp10 - tmp9
tmp12 = tmp8 * tmp11
tmp13 = tmp12 + tmp10
tmp14 = tmp9 * tmp13
tmp15 = tmp7 * tmp14
tmp16 = tmp15.to(tl.float32)
_tmp17 = tl.where(rmask & xmask, _tmp17 + tmp16, _tmp17)
tmp19 = tmp18.to(tl.float32)
tmp21 = tmp19 - tmp20
tmp22 = tmp16 * tmp21
_tmp23 = tl.where(rmask & xmask, _tmp23 + tmp22, _tmp23)
tmp17 = tl.sum(_tmp17, 1)[:, None]
tl.store(out_ptr0 + x0, tmp17, xmask)
tmp23 = tl.sum(_tmp23, 1)[:, None]
tl.store(out_ptr1 + x0, tmp23, xmask)
tmp24 = tl.load(in_ptr6 + (x0), xmask)
tmp25 = tmp23 * tmp24
tl.store(out_ptr2 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp25, xmask)
def get_args():
arg_0 = rand_strided((128, 960, 7, 7), (47040, 49, 7, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 960, 1, 1), (960, 1, 1, 1), device='cuda:0', dtype=torch.float16)
arg_2 = rand_strided((128, 960, 1, 1), (960, 1, 1, 1), device='cuda:0', dtype=torch.float16)
arg_3 = rand_strided((128, 960, 7, 7), (47040, 49, 7, 1), device='cuda:0', dtype=torch.float16)
arg_4 = rand_strided((128, 960, 7, 7), (47040, 49, 7, 1), device='cuda:0', dtype=torch.float16)
arg_5 = rand_strided((1, 960, 1, 1), (960, 1, 1, 1), device='cuda:0', dtype=torch.float32)
arg_6 = rand_strided((960,), (1,), device='cuda:0', dtype=torch.float32)
arg_7 = rand_strided((960,), (1,), device='cuda:0', dtype=torch.float32)
arg_8 = rand_strided((960,), (1,), device='cuda:0', dtype=torch.float32)
arg_9 = rand_strided((960,), (1,), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7, arg_8, arg_9,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_red_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_46.run(*args, 960, 6272, grid=grid(960), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_red_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_46.benchmark_all_configs(*args, 960, 6272, grid=grid(960))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/hn/chn3tshreitokdet2wgwgbsbuqqplci2k63sdatoy3lq6bjeylvs.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.add, aten.clone, aten.div, aten.fill, aten.mul, aten.native_batch_norm_backward, aten.sigmoid, aten.sub
# aten._native_batch_norm_legit_functional => convert_element_type_315
# aten.add => add_320, add_321
# aten.clone => clone_37
# aten.div => div_4
# aten.fill => full_like_10
# aten.mul => mul_609, mul_615, mul_616, mul_617
# aten.native_batch_norm_backward => convert_element_type_488, mul_624, sub_117, sub_119, sub_120
# aten.sigmoid => sigmoid_51, sigmoid_74
# aten.sub => sub_116
triton_poi_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_47 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[8388608], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp16', 4: '*fp16', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: '*fp32', 9: '*fp32', 10: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10), equal_to_1=())]})
@triton.jit
def triton_poi_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_47(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, in_ptr6, in_ptr7, in_ptr8, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 6021120
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x3 = xindex
x4 = (xindex // 49)
x1 = (xindex // 49) % 960
tmp0 = tl.load(in_ptr0 + (x3), None).to(tl.float32)
tmp1 = tl.load(in_ptr1 + (x4), None).to(tl.float32)
tmp4 = tl.load(in_ptr2 + (x4), None).to(tl.float32)
tmp8 = tl.load(in_ptr3 + (x3), None).to(tl.float32)
tmp17 = tl.load(in_ptr4 + (x3), None).to(tl.float32)
tmp19 = tl.load(in_ptr5 + (x1), None)
tmp21 = tl.load(in_ptr6 + (x1), None)
tmp24 = tl.load(in_ptr7 + (x1), None)
tmp29 = tl.load(in_ptr8 + (x1), None)
tmp2 = tl.sigmoid(tmp1)
tmp3 = tmp0 * tmp2
tmp5 = 49.0
tmp6 = tmp4 / tmp5
tmp7 = tmp3 + tmp6
tmp9 = tl.sigmoid(tmp8)
tmp10 = 1.0
tmp11 = tmp10 - tmp9
tmp12 = tmp8 * tmp11
tmp13 = tmp12 + tmp10
tmp14 = tmp9 * tmp13
tmp15 = tmp7 * tmp14
tmp16 = tmp15.to(tl.float32)
tmp18 = tmp17.to(tl.float32)
tmp20 = tmp18 - tmp19
tmp22 = 0.00015943877551020407
tmp23 = tmp21 * tmp22
tmp25 = tmp24 * tmp24
tmp26 = tmp23 * tmp25
tmp27 = tmp20 * tmp26
tmp28 = tmp16 - tmp27
tmp30 = tmp29 * tmp22
tmp31 = tmp28 - tmp30
tl.store(out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp31, None)
def get_args():
arg_0 = rand_strided((128, 960, 7, 7), (47040, 49, 7, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 960, 1, 1), (960, 1, 1, 1), device='cuda:0', dtype=torch.float16)
arg_2 = rand_strided((128, 960, 1, 1), (960, 1, 1, 1), device='cuda:0', dtype=torch.float16)
arg_3 = rand_strided((128, 960, 7, 7), (47040, 49, 7, 1), device='cuda:0', dtype=torch.float16)
arg_4 = rand_strided((128, 960, 7, 7), (47040, 49, 7, 1), device='cuda:0', dtype=torch.float16)
arg_5 = rand_strided((1, 960, 1, 1), (960, 1, 1, 1), device='cuda:0', dtype=torch.float32)
arg_6 = rand_strided((960,), (1,), device='cuda:0', dtype=torch.float32)
arg_7 = rand_strided((960,), (1,), device='cuda:0', dtype=torch.float32)
arg_8 = rand_strided((960,), (1,), device='cuda:0', dtype=torch.float32)
arg_9 = rand_strided((128, 960, 7, 7), (47040, 49, 7, 1), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7, arg_8, arg_9,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_47.run(*args, 6021120, grid=grid(6021120), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_47.benchmark_all_configs(*args, 6021120, grid=grid(6021120))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/cm/ccm5w6opbpyzojqp4qntjyz3d5dzh7ybk3ty3hgcp6xmwewzvdux.py
# Original ATen: aten.convolution_backward
# aten.convolution_backward => convolution_backward_31
triton_poi_fused_convolution_backward_48 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[2097152], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp32', 2: '*fp32', 3: '*fp16', 4: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4), equal_to_1=())]})
@triton.jit
def triton_poi_fused_convolution_backward_48(in_ptr0, in_ptr1, in_ptr2, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 1505280
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x2 = (xindex // 11760)
x3 = xindex % 11760
x1 = (xindex // 49) % 240
x4 = xindex
tmp0 = tl.load(in_ptr0 + (35280 + x3 + (47040*x2)), None)
tmp1 = tl.load(in_ptr1 + (720 + x1), None)
tmp2 = tl.load(in_ptr2 + (720 + x1), None)
tmp3 = tmp1 * tmp2
tmp4 = tmp0 * tmp3
tmp5 = tmp4.to(tl.float32)
tl.store(out_ptr0 + (x4 + tl.zeros([XBLOCK], tl.int32)), tmp5, None)
def get_args():
arg_0 = rand_strided((128, 960, 7, 7), (47040, 49, 7, 1), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((960,), (1,), device='cuda:0', dtype=torch.float32)
arg_2 = rand_strided((960,), (1,), device='cuda:0', dtype=torch.float32)
arg_3 = rand_strided((128, 240, 7, 7), (11760, 49, 7, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1, arg_2, arg_3,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_convolution_backward_48.run(*args, 1505280, grid=grid(1505280), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_convolution_backward_48.benchmark_all_configs(*args, 1505280, grid=grid(1505280))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/bg/cbgklbbew275guj34lbxmgcsod4kcchr5225smzpdyhnn7zswlfw.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_491
triton_poi_fused__to_copy_49 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[32768], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_49(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 19440
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((240, 1, 9, 9), (81, 81, 9, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((240, 1, 9, 9), (81, 81, 9, 1), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_49.run(*args, 19440, grid=grid(19440), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_49.benchmark_all_configs(*args, 19440, grid=grid(19440))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/aa/caatae3nbvudrfnsq366oeojppmx457zjpvs4yumsehtdmsg23py.py
# Original ATen: aten.convolution_backward
# aten.convolution_backward => convolution_backward_32
triton_poi_fused_convolution_backward_50 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[2097152], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp32', 2: '*fp32', 3: '*fp16', 4: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4), equal_to_1=())]})
@triton.jit
def triton_poi_fused_convolution_backward_50(in_ptr0, in_ptr1, in_ptr2, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 1505280
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x2 = (xindex // 11760)
x3 = xindex % 11760
x1 = (xindex // 49) % 240
x4 = xindex
tmp0 = tl.load(in_ptr0 + (23520 + x3 + (47040*x2)), None)
tmp1 = tl.load(in_ptr1 + (480 + x1), None)
tmp2 = tl.load(in_ptr2 + (480 + x1), None)
tmp3 = tmp1 * tmp2
tmp4 = tmp0 * tmp3
tmp5 = tmp4.to(tl.float32)
tl.store(out_ptr0 + (x4 + tl.zeros([XBLOCK], tl.int32)), tmp5, None)
def get_args():
arg_0 = rand_strided((128, 960, 7, 7), (47040, 49, 7, 1), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((960,), (1,), device='cuda:0', dtype=torch.float32)
arg_2 = rand_strided((960,), (1,), device='cuda:0', dtype=torch.float32)
arg_3 = rand_strided((128, 240, 7, 7), (11760, 49, 7, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1, arg_2, arg_3,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_convolution_backward_50.run(*args, 1505280, grid=grid(1505280), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_convolution_backward_50.benchmark_all_configs(*args, 1505280, grid=grid(1505280))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/iy/ciyzljcwrx3unj6zlla5nxhjqwjz6zppzmuz7wma6tcow2pjfjva.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_492
# NOTE(review): auto-generated Inductor kernel. The triple-quoted string below is a
# complete standalone Triton module compiled lazily by async_compile.triton(); its
# bytes are the runtime payload, so it must not be hand-edited — regenerate instead.
# What it does (per the kernel body / get_args()): elementwise fp16 -> fp32 cast of
# 11760 elements, shaped (240, 1, 7, 7) — presumably a depthwise-conv weight; the
# embedded call()/benchmark_all_configs()/__main__ harness lets the string run as a
# standalone profiling script.
triton_poi_fused__to_copy_51 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[16384], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_51(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 11760
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((240, 1, 7, 7), (49, 49, 7, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((240, 1, 7, 7), (49, 49, 7, 1), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_51.run(*args, 11760, grid=grid(11760), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_51.benchmark_all_configs(*args, 11760, grid=grid(11760))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/6i/c6izu5jaho2aqwpjlvtonq5ar6inxincvvpnnrzzj3npyab2r3ga.py
# Original ATen: aten.convolution_backward
# aten.convolution_backward => convolution_backward_33
# NOTE(review): auto-generated Inductor kernel; the string below is the compiled
# payload and must stay byte-identical — do not hand-edit.
# What it does: reads the channel slice [240, 480) of a (128, 960, 7, 7) fp32 grad
# (per-batch offset 11760 = 240*49), scales each element by
# in_ptr1[240+c] * in_ptr2[240+c], and casts to fp16 into a (128, 240, 7, 7)
# output — presumably peeling one 240-channel branch of a channel-concat for its
# grouped conv backward (siblings _50 and _54 read offsets 480 and 0); confirm
# against the surrounding graph.
triton_poi_fused_convolution_backward_52 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[2097152], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp32', 2: '*fp32', 3: '*fp16', 4: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4), equal_to_1=())]})
@triton.jit
def triton_poi_fused_convolution_backward_52(in_ptr0, in_ptr1, in_ptr2, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 1505280
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x2 = (xindex // 11760)
x3 = xindex % 11760
x1 = (xindex // 49) % 240
x4 = xindex
tmp0 = tl.load(in_ptr0 + (11760 + x3 + (47040*x2)), None)
tmp1 = tl.load(in_ptr1 + (240 + x1), None)
tmp2 = tl.load(in_ptr2 + (240 + x1), None)
tmp3 = tmp1 * tmp2
tmp4 = tmp0 * tmp3
tmp5 = tmp4.to(tl.float32)
tl.store(out_ptr0 + (x4 + tl.zeros([XBLOCK], tl.int32)), tmp5, None)
def get_args():
arg_0 = rand_strided((128, 960, 7, 7), (47040, 49, 7, 1), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((960,), (1,), device='cuda:0', dtype=torch.float32)
arg_2 = rand_strided((960,), (1,), device='cuda:0', dtype=torch.float32)
arg_3 = rand_strided((128, 240, 7, 7), (11760, 49, 7, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1, arg_2, arg_3,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_convolution_backward_52.run(*args, 1505280, grid=grid(1505280), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_convolution_backward_52.benchmark_all_configs(*args, 1505280, grid=grid(1505280))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/cq/ccq24kfa5wdpfsiaanwmmkcev4g2dsxwzynkl2fv553qth6ixrfz.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_493
# NOTE(review): auto-generated Inductor kernel; the string below is the compiled
# payload and must stay byte-identical — do not hand-edit.
# What it does: elementwise fp16 -> fp32 cast of 6000 elements, shaped
# (240, 1, 5, 5) per get_args() — presumably a depthwise-conv weight.
triton_poi_fused__to_copy_53 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[8192], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_53(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 6000
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((240, 1, 5, 5), (25, 25, 5, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((240, 1, 5, 5), (25, 25, 5, 1), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_53.run(*args, 6000, grid=grid(6000), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_53.benchmark_all_configs(*args, 6000, grid=grid(6000))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/iu/ciu2jjei5ptmwjfdzudopltuvgs5jt6yeve7jcdvodi4ozwv2j6u.py
# Original ATen: aten.convolution_backward
# aten.convolution_backward => convolution_backward_34
# NOTE(review): auto-generated Inductor kernel; the string below is the compiled
# payload and must stay byte-identical — do not hand-edit.
# What it does: reads the channel slice [0, 240) of a (128, 960, 7, 7) fp32 grad,
# scales each element by in_ptr1[c] * in_ptr2[c], and casts to fp16 into a
# (128, 240, 7, 7) output — the offset-0 sibling of kernels _50 / _52 above
# (offsets 480 and 240 respectively), presumably one branch of a channel split.
triton_poi_fused_convolution_backward_54 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[2097152], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp32', 2: '*fp32', 3: '*fp16', 4: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4), equal_to_1=())]})
@triton.jit
def triton_poi_fused_convolution_backward_54(in_ptr0, in_ptr1, in_ptr2, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 1505280
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x2 = (xindex // 11760)
x3 = xindex % 11760
x1 = (xindex // 49) % 240
x4 = xindex
tmp0 = tl.load(in_ptr0 + (x3 + (47040*x2)), None)
tmp1 = tl.load(in_ptr1 + (x1), None)
tmp2 = tl.load(in_ptr2 + (x1), None)
tmp3 = tmp1 * tmp2
tmp4 = tmp0 * tmp3
tmp5 = tmp4.to(tl.float32)
tl.store(out_ptr0 + (x4 + tl.zeros([XBLOCK], tl.int32)), tmp5, None)
def get_args():
arg_0 = rand_strided((128, 960, 7, 7), (47040, 49, 7, 1), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((960,), (1,), device='cuda:0', dtype=torch.float32)
arg_2 = rand_strided((960,), (1,), device='cuda:0', dtype=torch.float32)
arg_3 = rand_strided((128, 240, 7, 7), (11760, 49, 7, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1, arg_2, arg_3,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_convolution_backward_54.run(*args, 1505280, grid=grid(1505280), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_convolution_backward_54.benchmark_all_configs(*args, 1505280, grid=grid(1505280))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/vw/cvw5bsibqyft5fe6qyposdjgeixpccxa5ewo6nsyrtdsomfikzwt.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_494
# NOTE(review): auto-generated Inductor kernel; the string below is the compiled
# payload and must stay byte-identical — do not hand-edit.
# What it does: elementwise fp16 -> fp32 cast of 2160 elements, shaped
# (240, 1, 3, 3) per get_args() — presumably a depthwise-conv weight.
triton_poi_fused__to_copy_55 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4096], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_55(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 2160
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((240, 1, 3, 3), (9, 9, 3, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((240, 1, 3, 3), (9, 9, 3, 1), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_55.run(*args, 2160, grid=grid(2160), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_55.benchmark_all_configs(*args, 2160, grid=grid(2160))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/zd/czdohgrrggyfbylj276pqwjy7zlv5b2cu55bzfn6nk26nmdmxmon.py
# Original ATen: aten.cat
# aten.cat => cat_47
# NOTE(review): auto-generated Inductor kernel; the string below is the compiled
# payload and must stay byte-identical — do not hand-edit.
# What it does: copies a contiguous (128, 240, 14, 14) fp16 tensor into an output
# whose per-batch stride is 188160 (= 960*196), i.e. writes elements 0..47039 of
# each batch's 960-channel block — one input branch of a channel concat; the other
# branches presumably fill offsets 47040+ via sibling kernels.
triton_poi_fused_cat_56 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[8388608], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_cat_56(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 6021120
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x2 = xindex
x0 = xindex % 47040
x1 = (xindex // 47040)
tmp0 = tl.load(in_ptr0 + (x2), None).to(tl.float32)
tl.store(out_ptr0 + (x0 + (188160*x1) + tl.zeros([XBLOCK], tl.int32)), tmp0, None)
def get_args():
arg_0 = rand_strided((128, 240, 14, 14), (47040, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 240, 14, 14), (188160, 196, 14, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_cat_56.run(*args, 6021120, grid=grid(6021120), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_cat_56.benchmark_all_configs(*args, 6021120, grid=grid(6021120))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/6g/c6guclv6h7xk7kazbyjy6st57pkopg5lzsmit37ifpr7d46xtfho.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.mul, aten.native_batch_norm_backward
# aten._native_batch_norm_legit_functional => convert_element_type_307
# aten.mul => mul_629
# aten.native_batch_norm_backward => convert_element_type_495, mul_630, mul_638, sub_122, sum_38, sum_39
# NOTE(review): auto-generated Inductor kernel; the string below is the compiled
# payload and must stay byte-identical — do not hand-edit.
# What it does: for each of 960 channels, reduces over 25088 = 128*196 elements of
# (128, 960, 14, 14) fp16 inputs, producing
#   out_ptr0[c] = sum(in0 * in1)                       (grad contribution)
#   out_ptr1[c] = sum(in0 * in1 * (in2 - in_ptr3[c]))  (centered contribution)
#   out_ptr2[c] = out_ptr1[c] * in_ptr4[c]
# in_ptr3 is (1, 960, 1, 1) — presumably the saved batch mean — and in_ptr4 a
# 960-vector, presumably the saved invstd; confirm against the graph.
triton_red_fused__native_batch_norm_legit_functional_mul_native_batch_norm_backward_57 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@reduction(
size_hints=[1024, 32768],
reduction_hint=ReductionHint.INNER,
filename=__file__,
meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp32', 4: '*fp32', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: 'i32', 9: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_mul_native_batch_norm_backward_57(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, out_ptr0, out_ptr1, out_ptr2, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
xnumel = 960
rnumel = 25088
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
xmask = xindex < xnumel
rbase = tl.arange(0, RBLOCK)[None, :]
x0 = xindex
_tmp4 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
tmp7 = tl.load(in_ptr3 + (x0), xmask)
_tmp10 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
for roffset in range(0, rnumel, RBLOCK):
rindex = roffset + rbase
rmask = rindex < rnumel
r1 = rindex % 196
r2 = (rindex // 196)
tmp0 = tl.load(in_ptr0 + (r1 + (196*x0) + (188160*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp1 = tl.load(in_ptr1 + (r1 + (196*x0) + (188160*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp5 = tl.load(in_ptr2 + (r1 + (196*x0) + (188160*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp2 = tmp0 * tmp1
tmp3 = tmp2.to(tl.float32)
_tmp4 = tl.where(rmask & xmask, _tmp4 + tmp3, _tmp4)
tmp6 = tmp5.to(tl.float32)
tmp8 = tmp6 - tmp7
tmp9 = tmp3 * tmp8
_tmp10 = tl.where(rmask & xmask, _tmp10 + tmp9, _tmp10)
tmp4 = tl.sum(_tmp4, 1)[:, None]
tl.store(out_ptr0 + x0, tmp4, xmask)
tmp10 = tl.sum(_tmp10, 1)[:, None]
tl.store(out_ptr1 + x0, tmp10, xmask)
tmp11 = tl.load(in_ptr4 + (x0), xmask)
tmp12 = tmp10 * tmp11
tl.store(out_ptr2 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp12, xmask)
def get_args():
arg_0 = rand_strided((128, 960, 14, 14), (188160, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 960, 14, 14), (188160, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_2 = rand_strided((128, 960, 14, 14), (188160, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_3 = rand_strided((1, 960, 1, 1), (960, 1, 1, 1), device='cuda:0', dtype=torch.float32)
arg_4 = rand_strided((960,), (1,), device='cuda:0', dtype=torch.float32)
arg_5 = rand_strided((960,), (1,), device='cuda:0', dtype=torch.float32)
arg_6 = rand_strided((960,), (1,), device='cuda:0', dtype=torch.float32)
arg_7 = rand_strided((960,), (1,), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_red_fused__native_batch_norm_legit_functional_mul_native_batch_norm_backward_57.run(*args, 960, 25088, grid=grid(960), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_red_fused__native_batch_norm_legit_functional_mul_native_batch_norm_backward_57.benchmark_all_configs(*args, 960, 25088, grid=grid(960))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/kq/ckqzmpbl3qlmroyipqv5z46lwldyab7axrjybp23plftx2joawdv.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.convolution_backward, aten.mul, aten.native_batch_norm_backward
# aten._native_batch_norm_legit_functional => convert_element_type_307
# aten.convolution_backward => convolution_backward_35
# aten.mul => mul_629
# aten.native_batch_norm_backward => convert_element_type_495, convert_element_type_497, mul_636, mul_637, sub_122, sub_124, sub_125
# NOTE(review): auto-generated Inductor kernel; the string below is the compiled
# payload and must stay byte-identical — do not hand-edit.
# What it does: pointwise batch-norm backward input-gradient over a
# (128, 960, 14, 14) fp16 tensor, IN PLACE on in_out_ptr0 (note
# mutated_arg_names=['in_out_ptr0']). Per channel x1 it computes
#   grad_in = ((g*mul) - sum_term*(x-mean)*invstd^2/N - sum_grad/N) * invstd*weight
# where the literal 3.985969387755102e-05 is 1/25088 (= 1/(128*14*14), the
# per-channel element count). in_ptr2..in_ptr6 are the per-channel stats produced
# by the _57 reduction above — presumably mean/sum/invstd/grad-sum/weight; verify
# against the caller's argument order.
triton_poi_fused__native_batch_norm_legit_functional_convolution_backward_mul_native_batch_norm_backward_58 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[33554432], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp32', 4: '*fp32', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8), equal_to_1=())]})
@triton.jit
def triton_poi_fused__native_batch_norm_legit_functional_convolution_backward_mul_native_batch_norm_backward_58(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, in_ptr6, xnumel, XBLOCK : tl.constexpr):
xnumel = 24084480
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x3 = xindex
x1 = (xindex // 196) % 960
tmp0 = tl.load(in_out_ptr0 + (x3), None).to(tl.float32)
tmp1 = tl.load(in_ptr0 + (x3), None).to(tl.float32)
tmp4 = tl.load(in_ptr1 + (x3), None).to(tl.float32)
tmp6 = tl.load(in_ptr2 + (x1), None)
tmp8 = tl.load(in_ptr3 + (x1), None)
tmp11 = tl.load(in_ptr4 + (x1), None)
tmp16 = tl.load(in_ptr5 + (x1), None)
tmp19 = tl.load(in_ptr6 + (x1), None)
tmp2 = tmp0 * tmp1
tmp3 = tmp2.to(tl.float32)
tmp5 = tmp4.to(tl.float32)
tmp7 = tmp5 - tmp6
tmp9 = 3.985969387755102e-05
tmp10 = tmp8 * tmp9
tmp12 = tmp11 * tmp11
tmp13 = tmp10 * tmp12
tmp14 = tmp7 * tmp13
tmp15 = tmp3 - tmp14
tmp17 = tmp16 * tmp9
tmp18 = tmp15 - tmp17
tmp20 = tmp11 * tmp19
tmp21 = tmp18 * tmp20
tmp22 = tmp21.to(tl.float32)
tl.store(in_out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp22, None)
def get_args():
arg_0 = rand_strided((128, 960, 14, 14), (188160, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 960, 14, 14), (188160, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_2 = rand_strided((128, 960, 14, 14), (188160, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_3 = rand_strided((1, 960, 1, 1), (960, 1, 1, 1), device='cuda:0', dtype=torch.float32)
arg_4 = rand_strided((960,), (1,), device='cuda:0', dtype=torch.float32)
arg_5 = rand_strided((960,), (1,), device='cuda:0', dtype=torch.float32)
arg_6 = rand_strided((960,), (1,), device='cuda:0', dtype=torch.float32)
arg_7 = rand_strided((960,), (1,), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__native_batch_norm_legit_functional_convolution_backward_mul_native_batch_norm_backward_58.run(*args, 24084480, grid=grid(24084480), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__native_batch_norm_legit_functional_convolution_backward_mul_native_batch_norm_backward_58.benchmark_all_configs(*args, 24084480, grid=grid(24084480))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/v5/cv5qx4oyij3hkwbqtmi4skgsidu4qp4eyvxkday5hyy3p7hf74af.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_498
# NOTE(review): auto-generated Inductor kernel; the string below is the compiled
# payload and must stay byte-identical — do not hand-edit.
# What it does: elementwise fp16 -> fp32 cast of 153600 elements, shaped
# (960, 160, 1, 1) per get_args() — presumably a 1x1 pointwise-conv weight.
triton_poi_fused__to_copy_59 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[262144], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_59(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 153600
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), None).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, None)
def get_args():
arg_0 = rand_strided((960, 160, 1, 1), (160, 1, 1, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((960, 160, 1, 1), (160, 1, 1, 1), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_59.run(*args, 153600, grid=grid(153600), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_59.benchmark_all_configs(*args, 153600, grid=grid(153600))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/vo/cvowglnuesyuof2jgfmeotsjew6qfwzd2dbspy34jb56dgnewazc.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.native_batch_norm_backward
# aten._native_batch_norm_legit_functional => convert_element_type_304
# aten.native_batch_norm_backward => convert_element_type_499, mul_639, sub_126, sum_40, sum_41
# NOTE(review): auto-generated Inductor kernel; the string below is the compiled
# payload and must stay byte-identical — do not hand-edit.
# What it does: first stage of a split batch-norm-backward reduction over a
# (128, 160, 14, 14) fp16 pair. The 128-sample batch is split into 4 chunks of 32
# (1003520 = 32*31360); xindex encodes (channel x0 in [0,160), chunk x1 in [0,4)),
# each reducing 6272 = 32*196 elements into (160, 4) partials:
#   out_ptr0[c, s] = sum(grad)
#   out_ptr1[c, s] = sum(grad * (x - in_ptr2[c]))   # in_ptr2 presumably saved mean
# Kernels _61 / _62 below fold the 4 partials per channel into final sums.
triton_red_fused__native_batch_norm_legit_functional_native_batch_norm_backward_60 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@reduction(
size_hints=[1024, 8192],
reduction_hint=ReductionHint.INNER,
filename=__file__,
meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: 'i32', 6: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_native_batch_norm_backward_60(in_ptr0, in_ptr1, in_ptr2, out_ptr0, out_ptr1, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
xnumel = 640
rnumel = 6272
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
xmask = xindex < xnumel
rbase = tl.arange(0, RBLOCK)[None, :]
x0 = xindex % 160
x1 = (xindex // 160)
_tmp2 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
x3 = xindex
tmp5 = tl.load(in_ptr2 + (x0), xmask)
_tmp8 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
for roffset in range(0, rnumel, RBLOCK):
rindex = roffset + rbase
rmask = rindex < rnumel
r2 = rindex
tmp0 = tl.load(in_ptr0 + ((196*x0) + (31360*(r2 // 196)) + (1003520*x1) + (r2 % 196)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp3 = tl.load(in_ptr1 + ((196*x0) + (31360*(r2 // 196)) + (1003520*x1) + (r2 % 196)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
_tmp2 = tl.where(rmask & xmask, _tmp2 + tmp1, _tmp2)
tmp4 = tmp3.to(tl.float32)
tmp6 = tmp4 - tmp5
tmp7 = tmp1 * tmp6
_tmp8 = tl.where(rmask & xmask, _tmp8 + tmp7, _tmp8)
tmp2 = tl.sum(_tmp2, 1)[:, None]
tl.store(out_ptr0 + x3, tmp2, xmask)
tmp8 = tl.sum(_tmp8, 1)[:, None]
tl.store(out_ptr1 + x3, tmp8, xmask)
def get_args():
arg_0 = rand_strided((128, 160, 14, 14), (31360, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 160, 14, 14), (31360, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_2 = rand_strided((1, 160, 1, 1), (160, 1, 1, 1), device='cuda:0', dtype=torch.float32)
arg_3 = rand_strided((160, 4), (1, 160), device='cuda:0', dtype=torch.float32)
arg_4 = rand_strided((160, 4), (1, 160), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1, arg_2, arg_3, arg_4,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_red_fused__native_batch_norm_legit_functional_native_batch_norm_backward_60.run(*args, 640, 6272, grid=grid(640), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_red_fused__native_batch_norm_legit_functional_native_batch_norm_backward_60.benchmark_all_configs(*args, 640, 6272, grid=grid(640))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/v7/cv7kgtxh5cmxf5wfqyzhdvjjtikzk2pxpebuocdoaffoo64kkmaj.py
# Original ATen: aten.native_batch_norm_backward
# aten.native_batch_norm_backward => convert_element_type_499, sum_40
# NOTE(review): auto-generated Inductor kernel; the string below is the compiled
# payload and must stay byte-identical — do not hand-edit.
# What it does: second stage of the split reduction started by kernel _60 above —
# for each of 160 channels, sums the 4 fp32 partials of a (160, 4) buffer
# (stride (1, 160)) into a single (160,) total, in one persistent-reduction pass.
triton_per_fused_native_batch_norm_backward_61 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import persistent_reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@persistent_reduction(
size_hints=[256, 4],
reduction_hint=ReductionHint.INNER,
filename=__file__,
meta={'signature': {0: '*fp32', 1: '*fp32', 2: 'i32', 3: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]}
)
@triton.jit
def triton_per_fused_native_batch_norm_backward_61(in_ptr0, out_ptr0, xnumel, rnumel, XBLOCK : tl.constexpr):
xnumel = 160
rnumel = 4
RBLOCK: tl.constexpr = 4
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
xmask = xindex < xnumel
rindex = tl.arange(0, RBLOCK)[None, :]
rmask = rindex < rnumel
r1 = rindex
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0 + (160*r1)), rmask & xmask, other=0)
tmp2 = tl.where(rmask & xmask, tmp0, 0)
tmp3 = tl.sum(tmp2, 1)[:, None]
tl.store(out_ptr0 + x0, tmp3, xmask)
def get_args():
arg_0 = rand_strided((160, 4), (1, 160), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((160,), (1,), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_per_fused_native_batch_norm_backward_61.run(*args, 160, 4, grid=grid(160), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_per_fused_native_batch_norm_backward_61.benchmark_all_configs(*args, 160, 4, grid=grid(160))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/ie/cie5izdtlxlumenajauzzkl6hk7z6q5jgsvyedwv7n3ohhf6bq2s.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.native_batch_norm_backward
# aten._native_batch_norm_legit_functional => convert_element_type_304
# aten.native_batch_norm_backward => convert_element_type_499, mul_639, mul_647, sub_126, sum_41
triton_per_fused__native_batch_norm_legit_functional_native_batch_norm_backward_62 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import persistent_reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@persistent_reduction(
size_hints=[256, 4],
reduction_hint=ReductionHint.INNER,
filename=__file__,
meta={'signature': {0: '*fp32', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: 'i32', 5: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4), equal_to_1=())]}
)
@triton.jit
def triton_per_fused__native_batch_norm_legit_functional_native_batch_norm_backward_62(in_ptr0, in_ptr1, out_ptr0, out_ptr1, xnumel, rnumel, XBLOCK : tl.constexpr):
xnumel = 160
rnumel = 4
RBLOCK: tl.constexpr = 4
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
xmask = xindex < xnumel
rindex = tl.arange(0, RBLOCK)[None, :]
rmask = rindex < rnumel
r1 = rindex
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0 + (160*r1)), rmask & xmask, other=0)
tmp4 = tl.load(in_ptr1 + (x0), xmask)
tmp2 = tl.where(rmask & xmask, tmp0, 0)
tmp3 = tl.sum(tmp2, 1)[:, None]
tmp5 = tmp3 * tmp4
tl.store(out_ptr1 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp5, xmask)
tl.store(out_ptr0 + x0, tmp3, xmask)
def get_args():
arg_0 = rand_strided((160, 4), (1, 160), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((160,), (1,), device='cuda:0', dtype=torch.float32)
arg_2 = rand_strided((160,), (1,), device='cuda:0', dtype=torch.float32)
arg_3 = rand_strided((160,), (1,), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1, arg_2, arg_3,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_per_fused__native_batch_norm_legit_functional_native_batch_norm_backward_62.run(*args, 160, 4, grid=grid(160), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_per_fused__native_batch_norm_legit_functional_native_batch_norm_backward_62.benchmark_all_configs(*args, 160, 4, grid=grid(160))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/lu/clucluq3crlm7gshoftvwdbylf2hruep5zy2kdudpczi5ai6bwrm.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.native_batch_norm_backward
# aten._native_batch_norm_legit_functional => convert_element_type_304
# aten.native_batch_norm_backward => convert_element_type_499, convert_element_type_501, mul_645, mul_646, sub_126, sub_128, sub_129
triton_poi_fused__native_batch_norm_legit_functional_native_batch_norm_backward_63 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp32', 6: '*fp32', 7: '*fp16', 8: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8), equal_to_1=())]})
@triton.jit
def triton_poi_fused__native_batch_norm_legit_functional_native_batch_norm_backward_63(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, in_ptr6, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 4014080
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x3 = xindex
x1 = (xindex // 196) % 160
tmp0 = tl.load(in_ptr0 + (x3), None).to(tl.float32)
tmp2 = tl.load(in_ptr1 + (x3), None).to(tl.float32)
tmp4 = tl.load(in_ptr2 + (x1), None)
tmp6 = tl.load(in_ptr3 + (x1), None)
tmp9 = tl.load(in_ptr4 + (x1), None)
tmp14 = tl.load(in_ptr5 + (x1), None)
tmp17 = tl.load(in_ptr6 + (x1), None)
tmp1 = tmp0.to(tl.float32)
tmp3 = tmp2.to(tl.float32)
tmp5 = tmp3 - tmp4
tmp7 = 3.985969387755102e-05
tmp8 = tmp6 * tmp7
tmp10 = tmp9 * tmp9
tmp11 = tmp8 * tmp10
tmp12 = tmp5 * tmp11
tmp13 = tmp1 - tmp12
tmp15 = tmp14 * tmp7
tmp16 = tmp13 - tmp15
tmp18 = tmp9 * tmp17
tmp19 = tmp16 * tmp18
tmp20 = tmp19.to(tl.float32)
tl.store(out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp20, None)
def get_args():
arg_0 = rand_strided((128, 160, 14, 14), (31360, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 160, 14, 14), (31360, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_2 = rand_strided((1, 160, 1, 1), (160, 1, 1, 1), device='cuda:0', dtype=torch.float32)
arg_3 = rand_strided((160,), (1,), device='cuda:0', dtype=torch.float32)
arg_4 = rand_strided((160,), (1,), device='cuda:0', dtype=torch.float32)
arg_5 = rand_strided((160,), (1,), device='cuda:0', dtype=torch.float32)
arg_6 = rand_strided((160,), (1,), device='cuda:0', dtype=torch.float32)
arg_7 = rand_strided((128, 160, 14, 14), (31360, 196, 14, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__native_batch_norm_legit_functional_native_batch_norm_backward_63.run(*args, 4014080, grid=grid(4014080), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__native_batch_norm_legit_functional_native_batch_norm_backward_63.benchmark_all_configs(*args, 4014080, grid=grid(4014080))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/u6/cu6t7jgofejrpr6jiujpu5wwc5mlffc5k4hwyiqtb7tiyjxa6ec4.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_502
triton_poi_fused__to_copy_64 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[32768], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_64(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 19200
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((80, 240, 1, 1), (240, 1, 1, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((80, 240, 1, 1), (240, 1, 1, 1), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_64.run(*args, 19200, grid=grid(19200), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_64.benchmark_all_configs(*args, 19200, grid=grid(19200))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/rj/crjqpfvxgdlaoka6zisl4dxwjeaxw453cmtsf23px34uebd6u4oo.py
# Original ATen: aten.cat
# aten.cat => cat_48
triton_poi_fused_cat_65 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[8388608], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_cat_65(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 6021120
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x2 = xindex
x0 = xindex % 47040
x1 = (xindex // 47040)
tmp0 = tl.load(in_ptr0 + (x2), None).to(tl.float32)
tl.store(out_ptr0 + (x0 + (94080*x1) + tl.zeros([XBLOCK], tl.int32)), tmp0, None)
def get_args():
arg_0 = rand_strided((128, 240, 14, 14), (47040, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 240, 14, 14), (94080, 196, 14, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_cat_65.run(*args, 6021120, grid=grid(6021120), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_cat_65.benchmark_all_configs(*args, 6021120, grid=grid(6021120))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/rc/crc555gyfmwt6m3oebfawzf77swacndeysz6litdvgmtz7aw3cnb.py
# Original ATen: aten.mul, aten.sigmoid, aten.sigmoid_backward, aten.silu, aten.sum
# aten.mul => mul_648
# aten.sigmoid => sigmoid_47
# aten.sigmoid_backward => convert_element_type_504, convert_element_type_505, convert_element_type_506, mul_650, mul_651, sub_130
# aten.silu => convert_element_type_294, convert_element_type_295, mul_353, sigmoid_45
# aten.sum => sum_42
triton_per_fused_mul_sigmoid_sigmoid_backward_silu_sum_66 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import persistent_reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@persistent_reduction(
size_hints=[65536, 256],
reduction_hint=ReductionHint.INNER,
filename=__file__,
meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp16', 4: 'i32', 5: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4), equal_to_1=())]}
)
@triton.jit
def triton_per_fused_mul_sigmoid_sigmoid_backward_silu_sum_66(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, xnumel, rnumel, XBLOCK : tl.constexpr):
xnumel = 61440
rnumel = 196
RBLOCK: tl.constexpr = 256
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
xmask = xindex < xnumel
rindex = tl.arange(0, RBLOCK)[None, :]
rmask = rindex < rnumel
r1 = rindex
x0 = xindex
tmp0 = tl.load(in_ptr0 + (r1 + (196*x0)), rmask, other=0).to(tl.float32)
tmp1 = tl.load(in_ptr1 + (r1 + (196*x0)), rmask, other=0).to(tl.float32)
tmp11 = tl.load(in_ptr2 + (x0), None).to(tl.float32)
tmp2 = tmp1.to(tl.float32)
tmp3 = tl.sigmoid(tmp2)
tmp4 = tmp2 * tmp3
tmp5 = tmp4.to(tl.float32)
tmp6 = tmp0 * tmp5
tmp8 = tl.where(rmask, tmp6, 0)
tmp9 = tl.sum(tmp8, 1)[:, None]
tmp10 = tmp9.to(tl.float32)
tmp12 = tl.sigmoid(tmp11)
tmp13 = tmp12.to(tl.float32)
tmp14 = 1.0
tmp15 = tmp14 - tmp13
tmp16 = tmp13 * tmp15
tmp17 = tmp10 * tmp16
tmp18 = tmp17.to(tl.float32)
tl.store(in_out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp18, None)
def get_args():
arg_0 = rand_strided((128, 480, 1, 1), (480, 1, 1, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 480, 14, 14), (94080, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_2 = rand_strided((128, 480, 14, 14), (94080, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_3 = rand_strided((128, 480, 1, 1), (480, 1, 1, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1, arg_2, arg_3,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_per_fused_mul_sigmoid_sigmoid_backward_silu_sum_66.run(*args, 61440, 196, grid=grid(61440), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_per_fused_mul_sigmoid_sigmoid_backward_silu_sum_66.benchmark_all_configs(*args, 61440, 196, grid=grid(61440))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/n4/cn4fg2jpjbb2zgpwsyrzkkvrz4ql4zzxfplbwfmfeem7estzylry.py
# Original ATen: aten._to_copy, aten.convolution_backward
# aten._to_copy => convert_element_type_508
# aten.convolution_backward => sum_43
triton_per_fused__to_copy_convolution_backward_67 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import persistent_reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@persistent_reduction(
size_hints=[512, 128],
reduction_hint=ReductionHint.INNER,
filename=__file__,
meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32', 3: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3), equal_to_1=())]}
)
@triton.jit
def triton_per_fused__to_copy_convolution_backward_67(in_ptr0, out_ptr1, xnumel, rnumel, XBLOCK : tl.constexpr):
xnumel = 480
rnumel = 128
RBLOCK: tl.constexpr = 128
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
xmask = xindex < xnumel
rindex = tl.arange(0, RBLOCK)[None, :]
rmask = rindex < rnumel
r1 = rindex
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0 + (480*r1)), rmask & xmask, other=0).to(tl.float32)
tmp2 = tl.where(rmask & xmask, tmp0, 0)
tmp3 = tl.sum(tmp2, 1)[:, None]
tmp4 = tmp3.to(tl.float32)
tl.store(out_ptr1 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp4, xmask)
def get_args():
arg_0 = rand_strided((128, 480, 1, 1), (480, 1, 1, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((480,), (1,), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_per_fused__to_copy_convolution_backward_67.run(*args, 480, 128, grid=grid(480), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_per_fused__to_copy_convolution_backward_67.benchmark_all_configs(*args, 480, 128, grid=grid(480))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/gx/cgx4he25cg6tmu62uuz3hbvmsgstxlvguwwpdavmmk5v7cb53khs.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_507
triton_poi_fused__to_copy_68 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[65536], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_68(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 38400
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((480, 80, 1, 1), (80, 1, 1, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((480, 80, 1, 1), (80, 1, 1, 1), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_68.run(*args, 38400, grid=grid(38400), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_68.benchmark_all_configs(*args, 38400, grid=grid(38400))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/57/c57nn4lxfd7xfnwbsod5xu57xcqkrswf7vdbyz76bditl462irc4.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_509
triton_poi_fused__to_copy_69 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[65536], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_69(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 38400
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((80, 480, 1, 1), (480, 1, 1, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((80, 480, 1, 1), (480, 1, 1, 1), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_69.run(*args, 38400, grid=grid(38400), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_69.benchmark_all_configs(*args, 38400, grid=grid(38400))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/5q/c5qqp6x25kopkdsx6gj3r2vuos77mpwiiyja2ob3sfsgwvxevysq.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.add, aten.clone, aten.div, aten.fill, aten.mul, aten.native_batch_norm_backward, aten.sigmoid, aten.sub
# aten._native_batch_norm_legit_functional => convert_element_type_292
# aten.add => add_324, add_325
# aten.clone => clone_34
# aten.div => div_5
# aten.fill => full_like_13
# aten.mul => mul_649, mul_655, mul_656, mul_657
# aten.native_batch_norm_backward => convert_element_type_511, mul_658, mul_666, sub_133, sum_45, sum_46
# aten.sigmoid => sigmoid_47, sigmoid_77
# aten.sub => sub_132
triton_red_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_70 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@reduction(
size_hints=[512, 32768],
reduction_hint=ReductionHint.INNER,
filename=__file__,
meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp16', 4: '*fp16', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: '*fp32', 9: '*fp32', 10: 'i32', 11: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_70(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, in_ptr6, out_ptr0, out_ptr1, out_ptr2, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
xnumel = 480
rnumel = 25088
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
xmask = xindex < xnumel
rbase = tl.arange(0, RBLOCK)[None, :]
x0 = xindex
_tmp17 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
tmp20 = tl.load(in_ptr5 + (x0), xmask)
_tmp23 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
for roffset in range(0, rnumel, RBLOCK):
rindex = roffset + rbase
rmask = rindex < rnumel
r1 = rindex % 196
r2 = (rindex // 196)
tmp0 = tl.load(in_ptr0 + (r1 + (196*x0) + (94080*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp1 = tl.load(in_ptr1 + (x0 + (480*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp4 = tl.load(in_ptr2 + (x0 + (480*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp8 = tl.load(in_ptr3 + (r1 + (196*x0) + (94080*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp18 = tl.load(in_ptr4 + (r1 + (196*x0) + (94080*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp2 = tl.sigmoid(tmp1)
tmp3 = tmp0 * tmp2
tmp5 = 196.0
tmp6 = tmp4 / tmp5
tmp7 = tmp3 + tmp6
tmp9 = tl.sigmoid(tmp8)
tmp10 = 1.0
tmp11 = tmp10 - tmp9
tmp12 = tmp8 * tmp11
tmp13 = tmp12 + tmp10
tmp14 = tmp9 * tmp13
tmp15 = tmp7 * tmp14
tmp16 = tmp15.to(tl.float32)
_tmp17 = tl.where(rmask & xmask, _tmp17 + tmp16, _tmp17)
tmp19 = tmp18.to(tl.float32)
tmp21 = tmp19 - tmp20
tmp22 = tmp16 * tmp21
_tmp23 = tl.where(rmask & xmask, _tmp23 + tmp22, _tmp23)
tmp17 = tl.sum(_tmp17, 1)[:, None]
tl.store(out_ptr0 + x0, tmp17, xmask)
tmp23 = tl.sum(_tmp23, 1)[:, None]
tl.store(out_ptr1 + x0, tmp23, xmask)
tmp24 = tl.load(in_ptr6 + (x0), xmask)
tmp25 = tmp23 * tmp24
tl.store(out_ptr2 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp25, xmask)
def get_args():
arg_0 = rand_strided((128, 480, 14, 14), (94080, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 480, 1, 1), (480, 1, 1, 1), device='cuda:0', dtype=torch.float16)
arg_2 = rand_strided((128, 480, 1, 1), (480, 1, 1, 1), device='cuda:0', dtype=torch.float16)
arg_3 = rand_strided((128, 480, 14, 14), (94080, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_4 = rand_strided((128, 480, 14, 14), (94080, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_5 = rand_strided((1, 480, 1, 1), (480, 1, 1, 1), device='cuda:0', dtype=torch.float32)
arg_6 = rand_strided((480,), (1,), device='cuda:0', dtype=torch.float32)
arg_7 = rand_strided((480,), (1,), device='cuda:0', dtype=torch.float32)
arg_8 = rand_strided((480,), (1,), device='cuda:0', dtype=torch.float32)
arg_9 = rand_strided((480,), (1,), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7, arg_8, arg_9,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_red_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_70.run(*args, 480, 25088, grid=grid(480), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_red_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_70.benchmark_all_configs(*args, 480, 25088, grid=grid(480))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/qc/cqcavzge4fgj7thkzvjlxnonwb5xlzaboopggaaauvq4wfdtklsv.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.add, aten.clone, aten.div, aten.fill, aten.mul, aten.native_batch_norm_backward, aten.sigmoid, aten.sub
# aten._native_batch_norm_legit_functional => convert_element_type_292
# aten.add => add_324, add_325
# aten.clone => clone_34
# aten.div => div_5
# aten.fill => full_like_13
# aten.mul => mul_649, mul_655, mul_656, mul_657
# aten.native_batch_norm_backward => convert_element_type_511, mul_664, sub_133, sub_135, sub_136
# aten.sigmoid => sigmoid_47, sigmoid_77
# aten.sub => sub_132
triton_poi_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_71 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[16777216], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp16', 4: '*fp16', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: '*fp32', 9: '*fp32', 10: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10), equal_to_1=())]})
@triton.jit
def triton_poi_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_71(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, in_ptr6, in_ptr7, in_ptr8, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 12042240
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x3 = xindex
x4 = (xindex // 196)
x1 = (xindex // 196) % 480
tmp0 = tl.load(in_ptr0 + (x3), None).to(tl.float32)
tmp1 = tl.load(in_ptr1 + (x4), None).to(tl.float32)
tmp4 = tl.load(in_ptr2 + (x4), None).to(tl.float32)
tmp8 = tl.load(in_ptr3 + (x3), None).to(tl.float32)
tmp17 = tl.load(in_ptr4 + (x3), None).to(tl.float32)
tmp19 = tl.load(in_ptr5 + (x1), None)
tmp21 = tl.load(in_ptr6 + (x1), None)
tmp24 = tl.load(in_ptr7 + (x1), None)
tmp29 = tl.load(in_ptr8 + (x1), None)
tmp2 = tl.sigmoid(tmp1)
tmp3 = tmp0 * tmp2
tmp5 = 196.0
tmp6 = tmp4 / tmp5
tmp7 = tmp3 + tmp6
tmp9 = tl.sigmoid(tmp8)
tmp10 = 1.0
tmp11 = tmp10 - tmp9
tmp12 = tmp8 * tmp11
tmp13 = tmp12 + tmp10
tmp14 = tmp9 * tmp13
tmp15 = tmp7 * tmp14
tmp16 = tmp15.to(tl.float32)
tmp18 = tmp17.to(tl.float32)
tmp20 = tmp18 - tmp19
tmp22 = 3.985969387755102e-05
tmp23 = tmp21 * tmp22
tmp25 = tmp24 * tmp24
tmp26 = tmp23 * tmp25
tmp27 = tmp20 * tmp26
tmp28 = tmp16 - tmp27
tmp30 = tmp29 * tmp22
tmp31 = tmp28 - tmp30
tl.store(out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp31, None)
def get_args():
arg_0 = rand_strided((128, 480, 14, 14), (94080, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 480, 1, 1), (480, 1, 1, 1), device='cuda:0', dtype=torch.float16)
arg_2 = rand_strided((128, 480, 1, 1), (480, 1, 1, 1), device='cuda:0', dtype=torch.float16)
arg_3 = rand_strided((128, 480, 14, 14), (94080, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_4 = rand_strided((128, 480, 14, 14), (94080, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_5 = rand_strided((1, 480, 1, 1), (480, 1, 1, 1), device='cuda:0', dtype=torch.float32)
arg_6 = rand_strided((480,), (1,), device='cuda:0', dtype=torch.float32)
arg_7 = rand_strided((480,), (1,), device='cuda:0', dtype=torch.float32)
arg_8 = rand_strided((480,), (1,), device='cuda:0', dtype=torch.float32)
arg_9 = rand_strided((128, 480, 14, 14), (94080, 196, 14, 1), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7, arg_8, arg_9,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_71.run(*args, 12042240, grid=grid(12042240), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_71.benchmark_all_configs(*args, 12042240, grid=grid(12042240))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/ya/cya3odioikqnsayl73izdekuxqhs5rqfkr2pft6i3rty3tyluanc.py
# Original ATen: aten.convolution_backward
# aten.convolution_backward => convolution_backward_40
triton_poi_fused_convolution_backward_72 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp32', 2: '*fp32', 3: '*fp16', 4: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4), equal_to_1=())]})
@triton.jit
def triton_poi_fused_convolution_backward_72(in_ptr0, in_ptr1, in_ptr2, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 3010560
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x2 = (xindex // 23520)
x3 = xindex % 23520
x1 = (xindex // 196) % 120
x4 = xindex
tmp0 = tl.load(in_ptr0 + (70560 + x3 + (94080*x2)), None)
tmp1 = tl.load(in_ptr1 + (360 + x1), None)
tmp2 = tl.load(in_ptr2 + (360 + x1), None)
tmp3 = tmp1 * tmp2
tmp4 = tmp0 * tmp3
tmp5 = tmp4.to(tl.float32)
tl.store(out_ptr0 + (x4 + tl.zeros([XBLOCK], tl.int32)), tmp5, None)
def get_args():
arg_0 = rand_strided((128, 480, 14, 14), (94080, 196, 14, 1), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((480,), (1,), device='cuda:0', dtype=torch.float32)
arg_2 = rand_strided((480,), (1,), device='cuda:0', dtype=torch.float32)
arg_3 = rand_strided((128, 120, 14, 14), (23520, 196, 14, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1, arg_2, arg_3,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_convolution_backward_72.run(*args, 3010560, grid=grid(3010560), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_convolution_backward_72.benchmark_all_configs(*args, 3010560, grid=grid(3010560))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/fe/cfetbcwbedjsjshgd4bjuqpljqnapb5dcgp46ajfwwvj2dk7ne2i.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_514
triton_poi_fused__to_copy_73 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[16384], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_73(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 9720
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((120, 1, 9, 9), (81, 81, 9, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((120, 1, 9, 9), (81, 81, 9, 1), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_73.run(*args, 9720, grid=grid(9720), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_73.benchmark_all_configs(*args, 9720, grid=grid(9720))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/iq/ciqofepsagke43sfmdrbzwfpkfw4zvr3els5r3zwk7gigvnpcypw.py
# Original ATen: aten.convolution_backward
# aten.convolution_backward => convolution_backward_41
triton_poi_fused_convolution_backward_74 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp32', 2: '*fp32', 3: '*fp16', 4: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4), equal_to_1=())]})
@triton.jit
def triton_poi_fused_convolution_backward_74(in_ptr0, in_ptr1, in_ptr2, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 3010560
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x2 = (xindex // 23520)
x3 = xindex % 23520
x1 = (xindex // 196) % 120
x4 = xindex
tmp0 = tl.load(in_ptr0 + (47040 + x3 + (94080*x2)), None)
tmp1 = tl.load(in_ptr1 + (240 + x1), None)
tmp2 = tl.load(in_ptr2 + (240 + x1), None)
tmp3 = tmp1 * tmp2
tmp4 = tmp0 * tmp3
tmp5 = tmp4.to(tl.float32)
tl.store(out_ptr0 + (x4 + tl.zeros([XBLOCK], tl.int32)), tmp5, None)
def get_args():
arg_0 = rand_strided((128, 480, 14, 14), (94080, 196, 14, 1), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((480,), (1,), device='cuda:0', dtype=torch.float32)
arg_2 = rand_strided((480,), (1,), device='cuda:0', dtype=torch.float32)
arg_3 = rand_strided((128, 120, 14, 14), (23520, 196, 14, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1, arg_2, arg_3,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_convolution_backward_74.run(*args, 3010560, grid=grid(3010560), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_convolution_backward_74.benchmark_all_configs(*args, 3010560, grid=grid(3010560))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/dy/cdybcagplfulpsthbc6wmswmxelkrixxqqrrt72daichdjv2sefi.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_515
triton_poi_fused__to_copy_75 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[8192], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_75(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 5880
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((120, 1, 7, 7), (49, 49, 7, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((120, 1, 7, 7), (49, 49, 7, 1), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_75.run(*args, 5880, grid=grid(5880), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_75.benchmark_all_configs(*args, 5880, grid=grid(5880))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/uy/cuyidmj6qefewok45jcr4rfmsmcj6hczovevzzaxuc52uf6hh3nc.py
# Original ATen: aten.convolution_backward
# aten.convolution_backward => convolution_backward_42
triton_poi_fused_convolution_backward_76 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp32', 2: '*fp32', 3: '*fp16', 4: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4), equal_to_1=())]})
@triton.jit
def triton_poi_fused_convolution_backward_76(in_ptr0, in_ptr1, in_ptr2, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 3010560
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x2 = (xindex // 23520)
x3 = xindex % 23520
x1 = (xindex // 196) % 120
x4 = xindex
tmp0 = tl.load(in_ptr0 + (23520 + x3 + (94080*x2)), None)
tmp1 = tl.load(in_ptr1 + (120 + x1), None)
tmp2 = tl.load(in_ptr2 + (120 + x1), None)
tmp3 = tmp1 * tmp2
tmp4 = tmp0 * tmp3
tmp5 = tmp4.to(tl.float32)
tl.store(out_ptr0 + (x4 + tl.zeros([XBLOCK], tl.int32)), tmp5, None)
def get_args():
arg_0 = rand_strided((128, 480, 14, 14), (94080, 196, 14, 1), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((480,), (1,), device='cuda:0', dtype=torch.float32)
arg_2 = rand_strided((480,), (1,), device='cuda:0', dtype=torch.float32)
arg_3 = rand_strided((128, 120, 14, 14), (23520, 196, 14, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1, arg_2, arg_3,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_convolution_backward_76.run(*args, 3010560, grid=grid(3010560), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_convolution_backward_76.benchmark_all_configs(*args, 3010560, grid=grid(3010560))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/ux/cuxkdni67km5gfbdihbo5i44iivskoy6hdt7gubbh5qgb5mkwsob.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_516
triton_poi_fused__to_copy_77 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4096], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_77(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 3000
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((120, 1, 5, 5), (25, 25, 5, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((120, 1, 5, 5), (25, 25, 5, 1), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_77.run(*args, 3000, grid=grid(3000), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_77.benchmark_all_configs(*args, 3000, grid=grid(3000))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/4r/c4rld6qtfki5aedwz7g3eqwmnt7w4mmzwymiotpz4adjvg43hqwk.py
# Original ATen: aten.convolution_backward
# aten.convolution_backward => convolution_backward_43
triton_poi_fused_convolution_backward_78 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp32', 2: '*fp32', 3: '*fp16', 4: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4), equal_to_1=())]})
@triton.jit
def triton_poi_fused_convolution_backward_78(in_ptr0, in_ptr1, in_ptr2, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 3010560
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x2 = (xindex // 23520)
x3 = xindex % 23520
x1 = (xindex // 196) % 120
x4 = xindex
tmp0 = tl.load(in_ptr0 + (x3 + (94080*x2)), None)
tmp1 = tl.load(in_ptr1 + (x1), None)
tmp2 = tl.load(in_ptr2 + (x1), None)
tmp3 = tmp1 * tmp2
tmp4 = tmp0 * tmp3
tmp5 = tmp4.to(tl.float32)
tl.store(out_ptr0 + (x4 + tl.zeros([XBLOCK], tl.int32)), tmp5, None)
def get_args():
arg_0 = rand_strided((128, 480, 14, 14), (94080, 196, 14, 1), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((480,), (1,), device='cuda:0', dtype=torch.float32)
arg_2 = rand_strided((480,), (1,), device='cuda:0', dtype=torch.float32)
arg_3 = rand_strided((128, 120, 14, 14), (23520, 196, 14, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1, arg_2, arg_3,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_convolution_backward_78.run(*args, 3010560, grid=grid(3010560), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_convolution_backward_78.benchmark_all_configs(*args, 3010560, grid=grid(3010560))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/ri/crixsdzqwpepdejipqi3ixt4vlyrmrskq5hgkzn7jumpuurao5h5.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_517
triton_poi_fused__to_copy_79 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[2048], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_79(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 1080
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((120, 1, 3, 3), (9, 9, 3, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((120, 1, 3, 3), (9, 9, 3, 1), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_79.run(*args, 1080, grid=grid(1080), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_79.benchmark_all_configs(*args, 1080, grid=grid(1080))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/ty/ctycfmevxfpn542kh5agvjqilg3o5r77udr2h3wyqu23mqhlqdxf.py
# Original ATen: aten.cat
# aten.cat => cat_49
triton_poi_fused_cat_80 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_cat_80(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 3010560
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x2 = xindex
x0 = xindex % 23520
x1 = (xindex // 23520)
tmp0 = tl.load(in_ptr0 + (x2), None).to(tl.float32)
tl.store(out_ptr0 + (x0 + (94080*x1) + tl.zeros([XBLOCK], tl.int32)), tmp0, None)
def get_args():
arg_0 = rand_strided((128, 120, 14, 14), (23520, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 120, 14, 14), (94080, 196, 14, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_cat_80.run(*args, 3010560, grid=grid(3010560), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_cat_80.benchmark_all_configs(*args, 3010560, grid=grid(3010560))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/7i/c7iiiqxahzgxonlzcskqwzfumjrx6cupqeinuz5pkzk66wqnnnyc.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.mul, aten.native_batch_norm_backward
# aten._native_batch_norm_legit_functional => convert_element_type_284
# aten.mul => mul_669
# aten.native_batch_norm_backward => convert_element_type_518, mul_670, mul_678, sub_138, sum_47, sum_48
triton_red_fused__native_batch_norm_legit_functional_mul_native_batch_norm_backward_81 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@reduction(
size_hints=[512, 32768],
reduction_hint=ReductionHint.INNER,
filename=__file__,
meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp32', 4: '*fp32', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: 'i32', 9: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_mul_native_batch_norm_backward_81(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, out_ptr0, out_ptr1, out_ptr2, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
xnumel = 480
rnumel = 25088
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
xmask = xindex < xnumel
rbase = tl.arange(0, RBLOCK)[None, :]
x0 = xindex
_tmp4 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
tmp7 = tl.load(in_ptr3 + (x0), xmask)
_tmp10 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
for roffset in range(0, rnumel, RBLOCK):
rindex = roffset + rbase
rmask = rindex < rnumel
r1 = rindex % 196
r2 = (rindex // 196)
tmp0 = tl.load(in_ptr0 + (r1 + (196*x0) + (94080*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp1 = tl.load(in_ptr1 + (r1 + (196*x0) + (94080*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp5 = tl.load(in_ptr2 + (r1 + (196*x0) + (94080*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp2 = tmp0 * tmp1
tmp3 = tmp2.to(tl.float32)
_tmp4 = tl.where(rmask & xmask, _tmp4 + tmp3, _tmp4)
tmp6 = tmp5.to(tl.float32)
tmp8 = tmp6 - tmp7
tmp9 = tmp3 * tmp8
_tmp10 = tl.where(rmask & xmask, _tmp10 + tmp9, _tmp10)
tmp4 = tl.sum(_tmp4, 1)[:, None]
tl.store(out_ptr0 + x0, tmp4, xmask)
tmp10 = tl.sum(_tmp10, 1)[:, None]
tl.store(out_ptr1 + x0, tmp10, xmask)
tmp11 = tl.load(in_ptr4 + (x0), xmask)
tmp12 = tmp10 * tmp11
tl.store(out_ptr2 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp12, xmask)
def get_args():
arg_0 = rand_strided((128, 480, 14, 14), (94080, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 480, 14, 14), (94080, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_2 = rand_strided((128, 480, 14, 14), (94080, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_3 = rand_strided((1, 480, 1, 1), (480, 1, 1, 1), device='cuda:0', dtype=torch.float32)
arg_4 = rand_strided((480,), (1,), device='cuda:0', dtype=torch.float32)
arg_5 = rand_strided((480,), (1,), device='cuda:0', dtype=torch.float32)
arg_6 = rand_strided((480,), (1,), device='cuda:0', dtype=torch.float32)
arg_7 = rand_strided((480,), (1,), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_red_fused__native_batch_norm_legit_functional_mul_native_batch_norm_backward_81.run(*args, 480, 25088, grid=grid(480), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_red_fused__native_batch_norm_legit_functional_mul_native_batch_norm_backward_81.benchmark_all_configs(*args, 480, 25088, grid=grid(480))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/v4/cv4k7goz3fg6grlbcjwhdwg6ck3vmlxtj3l76l5phqjwc5cao5b3.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.mul, aten.native_batch_norm_backward
# aten._native_batch_norm_legit_functional => convert_element_type_284
# aten.mul => mul_669
# aten.native_batch_norm_backward => convert_element_type_518, convert_element_type_520, mul_676, mul_677, sub_138, sub_140, sub_141
triton_poi_fused__native_batch_norm_legit_functional_mul_native_batch_norm_backward_82 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[16777216], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp32', 4: '*fp32', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8), equal_to_1=())]})
@triton.jit
def triton_poi_fused__native_batch_norm_legit_functional_mul_native_batch_norm_backward_82(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, in_ptr6, xnumel, XBLOCK : tl.constexpr):
xnumel = 12042240
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x3 = xindex
x1 = (xindex // 196) % 480
tmp0 = tl.load(in_out_ptr0 + (x3), None).to(tl.float32)
tmp1 = tl.load(in_ptr0 + (x3), None).to(tl.float32)
tmp4 = tl.load(in_ptr1 + (x3), None).to(tl.float32)
tmp6 = tl.load(in_ptr2 + (x1), None)
tmp8 = tl.load(in_ptr3 + (x1), None)
tmp11 = tl.load(in_ptr4 + (x1), None)
tmp16 = tl.load(in_ptr5 + (x1), None)
tmp19 = tl.load(in_ptr6 + (x1), None)
tmp2 = tmp0 * tmp1
tmp3 = tmp2.to(tl.float32)
tmp5 = tmp4.to(tl.float32)
tmp7 = tmp5 - tmp6
tmp9 = 3.985969387755102e-05
tmp10 = tmp8 * tmp9
tmp12 = tmp11 * tmp11
tmp13 = tmp10 * tmp12
tmp14 = tmp7 * tmp13
tmp15 = tmp3 - tmp14
tmp17 = tmp16 * tmp9
tmp18 = tmp15 - tmp17
tmp20 = tmp11 * tmp19
tmp21 = tmp18 * tmp20
tmp22 = tmp21.to(tl.float32)
tl.store(in_out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp22, None)
def get_args():
arg_0 = rand_strided((128, 480, 14, 14), (94080, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 480, 14, 14), (94080, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_2 = rand_strided((128, 480, 14, 14), (94080, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_3 = rand_strided((1, 480, 1, 1), (480, 1, 1, 1), device='cuda:0', dtype=torch.float32)
arg_4 = rand_strided((480,), (1,), device='cuda:0', dtype=torch.float32)
arg_5 = rand_strided((480,), (1,), device='cuda:0', dtype=torch.float32)
arg_6 = rand_strided((480,), (1,), device='cuda:0', dtype=torch.float32)
arg_7 = rand_strided((480,), (1,), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__native_batch_norm_legit_functional_mul_native_batch_norm_backward_82.run(*args, 12042240, grid=grid(12042240), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__native_batch_norm_legit_functional_mul_native_batch_norm_backward_82.benchmark_all_configs(*args, 12042240, grid=grid(12042240))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/om/comunyk6xj4yo26dm6rnf6zkadn3bzlwf2dwxosduxrcmjkwwa6u.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_521
triton_poi_fused__to_copy_83 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[32768], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_83(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 19200
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((240, 80, 1, 1), (80, 1, 1, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((240, 80, 1, 1), (80, 1, 1, 1), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_83.run(*args, 19200, grid=grid(19200), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_83.benchmark_all_configs(*args, 19200, grid=grid(19200))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/2y/c2yommjpqgw2nsvcapep2mob6welg6j7akfq7uc4xaaiuzwdalt7.py
# Original ATen: aten.cat
# aten.cat => cat_50
#
# Forward copy for one input of a channel-dim concat: reads a contiguous fp16
# tensor of 2007040 elements (per get_args(): (128, 80, 14, 14), 15680 elements
# per image) and writes each image's 15680 elements at batch stride 31360
# (= 2 * 15680, i.e. 160 channels * 196 spatial).  The store offset has no
# channel displacement, so this fills the FIRST 80-channel half of the
# (128, 160, 14, 14) concat buffer.
# NOTE: the kernel body below is a string literal compiled by async_compile;
# it must not be edited here.
triton_poi_fused_cat_84 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[2097152], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_cat_84(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 2007040
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x2 = xindex
x0 = xindex % 15680
x1 = (xindex // 15680)
tmp0 = tl.load(in_ptr0 + (x2), None).to(tl.float32)
tl.store(out_ptr0 + (x0 + (31360*x1) + tl.zeros([XBLOCK], tl.int32)), tmp0, None)
def get_args():
arg_0 = rand_strided((128, 80, 14, 14), (15680, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 80, 14, 14), (31360, 196, 14, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_cat_84.run(*args, 2007040, grid=grid(2007040), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_cat_84.benchmark_all_configs(*args, 2007040, grid=grid(2007040))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/zy/czyhduwjwdj7bjl64zkqqaz3rymv5xt54uovrwuvmbs3ylr4n3xb.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.add, aten.native_batch_norm_backward
# aten._native_batch_norm_legit_functional => convert_element_type_280
# aten.add => add_327
# aten.native_batch_norm_backward => convert_element_type_523, mul_679, sub_142, sum_49, sum_50
#
# Batch-norm backward reduction stage over a (128, 160, 14, 14) fp16 activation.
# xnumel = 640 = 160 channels (x0) * 4 batch groups (x1); each group reduces
# rnumel = 6272 = 32 images * 196 spatial positions (1003520 = 32 * 31360).
# Per (channel, group) it accumulates in fp32:
#   out_ptr0[x3] = sum(dy)                 with dy = in_ptr0 + in_ptr1 (fused add_327)
#   out_ptr1[x3] = sum(dy * (x - m[c]))    with x from in_ptr2, m per-channel from in_ptr3
# The (160, 4) outputs are partial sums -- presumably combined by a follow-up
# reduction kernel (not visible here).
# NOTE: the kernel body below is a string literal compiled by async_compile;
# it must not be edited here.
triton_red_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_85 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@reduction(
size_hints=[1024, 8192],
reduction_hint=ReductionHint.INNER,
filename=__file__,
meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp32', 4: '*fp32', 5: '*fp32', 6: 'i32', 7: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_85(in_ptr0, in_ptr1, in_ptr2, in_ptr3, out_ptr0, out_ptr1, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
xnumel = 640
rnumel = 6272
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
xmask = xindex < xnumel
rbase = tl.arange(0, RBLOCK)[None, :]
x0 = xindex % 160
x1 = (xindex // 160)
_tmp4 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
x3 = xindex
tmp7 = tl.load(in_ptr3 + (x0), xmask)
_tmp10 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
for roffset in range(0, rnumel, RBLOCK):
rindex = roffset + rbase
rmask = rindex < rnumel
r2 = rindex
tmp0 = tl.load(in_ptr0 + ((196*x0) + (31360*(r2 // 196)) + (1003520*x1) + (r2 % 196)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp1 = tl.load(in_ptr1 + ((196*x0) + (31360*(r2 // 196)) + (1003520*x1) + (r2 % 196)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp5 = tl.load(in_ptr2 + ((196*x0) + (31360*(r2 // 196)) + (1003520*x1) + (r2 % 196)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp2 = tmp0 + tmp1
tmp3 = tmp2.to(tl.float32)
_tmp4 = tl.where(rmask & xmask, _tmp4 + tmp3, _tmp4)
tmp6 = tmp5.to(tl.float32)
tmp8 = tmp6 - tmp7
tmp9 = tmp3 * tmp8
_tmp10 = tl.where(rmask & xmask, _tmp10 + tmp9, _tmp10)
tmp4 = tl.sum(_tmp4, 1)[:, None]
tl.store(out_ptr0 + x3, tmp4, xmask)
tmp10 = tl.sum(_tmp10, 1)[:, None]
tl.store(out_ptr1 + x3, tmp10, xmask)
def get_args():
arg_0 = rand_strided((128, 160, 14, 14), (31360, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 160, 14, 14), (31360, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_2 = rand_strided((128, 160, 14, 14), (31360, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_3 = rand_strided((1, 160, 1, 1), (160, 1, 1, 1), device='cuda:0', dtype=torch.float32)
arg_4 = rand_strided((160, 4), (1, 160), device='cuda:0', dtype=torch.float32)
arg_5 = rand_strided((160, 4), (1, 160), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_red_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_85.run(*args, 640, 6272, grid=grid(640), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_red_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_85.benchmark_all_configs(*args, 640, 6272, grid=grid(640))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/5n/c5ngqp5cp6ieu5sc7n3ylgpltjkzcel7tc4dwx3ctbhj3ayamet2.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.add, aten.native_batch_norm_backward
# aten._native_batch_norm_legit_functional => convert_element_type_280
# aten.add => add_327
# aten.native_batch_norm_backward => convert_element_type_523, convert_element_type_525, mul_685, mul_686, sub_142, sub_144, sub_145
#
# Batch-norm backward pointwise stage for a (128, 160, 14, 14) tensor
# (xnumel = 4014080 = 128 * 160 * 196).  dy = in_ptr0 + in_ptr1 (fused add);
# the per-channel scalars are in_ptr3..in_ptr7 (indexed by channel x1).  The
# arithmetic matches the standard native_batch_norm_backward grad_input
# formula:  out = (dy - (x - mean) * sum_dy_xmu * invstd^2 / N - sum_dy / N)
#                 * invstd * weight,  cast back to fp16.
# The constant 3.985969387755102e-05 == 1/25088 == 1/(128*14*14), i.e. 1/N
# for the per-channel element count.  Roles of in_ptr4/in_ptr6 as the reduced
# sums and in_ptr5/in_ptr7 as invstd/weight are inferred from this formula --
# confirm against the producing kernels if precision matters.
# NOTE: the kernel body below is a string literal compiled by async_compile;
# it must not be edited here.
triton_poi_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_86 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp32', 4: '*fp32', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: '*fp16', 9: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9), equal_to_1=())]})
@triton.jit
def triton_poi_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_86(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, in_ptr6, in_ptr7, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 4014080
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x3 = xindex
x1 = (xindex // 196) % 160
tmp0 = tl.load(in_ptr0 + (x3), None).to(tl.float32)
tmp1 = tl.load(in_ptr1 + (x3), None).to(tl.float32)
tmp4 = tl.load(in_ptr2 + (x3), None).to(tl.float32)
tmp6 = tl.load(in_ptr3 + (x1), None)
tmp8 = tl.load(in_ptr4 + (x1), None)
tmp11 = tl.load(in_ptr5 + (x1), None)
tmp16 = tl.load(in_ptr6 + (x1), None)
tmp19 = tl.load(in_ptr7 + (x1), None)
tmp2 = tmp0 + tmp1
tmp3 = tmp2.to(tl.float32)
tmp5 = tmp4.to(tl.float32)
tmp7 = tmp5 - tmp6
tmp9 = 3.985969387755102e-05
tmp10 = tmp8 * tmp9
tmp12 = tmp11 * tmp11
tmp13 = tmp10 * tmp12
tmp14 = tmp7 * tmp13
tmp15 = tmp3 - tmp14
tmp17 = tmp16 * tmp9
tmp18 = tmp15 - tmp17
tmp20 = tmp11 * tmp19
tmp21 = tmp18 * tmp20
tmp22 = tmp21.to(tl.float32)
tl.store(out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp22, None)
def get_args():
arg_0 = rand_strided((128, 160, 14, 14), (31360, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 160, 14, 14), (31360, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_2 = rand_strided((128, 160, 14, 14), (31360, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_3 = rand_strided((1, 160, 1, 1), (160, 1, 1, 1), device='cuda:0', dtype=torch.float32)
arg_4 = rand_strided((160,), (1,), device='cuda:0', dtype=torch.float32)
arg_5 = rand_strided((160,), (1,), device='cuda:0', dtype=torch.float32)
arg_6 = rand_strided((160,), (1,), device='cuda:0', dtype=torch.float32)
arg_7 = rand_strided((160,), (1,), device='cuda:0', dtype=torch.float32)
arg_8 = rand_strided((128, 160, 14, 14), (31360, 196, 14, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7, arg_8,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_86.run(*args, 4014080, grid=grid(4014080), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_86.benchmark_all_configs(*args, 4014080, grid=grid(4014080))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/mi/cmi2g34rcnn4o7hkiq27yemv23mvhmf7ilu2wl6n5igdib4lbn3v.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.add, aten.native_batch_norm_backward
# aten._native_batch_norm_legit_functional => convert_element_type_256
# aten.add => add_327, add_332
# aten.native_batch_norm_backward => convert_element_type_547, mul_719, sub_158, sum_58, sum_59
#
# Same reduction pattern as kernel 85, but the incoming gradient is the fused
# sum of THREE fp16 tensors (dy = in_ptr0 + in_ptr1 + in_ptr2; two residual
# adds, add_327 + add_332).  Per (channel x0, batch-group x1) over
# 6272 = 32 * 196 elements it accumulates in fp32:
#   out_ptr0 = sum(dy),  out_ptr1 = sum(dy * (x - m[c]))
# with x from in_ptr3 and per-channel m from in_ptr4; results land in (160, 4)
# partial-sum buffers.
# NOTE: the kernel body below is a string literal compiled by async_compile;
# it must not be edited here.
triton_red_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_87 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@reduction(
size_hints=[1024, 8192],
reduction_hint=ReductionHint.INNER,
filename=__file__,
meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp16', 4: '*fp32', 5: '*fp32', 6: '*fp32', 7: 'i32', 8: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_87(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, out_ptr0, out_ptr1, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
xnumel = 640
rnumel = 6272
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
xmask = xindex < xnumel
rbase = tl.arange(0, RBLOCK)[None, :]
x0 = xindex % 160
x1 = (xindex // 160)
_tmp6 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
x3 = xindex
tmp9 = tl.load(in_ptr4 + (x0), xmask)
_tmp12 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
for roffset in range(0, rnumel, RBLOCK):
rindex = roffset + rbase
rmask = rindex < rnumel
r2 = rindex
tmp0 = tl.load(in_ptr0 + ((196*x0) + (31360*(r2 // 196)) + (1003520*x1) + (r2 % 196)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp1 = tl.load(in_ptr1 + ((196*x0) + (31360*(r2 // 196)) + (1003520*x1) + (r2 % 196)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp3 = tl.load(in_ptr2 + ((196*x0) + (31360*(r2 // 196)) + (1003520*x1) + (r2 % 196)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp7 = tl.load(in_ptr3 + ((196*x0) + (31360*(r2 // 196)) + (1003520*x1) + (r2 % 196)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp2 = tmp0 + tmp1
tmp4 = tmp2 + tmp3
tmp5 = tmp4.to(tl.float32)
_tmp6 = tl.where(rmask & xmask, _tmp6 + tmp5, _tmp6)
tmp8 = tmp7.to(tl.float32)
tmp10 = tmp8 - tmp9
tmp11 = tmp5 * tmp10
_tmp12 = tl.where(rmask & xmask, _tmp12 + tmp11, _tmp12)
tmp6 = tl.sum(_tmp6, 1)[:, None]
tl.store(out_ptr0 + x3, tmp6, xmask)
tmp12 = tl.sum(_tmp12, 1)[:, None]
tl.store(out_ptr1 + x3, tmp12, xmask)
def get_args():
arg_0 = rand_strided((128, 160, 14, 14), (31360, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 160, 14, 14), (31360, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_2 = rand_strided((128, 160, 14, 14), (31360, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_3 = rand_strided((128, 160, 14, 14), (31360, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_4 = rand_strided((1, 160, 1, 1), (160, 1, 1, 1), device='cuda:0', dtype=torch.float32)
arg_5 = rand_strided((160, 4), (1, 160), device='cuda:0', dtype=torch.float32)
arg_6 = rand_strided((160, 4), (1, 160), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_red_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_87.run(*args, 640, 6272, grid=grid(640), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_red_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_87.benchmark_all_configs(*args, 640, 6272, grid=grid(640))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/5m/c5msj6p2vdag4fs3dze5gd2kastkwiprxfqobb4ipbu3zwk6y4tb.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.add, aten.native_batch_norm_backward
# aten._native_batch_norm_legit_functional => convert_element_type_256
# aten.add => add_327, add_332
# aten.native_batch_norm_backward => convert_element_type_547, mul_725, mul_726, sub_158, sub_160, sub_161
#
# Batch-norm backward pointwise stage, three-way fused gradient add
# (dy = in_ptr0 + in_ptr1 + in_ptr2; add_327 + add_332), over
# (128, 160, 14, 14).  Same grad_input formula as kernel 86 with
# 3.985969387755102e-05 == 1/25088 == 1/(128*14*14), but the result is stored
# WITHOUT a final fp16 cast -- the output buffer is fp32 (see arg_9 and
# signature slot 9).  x comes from in_ptr3; per-channel scalars from
# in_ptr4..in_ptr8 (mean, reduced sums, invstd, weight -- roles inferred from
# the formula; confirm against the producing reduction kernel).
# NOTE: the kernel body below is a string literal compiled by async_compile;
# it must not be edited here.
triton_poi_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_88 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp16', 4: '*fp32', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: '*fp32', 9: '*fp32', 10: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10), equal_to_1=())]})
@triton.jit
def triton_poi_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_88(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, in_ptr6, in_ptr7, in_ptr8, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 4014080
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x3 = xindex
x1 = (xindex // 196) % 160
tmp0 = tl.load(in_ptr0 + (x3), None).to(tl.float32)
tmp1 = tl.load(in_ptr1 + (x3), None).to(tl.float32)
tmp3 = tl.load(in_ptr2 + (x3), None).to(tl.float32)
tmp6 = tl.load(in_ptr3 + (x3), None).to(tl.float32)
tmp8 = tl.load(in_ptr4 + (x1), None)
tmp10 = tl.load(in_ptr5 + (x1), None)
tmp13 = tl.load(in_ptr6 + (x1), None)
tmp18 = tl.load(in_ptr7 + (x1), None)
tmp21 = tl.load(in_ptr8 + (x1), None)
tmp2 = tmp0 + tmp1
tmp4 = tmp2 + tmp3
tmp5 = tmp4.to(tl.float32)
tmp7 = tmp6.to(tl.float32)
tmp9 = tmp7 - tmp8
tmp11 = 3.985969387755102e-05
tmp12 = tmp10 * tmp11
tmp14 = tmp13 * tmp13
tmp15 = tmp12 * tmp14
tmp16 = tmp9 * tmp15
tmp17 = tmp5 - tmp16
tmp19 = tmp18 * tmp11
tmp20 = tmp17 - tmp19
tmp22 = tmp13 * tmp21
tmp23 = tmp20 * tmp22
tl.store(out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp23, None)
def get_args():
arg_0 = rand_strided((128, 160, 14, 14), (31360, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 160, 14, 14), (31360, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_2 = rand_strided((128, 160, 14, 14), (31360, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_3 = rand_strided((128, 160, 14, 14), (31360, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_4 = rand_strided((1, 160, 1, 1), (160, 1, 1, 1), device='cuda:0', dtype=torch.float32)
arg_5 = rand_strided((160,), (1,), device='cuda:0', dtype=torch.float32)
arg_6 = rand_strided((160,), (1,), device='cuda:0', dtype=torch.float32)
arg_7 = rand_strided((160,), (1,), device='cuda:0', dtype=torch.float32)
arg_8 = rand_strided((160,), (1,), device='cuda:0', dtype=torch.float32)
arg_9 = rand_strided((128, 160, 14, 14), (31360, 196, 14, 1), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7, arg_8, arg_9,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_88.run(*args, 4014080, grid=grid(4014080), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_88.benchmark_all_configs(*args, 4014080, grid=grid(4014080))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/33/c33mk7jbf3hbussw7bzc7k6jlfrdr6q6k2jcx5wwhkb43usq3f3k.py
# Original ATen: aten.convolution_backward
# aten.convolution_backward => convolution_backward_56
#
# Gradient slice for a channel concat's backward: reads the SECOND 80-channel
# half of an fp32 (128, 160, 14, 14) gradient buffer (load offset
# 15680 == 80 * 196 within each 31360-element batch row) and downcasts to a
# contiguous fp16 (128, 80, 14, 14) tensor feeding convolution_backward_56.
# Counterpart of kernel 90, which copies the first half.
# NOTE: the kernel body below is a string literal compiled by async_compile;
# it must not be edited here.
triton_poi_fused_convolution_backward_89 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[2097152], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_convolution_backward_89(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 2007040
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex % 15680
x1 = (xindex // 15680)
x2 = xindex
tmp0 = tl.load(in_ptr0 + (15680 + x0 + (31360*x1)), None)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp1, None)
def get_args():
arg_0 = rand_strided((128, 160, 14, 14), (31360, 196, 14, 1), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((128, 80, 14, 14), (15680, 196, 14, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_convolution_backward_89.run(*args, 2007040, grid=grid(2007040), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_convolution_backward_89.benchmark_all_configs(*args, 2007040, grid=grid(2007040))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/qs/cqsgmcz5zjvb6ko7kxhkl5io7i4gmvcpy5zxwjwu7qjx5oya6hd3.py
# Original ATen: aten.convolution_backward
# aten.convolution_backward => convolution_backward_57
#
# Gradient slice for a channel concat's backward: reads the FIRST 80-channel
# half of an fp32 (128, 160, 14, 14) gradient buffer (load offset 0 within
# each 31360-element batch row; 15680 == 80 * 196 elements per image) and
# downcasts to a contiguous fp16 (128, 80, 14, 14) tensor feeding
# convolution_backward_57.  Counterpart of kernel 89, which copies the
# second half.
# NOTE: the kernel body below is a string literal compiled by async_compile;
# it must not be edited here.
triton_poi_fused_convolution_backward_90 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[2097152], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_convolution_backward_90(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 2007040
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex % 15680
x1 = (xindex // 15680)
x2 = xindex
tmp0 = tl.load(in_ptr0 + (x0 + (31360*x1)), None)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp1, None)
def get_args():
arg_0 = rand_strided((128, 160, 14, 14), (31360, 196, 14, 1), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((128, 80, 14, 14), (15680, 196, 14, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_convolution_backward_90.run(*args, 2007040, grid=grid(2007040), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_convolution_backward_90.benchmark_all_configs(*args, 2007040, grid=grid(2007040))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/is/cis35c7i52ytm3zlhduqhkylrq33chwilt3nh2nr2nfagpbn72d4.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.add, aten.native_batch_norm_backward
# aten._native_batch_norm_legit_functional => convert_element_type_232
# aten.add => add_327, add_332, add_337
# aten.native_batch_norm_backward => convert_element_type_571, mul_759, sub_174, sum_67, sum_68
#
# Same reduction pattern as kernels 85/87, but the incoming gradient is the
# fused sum of FOUR fp16 tensors (dy = in_ptr0 + in_ptr1 + in_ptr2 + in_ptr3;
# three residual adds, add_327/add_332/add_337).  Per (channel x0, batch-group
# x1) over 6272 = 32 * 196 elements it accumulates in fp32:
#   out_ptr0 = sum(dy),  out_ptr1 = sum(dy * (x - m[c]))
# with x from in_ptr4 and per-channel m from in_ptr5; results land in (160, 4)
# partial-sum buffers.
# NOTE: the kernel body below is a string literal compiled by async_compile;
# it must not be edited here.
triton_red_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_91 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@reduction(
size_hints=[1024, 8192],
reduction_hint=ReductionHint.INNER,
filename=__file__,
meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp16', 4: '*fp16', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: 'i32', 9: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_91(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, out_ptr0, out_ptr1, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
xnumel = 640
rnumel = 6272
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
xmask = xindex < xnumel
rbase = tl.arange(0, RBLOCK)[None, :]
x0 = xindex % 160
x1 = (xindex // 160)
_tmp8 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
x3 = xindex
tmp11 = tl.load(in_ptr5 + (x0), xmask)
_tmp14 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
for roffset in range(0, rnumel, RBLOCK):
rindex = roffset + rbase
rmask = rindex < rnumel
r2 = rindex
tmp0 = tl.load(in_ptr0 + ((196*x0) + (31360*(r2 // 196)) + (1003520*x1) + (r2 % 196)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp1 = tl.load(in_ptr1 + ((196*x0) + (31360*(r2 // 196)) + (1003520*x1) + (r2 % 196)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp3 = tl.load(in_ptr2 + ((196*x0) + (31360*(r2 // 196)) + (1003520*x1) + (r2 % 196)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp5 = tl.load(in_ptr3 + ((196*x0) + (31360*(r2 // 196)) + (1003520*x1) + (r2 % 196)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp9 = tl.load(in_ptr4 + ((196*x0) + (31360*(r2 // 196)) + (1003520*x1) + (r2 % 196)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp2 = tmp0 + tmp1
tmp4 = tmp2 + tmp3
tmp6 = tmp4 + tmp5
tmp7 = tmp6.to(tl.float32)
_tmp8 = tl.where(rmask & xmask, _tmp8 + tmp7, _tmp8)
tmp10 = tmp9.to(tl.float32)
tmp12 = tmp10 - tmp11
tmp13 = tmp7 * tmp12
_tmp14 = tl.where(rmask & xmask, _tmp14 + tmp13, _tmp14)
tmp8 = tl.sum(_tmp8, 1)[:, None]
tl.store(out_ptr0 + x3, tmp8, xmask)
tmp14 = tl.sum(_tmp14, 1)[:, None]
tl.store(out_ptr1 + x3, tmp14, xmask)
def get_args():
arg_0 = rand_strided((128, 160, 14, 14), (31360, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 160, 14, 14), (31360, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_2 = rand_strided((128, 160, 14, 14), (31360, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_3 = rand_strided((128, 160, 14, 14), (31360, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_4 = rand_strided((128, 160, 14, 14), (31360, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_5 = rand_strided((1, 160, 1, 1), (160, 1, 1, 1), device='cuda:0', dtype=torch.float32)
arg_6 = rand_strided((160, 4), (1, 160), device='cuda:0', dtype=torch.float32)
arg_7 = rand_strided((160, 4), (1, 160), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_red_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_91.run(*args, 640, 6272, grid=grid(640), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_red_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_91.benchmark_all_configs(*args, 640, 6272, grid=grid(640))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/lj/cljkpj67jhpiomhay7l2fuqfmjz2fox7ndvjbixox242vrhfypjf.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.add, aten.convolution_backward, aten.native_batch_norm_backward
# aten._native_batch_norm_legit_functional => convert_element_type_232
# aten.add => add_327, add_332, add_337
# aten.convolution_backward => convolution_backward_66
# aten.native_batch_norm_backward => convert_element_type_571, convert_element_type_573, mul_765, mul_766, sub_174, sub_176, sub_177
#
# Batch-norm backward pointwise stage, four-way fused gradient add
# (dy = in_ptr0 + in_ptr1 + in_ptr2 + in_ptr3; three residual adds), over
# (128, 160, 14, 14); the fp16 result feeds convolution_backward_66.  Same
# grad_input formula as kernels 86/88 with 3.985969387755102e-05 == 1/25088
# == 1/(128*14*14).  x comes from in_ptr4; per-channel scalars from
# in_ptr5..in_ptr9 (mean, reduced sums, invstd, weight -- roles inferred from
# the formula; confirm against the producing reduction kernel 91).
# NOTE: the kernel body below is a string literal compiled by async_compile;
# it must not be edited here.
triton_poi_fused__native_batch_norm_legit_functional_add_convolution_backward_native_batch_norm_backward_92 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp16', 4: '*fp16', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: '*fp32', 9: '*fp32', 10: '*fp16', 11: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11), equal_to_1=())]})
@triton.jit
def triton_poi_fused__native_batch_norm_legit_functional_add_convolution_backward_native_batch_norm_backward_92(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, in_ptr6, in_ptr7, in_ptr8, in_ptr9, out_ptr1, xnumel, XBLOCK : tl.constexpr):
xnumel = 4014080
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x3 = xindex
x1 = (xindex // 196) % 160
tmp0 = tl.load(in_ptr0 + (x3), None).to(tl.float32)
tmp1 = tl.load(in_ptr1 + (x3), None).to(tl.float32)
tmp3 = tl.load(in_ptr2 + (x3), None).to(tl.float32)
tmp5 = tl.load(in_ptr3 + (x3), None).to(tl.float32)
tmp8 = tl.load(in_ptr4 + (x3), None).to(tl.float32)
tmp10 = tl.load(in_ptr5 + (x1), None)
tmp12 = tl.load(in_ptr6 + (x1), None)
tmp15 = tl.load(in_ptr7 + (x1), None)
tmp20 = tl.load(in_ptr8 + (x1), None)
tmp23 = tl.load(in_ptr9 + (x1), None)
tmp2 = tmp0 + tmp1
tmp4 = tmp2 + tmp3
tmp6 = tmp4 + tmp5
tmp7 = tmp6.to(tl.float32)
tmp9 = tmp8.to(tl.float32)
tmp11 = tmp9 - tmp10
tmp13 = 3.985969387755102e-05
tmp14 = tmp12 * tmp13
tmp16 = tmp15 * tmp15
tmp17 = tmp14 * tmp16
tmp18 = tmp11 * tmp17
tmp19 = tmp7 - tmp18
tmp21 = tmp20 * tmp13
tmp22 = tmp19 - tmp21
tmp24 = tmp15 * tmp23
tmp25 = tmp22 * tmp24
tmp26 = tmp25.to(tl.float32)
tl.store(out_ptr1 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp26, None)
def get_args():
arg_0 = rand_strided((128, 160, 14, 14), (31360, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 160, 14, 14), (31360, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_2 = rand_strided((128, 160, 14, 14), (31360, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_3 = rand_strided((128, 160, 14, 14), (31360, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_4 = rand_strided((128, 160, 14, 14), (31360, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_5 = rand_strided((1, 160, 1, 1), (160, 1, 1, 1), device='cuda:0', dtype=torch.float32)
arg_6 = rand_strided((160,), (1,), device='cuda:0', dtype=torch.float32)
arg_7 = rand_strided((160,), (1,), device='cuda:0', dtype=torch.float32)
arg_8 = rand_strided((160,), (1,), device='cuda:0', dtype=torch.float32)
arg_9 = rand_strided((160,), (1,), device='cuda:0', dtype=torch.float32)
arg_10 = rand_strided((128, 160, 14, 14), (31360, 196, 14, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7, arg_8, arg_9, arg_10,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__native_batch_norm_legit_functional_add_convolution_backward_native_batch_norm_backward_92.run(*args, 4014080, grid=grid(4014080), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__native_batch_norm_legit_functional_add_convolution_backward_native_batch_norm_backward_92.benchmark_all_configs(*args, 4014080, grid=grid(4014080))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/af/cafjjp3a3kwtimi2i4kv3zpwxjdfuxdqkq7szn2vypz2ywexppw7.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_574
# Pointwise fp16 -> fp32 upcast over 99,840 elements of a (160, 624, 1, 1)
# tensor (shapes per get_args inside the string).  The triple-quoted string is
# a self-contained Triton module -- kernel plus get_args/call/benchmark
# helpers -- compiled lazily by AsyncCompile.
# NOTE(review): machine-generated; keep the string byte-identical -- the
# Inductor code cache presumably keys on the source text, verify before edits.
triton_poi_fused__to_copy_93 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[131072], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_93(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 99840
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((160, 624, 1, 1), (624, 1, 1, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((160, 624, 1, 1), (624, 1, 1, 1), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_93.run(*args, 99840, grid=grid(99840), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_93.benchmark_all_configs(*args, 99840, grid=grid(99840))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/ic/cicgaxe2oqufzttvfjscmbopri2uxa3mejzwrav4jbahpmbb4rby.py
# Original ATen: aten.mul, aten.sigmoid, aten.sigmoid_backward, aten.silu, aten.sum
# aten.mul => mul_768
# aten.sigmoid => sigmoid_35
# aten.sigmoid_backward => convert_element_type_575, convert_element_type_576, convert_element_type_577, mul_770, mul_771, sub_178
# aten.silu => convert_element_type_223, convert_element_type_224, mul_278, sigmoid_33
# aten.sum => sum_69
# Persistent reduction: for each of the 79,872 (= 128 batch x 624 channel)
# rows it reduces the 196 (= 14x14) spatial positions of grad * silu(x),
# then multiplies by the sigmoid-backward term sig(g)*(1-sig(g)) of the
# per-row gate value, storing the result in place (in_out_ptr0).
# NOTE(review): machine-generated; keep the string byte-identical -- the
# Inductor code cache presumably keys on the source text, verify before edits.
triton_per_fused_mul_sigmoid_sigmoid_backward_silu_sum_94 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import persistent_reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@persistent_reduction(
size_hints=[131072, 256],
reduction_hint=ReductionHint.INNER,
filename=__file__,
meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp16', 4: 'i32', 5: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4), equal_to_1=())]}
)
@triton.jit
def triton_per_fused_mul_sigmoid_sigmoid_backward_silu_sum_94(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, xnumel, rnumel, XBLOCK : tl.constexpr):
xnumel = 79872
rnumel = 196
RBLOCK: tl.constexpr = 256
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
xmask = xindex < xnumel
rindex = tl.arange(0, RBLOCK)[None, :]
rmask = rindex < rnumel
r1 = rindex
x0 = xindex
tmp0 = tl.load(in_ptr0 + (r1 + (196*x0)), rmask, other=0).to(tl.float32)
tmp1 = tl.load(in_ptr1 + (r1 + (196*x0)), rmask, other=0).to(tl.float32)
tmp11 = tl.load(in_ptr2 + (x0), None).to(tl.float32)
tmp2 = tmp1.to(tl.float32)
tmp3 = tl.sigmoid(tmp2)
tmp4 = tmp2 * tmp3
tmp5 = tmp4.to(tl.float32)
tmp6 = tmp0 * tmp5
tmp8 = tl.where(rmask, tmp6, 0)
tmp9 = tl.sum(tmp8, 1)[:, None]
tmp10 = tmp9.to(tl.float32)
tmp12 = tl.sigmoid(tmp11)
tmp13 = tmp12.to(tl.float32)
tmp14 = 1.0
tmp15 = tmp14 - tmp13
tmp16 = tmp13 * tmp15
tmp17 = tmp10 * tmp16
tmp18 = tmp17.to(tl.float32)
tl.store(in_out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp18, None)
def get_args():
arg_0 = rand_strided((128, 624, 1, 1), (624, 1, 1, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_2 = rand_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_3 = rand_strided((128, 624, 1, 1), (624, 1, 1, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1, arg_2, arg_3,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_per_fused_mul_sigmoid_sigmoid_backward_silu_sum_94.run(*args, 79872, 196, grid=grid(79872), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_per_fused_mul_sigmoid_sigmoid_backward_silu_sum_94.benchmark_all_configs(*args, 79872, 196, grid=grid(79872))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/sw/cswrgzqc55nc76ttfypz7ruvqwdkjksusjytknavxbcop6g2otp2.py
# Original ATen: aten._to_copy, aten.convolution_backward
# aten._to_copy => convert_element_type_579
# aten.convolution_backward => sum_70
# Persistent reduction: sums a (128, 624, 1, 1) fp16 tensor over the batch
# dimension (128) to produce a fp32 (624,) result -- the per-channel bias
# gradient for a convolution backward, with the fp16 -> fp32 cast fused in.
# NOTE(review): machine-generated; keep the string byte-identical -- the
# Inductor code cache presumably keys on the source text, verify before edits.
triton_per_fused__to_copy_convolution_backward_95 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import persistent_reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@persistent_reduction(
size_hints=[1024, 128],
reduction_hint=ReductionHint.INNER,
filename=__file__,
meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32', 3: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3), equal_to_1=())]}
)
@triton.jit
def triton_per_fused__to_copy_convolution_backward_95(in_ptr0, out_ptr1, xnumel, rnumel, XBLOCK : tl.constexpr):
xnumel = 624
rnumel = 128
RBLOCK: tl.constexpr = 128
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
xmask = xindex < xnumel
rindex = tl.arange(0, RBLOCK)[None, :]
rmask = rindex < rnumel
r1 = rindex
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0 + (624*r1)), rmask & xmask, other=0).to(tl.float32)
tmp2 = tl.where(rmask & xmask, tmp0, 0)
tmp3 = tl.sum(tmp2, 1)[:, None]
tmp4 = tmp3.to(tl.float32)
tl.store(out_ptr1 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp4, xmask)
def get_args():
arg_0 = rand_strided((128, 624, 1, 1), (624, 1, 1, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((624,), (1,), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_per_fused__to_copy_convolution_backward_95.run(*args, 624, 128, grid=grid(624), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_per_fused__to_copy_convolution_backward_95.benchmark_all_configs(*args, 624, 128, grid=grid(624))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/rk/crkgwzo3s6rzbbnalsvhwyanngv4n2zgzifhbhpcnc5ysqffl6ac.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_578
# Pointwise fp16 -> fp32 upcast over 32,448 elements of a (624, 52, 1, 1)
# tensor (shapes per get_args inside the string).
# NOTE(review): machine-generated; keep the string byte-identical -- the
# Inductor code cache presumably keys on the source text, verify before edits.
triton_poi_fused__to_copy_96 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[32768], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_96(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 32448
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((624, 52, 1, 1), (52, 1, 1, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((624, 52, 1, 1), (52, 1, 1, 1), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_96.run(*args, 32448, grid=grid(32448), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_96.benchmark_all_configs(*args, 32448, grid=grid(32448))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/wf/cwfpzmbbmgs237i4rn23twa6vqls763iyjrp76njdwwsj3et2uaf.py
# Original ATen: aten.add, aten.clone, aten.fill, aten.mul, aten.sigmoid, aten.sub
# aten.add => add_338
# aten.clone => clone_26
# aten.fill => full_like_21
# aten.mul => mul_772, mul_773, mul_774
# aten.sigmoid => sigmoid_85
# aten.sub => sub_179
# Pointwise SiLU backward over 6,656 (= 128 x 52) elements, computed in place:
# grad_out *= sig(x) * (1 + x * (1 - sig(x))), where x is loaded from in_ptr0
# and grad_out lives in in_out_ptr0 (note 'mutated_arg_names' in meta).
# NOTE(review): machine-generated; keep the string byte-identical -- the
# Inductor code cache presumably keys on the source text, verify before edits.
triton_poi_fused_add_clone_fill_mul_sigmoid_sub_97 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[8192], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_add_clone_fill_mul_sigmoid_sub_97(in_out_ptr0, in_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 6656
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_out_ptr0 + (x0), xmask).to(tl.float32)
tmp1 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
tmp2 = tl.sigmoid(tmp1)
tmp3 = 1.0
tmp4 = tmp3 - tmp2
tmp5 = tmp1 * tmp4
tmp6 = tmp5 + tmp3
tmp7 = tmp2 * tmp6
tmp8 = tmp0 * tmp7
tl.store(in_out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp8, xmask)
def get_args():
arg_0 = rand_strided((128, 52, 1, 1), (52, 1, 1, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 52, 1, 1), (52, 1, 1, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_add_clone_fill_mul_sigmoid_sub_97.run(*args, 6656, grid=grid(6656), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_add_clone_fill_mul_sigmoid_sub_97.benchmark_all_configs(*args, 6656, grid=grid(6656))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/xt/cxt55nhjkmnbxqvfatybarolu6uxihwoo3cct34gda356pn7scly.py
# Original ATen: aten._to_copy, aten.convolution_backward
# aten._to_copy => convert_element_type_581
# aten.convolution_backward => sum_71
# Persistent reduction: sums a (128, 52, 1, 1) fp16 tensor over the batch
# dimension (128) to a fp32 (52,) result -- per-channel bias gradient for a
# convolution backward with the fp16 -> fp32 cast fused in.
# NOTE(review): machine-generated; keep the string byte-identical -- the
# Inductor code cache presumably keys on the source text, verify before edits.
triton_per_fused__to_copy_convolution_backward_98 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import persistent_reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@persistent_reduction(
size_hints=[64, 128],
reduction_hint=ReductionHint.INNER,
filename=__file__,
meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32', 3: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 3), equal_to_1=())]}
)
@triton.jit
def triton_per_fused__to_copy_convolution_backward_98(in_ptr0, out_ptr1, xnumel, rnumel, XBLOCK : tl.constexpr):
xnumel = 52
rnumel = 128
RBLOCK: tl.constexpr = 128
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
xmask = xindex < xnumel
rindex = tl.arange(0, RBLOCK)[None, :]
rmask = rindex < rnumel
r1 = rindex
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0 + (52*r1)), rmask & xmask, other=0).to(tl.float32)
tmp2 = tl.where(rmask & xmask, tmp0, 0)
tmp3 = tl.sum(tmp2, 1)[:, None]
tmp4 = tmp3.to(tl.float32)
tl.store(out_ptr1 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp4, xmask)
def get_args():
arg_0 = rand_strided((128, 52, 1, 1), (52, 1, 1, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((52,), (1,), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_per_fused__to_copy_convolution_backward_98.run(*args, 52, 128, grid=grid(52), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_per_fused__to_copy_convolution_backward_98.benchmark_all_configs(*args, 52, 128, grid=grid(52))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/tw/ctwfvbwslvyki3iv4byjhnpgwt4ujawez7njntlfraulnuse3ddu.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_580
# Pointwise fp16 -> fp32 upcast over 32,448 elements of a (52, 624, 1, 1)
# tensor (shapes per get_args inside the string).
# NOTE(review): machine-generated; keep the string byte-identical -- the
# Inductor code cache presumably keys on the source text, verify before edits.
triton_poi_fused__to_copy_99 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[32768], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_99(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 32448
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((52, 624, 1, 1), (624, 1, 1, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((52, 624, 1, 1), (624, 1, 1, 1), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_99.run(*args, 32448, grid=grid(32448), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_99.benchmark_all_configs(*args, 32448, grid=grid(32448))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/6w/c6wzolzrfmxblwjxnpxkgqezmro2wot3nkz62hajpunjmpkn6enk.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.add, aten.clone, aten.div, aten.fill, aten.mul, aten.native_batch_norm_backward, aten.sigmoid, aten.sub
# aten._native_batch_norm_legit_functional => convert_element_type_221
# aten.add => add_339, add_340
# aten.clone => clone_25
# aten.div => div_8
# aten.fill => full_like_22
# aten.mul => mul_769, mul_775, mul_776, mul_777
# aten.native_batch_norm_backward => convert_element_type_582, mul_778, mul_786, sub_181, sum_72, sum_73
# aten.sigmoid => sigmoid_35, sigmoid_86
# aten.sub => sub_180
# Looped reduction over rnumel = 25,088 (= 128 batch x 196 spatial) per
# channel (xnumel = 624).  For each channel it reconstructs the upstream
# gradient (gate-scaled grad + broadcast avg-pool grad / 196, through SiLU
# backward), then emits the two batch-norm-backward reductions: sum of grad
# (out_ptr0) and sum of grad * (x - mean) (out_ptr1), plus the latter scaled
# by in_ptr6 (out_ptr2); all three are fp32 vectors of length 624.
# NOTE(review): machine-generated; keep the string byte-identical -- the
# Inductor code cache presumably keys on the source text, verify before edits.
triton_red_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_100 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@reduction(
size_hints=[1024, 32768],
reduction_hint=ReductionHint.INNER,
filename=__file__,
meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp16', 4: '*fp16', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: '*fp32', 9: '*fp32', 10: 'i32', 11: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_100(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, in_ptr6, out_ptr0, out_ptr1, out_ptr2, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
xnumel = 624
rnumel = 25088
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
xmask = xindex < xnumel
rbase = tl.arange(0, RBLOCK)[None, :]
x0 = xindex
_tmp17 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
tmp20 = tl.load(in_ptr5 + (x0), xmask)
_tmp23 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
for roffset in range(0, rnumel, RBLOCK):
rindex = roffset + rbase
rmask = rindex < rnumel
r1 = rindex % 196
r2 = (rindex // 196)
tmp0 = tl.load(in_ptr0 + (r1 + (196*x0) + (122304*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp1 = tl.load(in_ptr1 + (x0 + (624*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp4 = tl.load(in_ptr2 + (x0 + (624*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp8 = tl.load(in_ptr3 + (r1 + (196*x0) + (122304*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp18 = tl.load(in_ptr4 + (r1 + (196*x0) + (122304*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp2 = tl.sigmoid(tmp1)
tmp3 = tmp0 * tmp2
tmp5 = 196.0
tmp6 = tmp4 / tmp5
tmp7 = tmp3 + tmp6
tmp9 = tl.sigmoid(tmp8)
tmp10 = 1.0
tmp11 = tmp10 - tmp9
tmp12 = tmp8 * tmp11
tmp13 = tmp12 + tmp10
tmp14 = tmp9 * tmp13
tmp15 = tmp7 * tmp14
tmp16 = tmp15.to(tl.float32)
_tmp17 = tl.where(rmask & xmask, _tmp17 + tmp16, _tmp17)
tmp19 = tmp18.to(tl.float32)
tmp21 = tmp19 - tmp20
tmp22 = tmp16 * tmp21
_tmp23 = tl.where(rmask & xmask, _tmp23 + tmp22, _tmp23)
tmp17 = tl.sum(_tmp17, 1)[:, None]
tl.store(out_ptr0 + x0, tmp17, xmask)
tmp23 = tl.sum(_tmp23, 1)[:, None]
tl.store(out_ptr1 + x0, tmp23, xmask)
tmp24 = tl.load(in_ptr6 + (x0), xmask)
tmp25 = tmp23 * tmp24
tl.store(out_ptr2 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp25, xmask)
def get_args():
arg_0 = rand_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 624, 1, 1), (624, 1, 1, 1), device='cuda:0', dtype=torch.float16)
arg_2 = rand_strided((128, 624, 1, 1), (624, 1, 1, 1), device='cuda:0', dtype=torch.float16)
arg_3 = rand_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_4 = rand_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_5 = rand_strided((1, 624, 1, 1), (624, 1, 1, 1), device='cuda:0', dtype=torch.float32)
arg_6 = rand_strided((624,), (1,), device='cuda:0', dtype=torch.float32)
arg_7 = rand_strided((624,), (1,), device='cuda:0', dtype=torch.float32)
arg_8 = rand_strided((624,), (1,), device='cuda:0', dtype=torch.float32)
arg_9 = rand_strided((624,), (1,), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7, arg_8, arg_9,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_red_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_100.run(*args, 624, 25088, grid=grid(624), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_red_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_100.benchmark_all_configs(*args, 624, 25088, grid=grid(624))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/d3/cd3muetrknv5a272355llgd7oce4qcl4nznkklo4quwqquswhe2k.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.add, aten.clone, aten.convolution_backward, aten.div, aten.fill, aten.mul, aten.native_batch_norm_backward, aten.sigmoid, aten.sub
# aten._native_batch_norm_legit_functional => convert_element_type_221
# aten.add => add_339, add_340
# aten.clone => clone_25
# aten.convolution_backward => convolution_backward_69
# aten.div => div_8
# aten.fill => full_like_22
# aten.mul => mul_769, mul_775, mul_776, mul_777
# aten.native_batch_norm_backward => convert_element_type_582, convert_element_type_584, mul_784, mul_785, sub_181, sub_183, sub_184
# aten.sigmoid => sigmoid_35, sigmoid_86
# aten.sub => sub_180
# Pointwise companion to the _100 reduction above: over 15,654,912
# (= 128 x 624 x 14 x 14) elements it rebuilds the same SiLU/gate gradient,
# then applies the batch-norm-backward input-gradient formula using the
# per-channel statistics (in_ptr5..in_ptr9); 3.985969387755102e-05 is
# 1/25088, the reciprocal of the per-channel reduction count.  Output fp16.
# NOTE(review): machine-generated; keep the string byte-identical -- the
# Inductor code cache presumably keys on the source text, verify before edits.
triton_poi_fused__native_batch_norm_legit_functional_add_clone_convolution_backward_div_fill_mul_native_batch_norm_backward_sigmoid_sub_101 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[16777216], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp16', 4: '*fp16', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: '*fp32', 9: '*fp32', 10: '*fp16', 11: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11), equal_to_1=())]})
@triton.jit
def triton_poi_fused__native_batch_norm_legit_functional_add_clone_convolution_backward_div_fill_mul_native_batch_norm_backward_sigmoid_sub_101(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, in_ptr6, in_ptr7, in_ptr8, in_ptr9, out_ptr1, xnumel, XBLOCK : tl.constexpr):
xnumel = 15654912
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x3 = xindex
x4 = (xindex // 196)
x1 = (xindex // 196) % 624
tmp0 = tl.load(in_ptr0 + (x3), None).to(tl.float32)
tmp1 = tl.load(in_ptr1 + (x4), None).to(tl.float32)
tmp4 = tl.load(in_ptr2 + (x4), None).to(tl.float32)
tmp8 = tl.load(in_ptr3 + (x3), None).to(tl.float32)
tmp17 = tl.load(in_ptr4 + (x3), None).to(tl.float32)
tmp19 = tl.load(in_ptr5 + (x1), None)
tmp21 = tl.load(in_ptr6 + (x1), None)
tmp24 = tl.load(in_ptr7 + (x1), None)
tmp29 = tl.load(in_ptr8 + (x1), None)
tmp32 = tl.load(in_ptr9 + (x1), None)
tmp2 = tl.sigmoid(tmp1)
tmp3 = tmp0 * tmp2
tmp5 = 196.0
tmp6 = tmp4 / tmp5
tmp7 = tmp3 + tmp6
tmp9 = tl.sigmoid(tmp8)
tmp10 = 1.0
tmp11 = tmp10 - tmp9
tmp12 = tmp8 * tmp11
tmp13 = tmp12 + tmp10
tmp14 = tmp9 * tmp13
tmp15 = tmp7 * tmp14
tmp16 = tmp15.to(tl.float32)
tmp18 = tmp17.to(tl.float32)
tmp20 = tmp18 - tmp19
tmp22 = 3.985969387755102e-05
tmp23 = tmp21 * tmp22
tmp25 = tmp24 * tmp24
tmp26 = tmp23 * tmp25
tmp27 = tmp20 * tmp26
tmp28 = tmp16 - tmp27
tmp30 = tmp29 * tmp22
tmp31 = tmp28 - tmp30
tmp33 = tmp24 * tmp32
tmp34 = tmp31 * tmp33
tmp35 = tmp34.to(tl.float32)
tl.store(out_ptr1 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp35, None)
def get_args():
arg_0 = rand_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 624, 1, 1), (624, 1, 1, 1), device='cuda:0', dtype=torch.float16)
arg_2 = rand_strided((128, 624, 1, 1), (624, 1, 1, 1), device='cuda:0', dtype=torch.float16)
arg_3 = rand_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_4 = rand_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_5 = rand_strided((1, 624, 1, 1), (624, 1, 1, 1), device='cuda:0', dtype=torch.float32)
arg_6 = rand_strided((624,), (1,), device='cuda:0', dtype=torch.float32)
arg_7 = rand_strided((624,), (1,), device='cuda:0', dtype=torch.float32)
arg_8 = rand_strided((624,), (1,), device='cuda:0', dtype=torch.float32)
arg_9 = rand_strided((624,), (1,), device='cuda:0', dtype=torch.float32)
arg_10 = rand_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7, arg_8, arg_9, arg_10,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__native_batch_norm_legit_functional_add_clone_convolution_backward_div_fill_mul_native_batch_norm_backward_sigmoid_sub_101.run(*args, 15654912, grid=grid(15654912), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__native_batch_norm_legit_functional_add_clone_convolution_backward_div_fill_mul_native_batch_norm_backward_sigmoid_sub_101.benchmark_all_configs(*args, 15654912, grid=grid(15654912))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/23/c23cf4zcm2zsqnix7cwevl5uqnq6e7k4nrleu5hxvvucflqj7ztk.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_585
# Pointwise fp16 -> fp32 upcast over 5,616 elements of a (624, 1, 3, 3)
# tensor (shapes per get_args inside the string); the (C, 1, 3, 3) shape
# suggests a depthwise 3x3 weight -- confirm against the caller.
# NOTE(review): machine-generated; keep the string byte-identical -- the
# Inductor code cache presumably keys on the source text, verify before edits.
triton_poi_fused__to_copy_102 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[8192], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_102(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 5616
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((624, 1, 3, 3), (9, 9, 3, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((624, 1, 3, 3), (9, 9, 3, 1), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_102.run(*args, 5616, grid=grid(5616), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_102.benchmark_all_configs(*args, 5616, grid=grid(5616))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/xe/cxe5wowu7imk523mjj4gviz3cjwfkjjg7yenewa23cv4xlyi265g.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.mul, aten.native_batch_norm_backward
# aten._native_batch_norm_legit_functional => convert_element_type_216
# aten.mul => mul_789
# aten.native_batch_norm_backward => convert_element_type_586, mul_790, mul_798, sub_186, sum_74, sum_75
triton_red_fused__native_batch_norm_legit_functional_mul_native_batch_norm_backward_103 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@reduction(
size_hints=[1024, 32768],
reduction_hint=ReductionHint.INNER,
filename=__file__,
meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp32', 4: '*fp32', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: 'i32', 9: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_mul_native_batch_norm_backward_103(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, out_ptr0, out_ptr1, out_ptr2, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
xnumel = 624
rnumel = 25088
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
xmask = xindex < xnumel
rbase = tl.arange(0, RBLOCK)[None, :]
x0 = xindex
_tmp4 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
tmp7 = tl.load(in_ptr3 + (x0), xmask)
_tmp10 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
for roffset in range(0, rnumel, RBLOCK):
rindex = roffset + rbase
rmask = rindex < rnumel
r1 = rindex % 196
r2 = (rindex // 196)
tmp0 = tl.load(in_ptr0 + (r1 + (196*x0) + (122304*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp1 = tl.load(in_ptr1 + (r1 + (196*x0) + (122304*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp5 = tl.load(in_ptr2 + (r1 + (196*x0) + (122304*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp2 = tmp0 * tmp1
tmp3 = tmp2.to(tl.float32)
_tmp4 = tl.where(rmask & xmask, _tmp4 + tmp3, _tmp4)
tmp6 = tmp5.to(tl.float32)
tmp8 = tmp6 - tmp7
tmp9 = tmp3 * tmp8
_tmp10 = tl.where(rmask & xmask, _tmp10 + tmp9, _tmp10)
tmp4 = tl.sum(_tmp4, 1)[:, None]
tl.store(out_ptr0 + x0, tmp4, xmask)
tmp10 = tl.sum(_tmp10, 1)[:, None]
tl.store(out_ptr1 + x0, tmp10, xmask)
tmp11 = tl.load(in_ptr4 + (x0), xmask)
tmp12 = tmp10 * tmp11
tl.store(out_ptr2 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp12, xmask)
def get_args():
arg_0 = rand_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_2 = rand_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_3 = rand_strided((1, 624, 1, 1), (624, 1, 1, 1), device='cuda:0', dtype=torch.float32)
arg_4 = rand_strided((624,), (1,), device='cuda:0', dtype=torch.float32)
arg_5 = rand_strided((624,), (1,), device='cuda:0', dtype=torch.float32)
arg_6 = rand_strided((624,), (1,), device='cuda:0', dtype=torch.float32)
arg_7 = rand_strided((624,), (1,), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_red_fused__native_batch_norm_legit_functional_mul_native_batch_norm_backward_103.run(*args, 624, 25088, grid=grid(624), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_red_fused__native_batch_norm_legit_functional_mul_native_batch_norm_backward_103.benchmark_all_configs(*args, 624, 25088, grid=grid(624))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/n6/cn6vgxz3swb2xjokxek7wjtmjhdz6hdzplejuatnteeyvplenl3a.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.convolution_backward, aten.mul, aten.native_batch_norm_backward
# aten._native_batch_norm_legit_functional => convert_element_type_216
# aten.convolution_backward => convolution_backward_70
# aten.mul => mul_789
# aten.native_batch_norm_backward => convert_element_type_586, convert_element_type_588, mul_796, mul_797, sub_186, sub_188, sub_189
triton_poi_fused__native_batch_norm_legit_functional_convolution_backward_mul_native_batch_norm_backward_104 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[16777216], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp32', 4: '*fp32', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8), equal_to_1=())]})
@triton.jit
def triton_poi_fused__native_batch_norm_legit_functional_convolution_backward_mul_native_batch_norm_backward_104(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, in_ptr6, xnumel, XBLOCK : tl.constexpr):
xnumel = 15654912
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x3 = xindex
x1 = (xindex // 196) % 624
tmp0 = tl.load(in_out_ptr0 + (x3), None).to(tl.float32)
tmp1 = tl.load(in_ptr0 + (x3), None).to(tl.float32)
tmp4 = tl.load(in_ptr1 + (x3), None).to(tl.float32)
tmp6 = tl.load(in_ptr2 + (x1), None)
tmp8 = tl.load(in_ptr3 + (x1), None)
tmp11 = tl.load(in_ptr4 + (x1), None)
tmp16 = tl.load(in_ptr5 + (x1), None)
tmp19 = tl.load(in_ptr6 + (x1), None)
tmp2 = tmp0 * tmp1
tmp3 = tmp2.to(tl.float32)
tmp5 = tmp4.to(tl.float32)
tmp7 = tmp5 - tmp6
tmp9 = 3.985969387755102e-05
tmp10 = tmp8 * tmp9
tmp12 = tmp11 * tmp11
tmp13 = tmp10 * tmp12
tmp14 = tmp7 * tmp13
tmp15 = tmp3 - tmp14
tmp17 = tmp16 * tmp9
tmp18 = tmp15 - tmp17
tmp20 = tmp11 * tmp19
tmp21 = tmp18 * tmp20
tmp22 = tmp21.to(tl.float32)
tl.store(in_out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp22, None)
def get_args():
arg_0 = rand_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_2 = rand_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_3 = rand_strided((1, 624, 1, 1), (624, 1, 1, 1), device='cuda:0', dtype=torch.float32)
arg_4 = rand_strided((624,), (1,), device='cuda:0', dtype=torch.float32)
arg_5 = rand_strided((624,), (1,), device='cuda:0', dtype=torch.float32)
arg_6 = rand_strided((624,), (1,), device='cuda:0', dtype=torch.float32)
arg_7 = rand_strided((624,), (1,), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__native_batch_norm_legit_functional_convolution_backward_mul_native_batch_norm_backward_104.run(*args, 15654912, grid=grid(15654912), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__native_batch_norm_legit_functional_convolution_backward_mul_native_batch_norm_backward_104.benchmark_all_configs(*args, 15654912, grid=grid(15654912))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/iv/civ2oft43774ffyg3v5kh23rmteh5qvbuafa2jrfla6exeuvk3sq.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_589
triton_poi_fused__to_copy_105 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[65536], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_105(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 64896
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((624, 104, 1, 1), (104, 1, 1, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((624, 104, 1, 1), (104, 1, 1, 1), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_105.run(*args, 64896, grid=grid(64896), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_105.benchmark_all_configs(*args, 64896, grid=grid(64896))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/j7/cj7ui4cmg7zcaen4xj4vc2j54wf3j5vlszxmshcee37dzcpqxdcd.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.native_batch_norm_backward
# aten._native_batch_norm_legit_functional => convert_element_type_213
# aten.native_batch_norm_backward => convert_element_type_590, mul_799, sub_190, sum_76, sum_77
triton_red_fused__native_batch_norm_legit_functional_native_batch_norm_backward_106 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@reduction(
size_hints=[512, 8192],
reduction_hint=ReductionHint.INNER,
filename=__file__,
meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: 'i32', 6: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_native_batch_norm_backward_106(in_ptr0, in_ptr1, in_ptr2, out_ptr0, out_ptr1, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
xnumel = 416
rnumel = 6272
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
xmask = xindex < xnumel
rbase = tl.arange(0, RBLOCK)[None, :]
x0 = xindex % 104
x1 = (xindex // 104)
_tmp2 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
x3 = xindex
tmp5 = tl.load(in_ptr2 + (x0), xmask)
_tmp8 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
for roffset in range(0, rnumel, RBLOCK):
rindex = roffset + rbase
rmask = rindex < rnumel
r2 = rindex
tmp0 = tl.load(in_ptr0 + ((196*x0) + (20384*(r2 // 196)) + (652288*x1) + (r2 % 196)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp3 = tl.load(in_ptr1 + ((196*x0) + (20384*(r2 // 196)) + (652288*x1) + (r2 % 196)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
_tmp2 = tl.where(rmask & xmask, _tmp2 + tmp1, _tmp2)
tmp4 = tmp3.to(tl.float32)
tmp6 = tmp4 - tmp5
tmp7 = tmp1 * tmp6
_tmp8 = tl.where(rmask & xmask, _tmp8 + tmp7, _tmp8)
tmp2 = tl.sum(_tmp2, 1)[:, None]
tl.store(out_ptr0 + x3, tmp2, xmask)
tmp8 = tl.sum(_tmp8, 1)[:, None]
tl.store(out_ptr1 + x3, tmp8, xmask)
def get_args():
arg_0 = rand_strided((128, 104, 14, 14), (20384, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 104, 14, 14), (20384, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_2 = rand_strided((1, 104, 1, 1), (104, 1, 1, 1), device='cuda:0', dtype=torch.float32)
arg_3 = rand_strided((104, 4), (1, 104), device='cuda:0', dtype=torch.float32)
arg_4 = rand_strided((104, 4), (1, 104), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1, arg_2, arg_3, arg_4,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_red_fused__native_batch_norm_legit_functional_native_batch_norm_backward_106.run(*args, 416, 6272, grid=grid(416), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_red_fused__native_batch_norm_legit_functional_native_batch_norm_backward_106.benchmark_all_configs(*args, 416, 6272, grid=grid(416))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/at/catlmgmveca6eegfxgggc25rzm3umawxz52w4d3665iwc5nkqdhv.py
# Original ATen: aten.native_batch_norm_backward
# aten.native_batch_norm_backward => convert_element_type_590, sum_76
triton_per_fused_native_batch_norm_backward_107 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import persistent_reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@persistent_reduction(
size_hints=[128, 4],
reduction_hint=ReductionHint.INNER,
filename=__file__,
meta={'signature': {0: '*fp32', 1: '*fp32', 2: 'i32', 3: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1), equal_to_1=())]}
)
@triton.jit
def triton_per_fused_native_batch_norm_backward_107(in_ptr0, out_ptr0, xnumel, rnumel, XBLOCK : tl.constexpr):
xnumel = 104
rnumel = 4
RBLOCK: tl.constexpr = 4
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
xmask = xindex < xnumel
rindex = tl.arange(0, RBLOCK)[None, :]
rmask = rindex < rnumel
r1 = rindex
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0 + (104*r1)), rmask & xmask, other=0)
tmp2 = tl.where(rmask & xmask, tmp0, 0)
tmp3 = tl.sum(tmp2, 1)[:, None]
tl.store(out_ptr0 + x0, tmp3, xmask)
def get_args():
arg_0 = rand_strided((104, 4), (1, 104), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((104,), (1,), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_per_fused_native_batch_norm_backward_107.run(*args, 104, 4, grid=grid(104), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_per_fused_native_batch_norm_backward_107.benchmark_all_configs(*args, 104, 4, grid=grid(104))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/sw/cswvpkxc2jcvqnphnzcluqix5sdpoevu3y2llapfjncqthtvmuif.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.native_batch_norm_backward
# aten._native_batch_norm_legit_functional => convert_element_type_213
# aten.native_batch_norm_backward => convert_element_type_590, mul_799, mul_807, sub_190, sum_77
triton_per_fused__native_batch_norm_legit_functional_native_batch_norm_backward_108 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import persistent_reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@persistent_reduction(
size_hints=[128, 4],
reduction_hint=ReductionHint.INNER,
filename=__file__,
meta={'signature': {0: '*fp32', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: 'i32', 5: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3), equal_to_1=())]}
)
@triton.jit
def triton_per_fused__native_batch_norm_legit_functional_native_batch_norm_backward_108(in_ptr0, in_ptr1, out_ptr0, out_ptr1, xnumel, rnumel, XBLOCK : tl.constexpr):
xnumel = 104
rnumel = 4
RBLOCK: tl.constexpr = 4
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
xmask = xindex < xnumel
rindex = tl.arange(0, RBLOCK)[None, :]
rmask = rindex < rnumel
r1 = rindex
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0 + (104*r1)), rmask & xmask, other=0)
tmp4 = tl.load(in_ptr1 + (x0), xmask)
tmp2 = tl.where(rmask & xmask, tmp0, 0)
tmp3 = tl.sum(tmp2, 1)[:, None]
tmp5 = tmp3 * tmp4
tl.store(out_ptr1 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp5, xmask)
tl.store(out_ptr0 + x0, tmp3, xmask)
def get_args():
arg_0 = rand_strided((104, 4), (1, 104), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((104,), (1,), device='cuda:0', dtype=torch.float32)
arg_2 = rand_strided((104,), (1,), device='cuda:0', dtype=torch.float32)
arg_3 = rand_strided((104,), (1,), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1, arg_2, arg_3,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_per_fused__native_batch_norm_legit_functional_native_batch_norm_backward_108.run(*args, 104, 4, grid=grid(104), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_per_fused__native_batch_norm_legit_functional_native_batch_norm_backward_108.benchmark_all_configs(*args, 104, 4, grid=grid(104))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/6d/c6dicjfqwciclb5imlva4adfrsslgvcv4kzchyl4zcmedltgvjfk.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.native_batch_norm_backward
# aten._native_batch_norm_legit_functional => convert_element_type_213
# aten.native_batch_norm_backward => convert_element_type_590, convert_element_type_592, mul_805, mul_806, sub_190, sub_192, sub_193
triton_poi_fused__native_batch_norm_legit_functional_native_batch_norm_backward_109 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp32', 6: '*fp32', 7: '*fp16', 8: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8), equal_to_1=())]})
@triton.jit
def triton_poi_fused__native_batch_norm_legit_functional_native_batch_norm_backward_109(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, in_ptr6, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 2609152
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x3 = xindex
x1 = (xindex // 196) % 104
tmp0 = tl.load(in_ptr0 + (x3), None).to(tl.float32)
tmp2 = tl.load(in_ptr1 + (x3), None).to(tl.float32)
tmp4 = tl.load(in_ptr2 + (x1), None)
tmp6 = tl.load(in_ptr3 + (x1), None)
tmp9 = tl.load(in_ptr4 + (x1), None)
tmp14 = tl.load(in_ptr5 + (x1), None)
tmp17 = tl.load(in_ptr6 + (x1), None)
tmp1 = tmp0.to(tl.float32)
tmp3 = tmp2.to(tl.float32)
tmp5 = tmp3 - tmp4
tmp7 = 3.985969387755102e-05
tmp8 = tmp6 * tmp7
tmp10 = tmp9 * tmp9
tmp11 = tmp8 * tmp10
tmp12 = tmp5 * tmp11
tmp13 = tmp1 - tmp12
tmp15 = tmp14 * tmp7
tmp16 = tmp13 - tmp15
tmp18 = tmp9 * tmp17
tmp19 = tmp16 * tmp18
tmp20 = tmp19.to(tl.float32)
tl.store(out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp20, None)
def get_args():
arg_0 = rand_strided((128, 104, 14, 14), (20384, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 104, 14, 14), (20384, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_2 = rand_strided((1, 104, 1, 1), (104, 1, 1, 1), device='cuda:0', dtype=torch.float32)
arg_3 = rand_strided((104,), (1,), device='cuda:0', dtype=torch.float32)
arg_4 = rand_strided((104,), (1,), device='cuda:0', dtype=torch.float32)
arg_5 = rand_strided((104,), (1,), device='cuda:0', dtype=torch.float32)
arg_6 = rand_strided((104,), (1,), device='cuda:0', dtype=torch.float32)
arg_7 = rand_strided((128, 104, 14, 14), (20384, 196, 14, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__native_batch_norm_legit_functional_native_batch_norm_backward_109.run(*args, 2609152, grid=grid(2609152), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__native_batch_norm_legit_functional_native_batch_norm_backward_109.benchmark_all_configs(*args, 2609152, grid=grid(2609152))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/u6/cu6gj4lj3w4o4rbwgegqtcfzcryyabv5bfn4fymj6h576kucjrrk.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_593
triton_poi_fused__to_copy_110 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[16384], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_110(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 16224
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((52, 312, 1, 1), (312, 1, 1, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((52, 312, 1, 1), (312, 1, 1, 1), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_110.run(*args, 16224, grid=grid(16224), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_110.benchmark_all_configs(*args, 16224, grid=grid(16224))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/cq/ccq5bqvz5gysfuffc2onzn5q2itdz5cceokvaxh3r7x3536x7ffr.py
# Original ATen: aten.cat
# aten.cat => cat_57
triton_poi_fused_cat_111 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[8388608], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_cat_111(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 7827456
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x2 = xindex
x0 = xindex % 61152
x1 = (xindex // 61152)
tmp0 = tl.load(in_ptr0 + (x2), None).to(tl.float32)
tl.store(out_ptr0 + (x0 + (122304*x1) + tl.zeros([XBLOCK], tl.int32)), tmp0, None)
def get_args():
arg_0 = rand_strided((128, 312, 14, 14), (61152, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 312, 14, 14), (122304, 196, 14, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_cat_111.run(*args, 7827456, grid=grid(7827456), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_cat_111.benchmark_all_configs(*args, 7827456, grid=grid(7827456))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/wo/cwolgc4dw6izbizjkqhmxp5s3e2v7ufjibf3admpysa3tcfcvbwi.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_598
triton_poi_fused__to_copy_112 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[16384], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_112(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 16224
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((624, 26, 1, 1), (26, 1, 1, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((624, 26, 1, 1), (26, 1, 1, 1), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_112.run(*args, 16224, grid=grid(16224), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_112.benchmark_all_configs(*args, 16224, grid=grid(16224))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/zs/czsqkpgzlfpo7vf22txwb4vktp6li35xdtdplz23vaooelx52mgz.py
# Original ATen: aten.add, aten.clone, aten.fill, aten.mul, aten.sigmoid, aten.sub
# aten.add => add_342
# aten.clone => clone_23
# aten.fill => full_like_24
# aten.mul => mul_812, mul_813, mul_814
# aten.sigmoid => sigmoid_88
# aten.sub => sub_195
# NOTE(review): Inductor-generated Triton kernel held as a raw source string;
# AsyncCompile compiles it lazily, so nothing inside the string executes at
# import time. The embedded module also ships a standalone benchmark harness
# (get_args / call / benchmark_all_configs / __main__). Do not edit the string
# body — its bytes are the program that gets compiled.
# Semantics (from the kernel body): in-place update over 3328 fp16 values
# (get_args: 128x26x1x1):
#   grad *= s * (1 + x * (1 - s)),  where s = sigmoid(x)
# which is d/dx[x * sigmoid(x)] applied to the incoming gradient;
# in_out_ptr0 is both read and overwritten (see mutated_arg_names).
triton_poi_fused_add_clone_fill_mul_sigmoid_sub_113 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4096], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_add_clone_fill_mul_sigmoid_sub_113(in_out_ptr0, in_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 3328
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_out_ptr0 + (x0), xmask).to(tl.float32)
tmp1 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
tmp2 = tl.sigmoid(tmp1)
tmp3 = 1.0
tmp4 = tmp3 - tmp2
tmp5 = tmp1 * tmp4
tmp6 = tmp5 + tmp3
tmp7 = tmp2 * tmp6
tmp8 = tmp0 * tmp7
tl.store(in_out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp8, xmask)
def get_args():
arg_0 = rand_strided((128, 26, 1, 1), (26, 1, 1, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 26, 1, 1), (26, 1, 1, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_add_clone_fill_mul_sigmoid_sub_113.run(*args, 3328, grid=grid(3328), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_add_clone_fill_mul_sigmoid_sub_113.benchmark_all_configs(*args, 3328, grid=grid(3328))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/br/cbrdby4wbj6lkywp7ihnm6qs7bmx7qo2e45i7oxtqq4xbl3oog4o.py
# Original ATen: aten._to_copy, aten.convolution_backward
# aten._to_copy => convert_element_type_601
# aten.convolution_backward => sum_80
# NOTE(review): Inductor-generated kernel source string, compiled lazily by
# AsyncCompile; do not edit the string body.
# Semantics: a persistent reduction — one program per output element. For each
# of the 26 channels it sums the fp16 gradient over the 128-element batch axis
# (input layout (128, 26)) and stores the per-channel fp32 sum. Per the fused
# op comment above this is presumably the convolution bias gradient (sum_80) —
# confirm against the caller.
triton_per_fused__to_copy_convolution_backward_114 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import persistent_reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@persistent_reduction(
size_hints=[32, 128],
reduction_hint=ReductionHint.INNER,
filename=__file__,
meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32', 3: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 3), equal_to_1=())]}
)
@triton.jit
def triton_per_fused__to_copy_convolution_backward_114(in_ptr0, out_ptr1, xnumel, rnumel, XBLOCK : tl.constexpr):
xnumel = 26
rnumel = 128
RBLOCK: tl.constexpr = 128
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
xmask = xindex < xnumel
rindex = tl.arange(0, RBLOCK)[None, :]
rmask = rindex < rnumel
r1 = rindex
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0 + (26*r1)), rmask & xmask, other=0).to(tl.float32)
tmp2 = tl.where(rmask & xmask, tmp0, 0)
tmp3 = tl.sum(tmp2, 1)[:, None]
tmp4 = tmp3.to(tl.float32)
tl.store(out_ptr1 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp4, xmask)
def get_args():
arg_0 = rand_strided((128, 26, 1, 1), (26, 1, 1, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((26,), (1,), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_per_fused__to_copy_convolution_backward_114.run(*args, 26, 128, grid=grid(26), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_per_fused__to_copy_convolution_backward_114.benchmark_all_configs(*args, 26, 128, grid=grid(26))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/4u/c4uahgnz2ykwhxsj7bueioargms6lhe32ffh6vzmxhridx2iurdm.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_600
# NOTE(review): Inductor-generated kernel source string, compiled lazily by
# AsyncCompile; do not edit the string body.
# Semantics: plain elementwise fp16 -> fp32 cast of 16224 contiguous values
# (get_args: a (26, 624, 1, 1) tensor, presumably a conv weight — confirm
# against the caller). The double .to(tl.float32) is redundant but harmless
# generated code.
triton_poi_fused__to_copy_115 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[16384], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_115(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 16224
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((26, 624, 1, 1), (624, 1, 1, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((26, 624, 1, 1), (624, 1, 1, 1), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_115.run(*args, 16224, grid=grid(16224), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_115.benchmark_all_configs(*args, 16224, grid=grid(16224))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/ke/cke6ikrfgpb6sjssa2vwqbbb346zboallywrrkyxxcehakhefz5y.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.add, aten.clone, aten.div, aten.fill, aten.mul, aten.native_batch_norm_backward, aten.sigmoid, aten.sub
# aten._native_batch_norm_legit_functional => convert_element_type_201
# aten.add => add_343, add_344
# aten.clone => clone_22
# aten.div => div_9
# aten.fill => full_like_25
# aten.mul => mul_809, mul_815, mul_816, mul_817
# aten.native_batch_norm_backward => convert_element_type_602, mul_824, sub_197, sub_199, sub_200
# aten.sigmoid => sigmoid_31, sigmoid_89
# aten.sub => sub_196
# NOTE(review): Inductor-generated kernel source string, compiled lazily by
# AsyncCompile; do not edit the string body.
# Semantics: one fused pointwise pass over 15,654,912 elements of a
# (128, 624, 14, 14) fp16 activation gradient. Per element it:
#   1. combines grad*sigmoid(gate) with a broadcast per-(n,c) term divided by
#      196 (= 14*14, presumably the average-pool backward — confirm);
#   2. multiplies by s*(1 + x*(1-s)), s = sigmoid(x) — d/dx[x*sigmoid(x)];
#   3. applies the batch-norm input-gradient terms using per-channel
#      mean/sum/invstd buffers (in_ptr5..in_ptr8). The constant
#      3.985969387755102e-05 equals 1/(128*196), the reduction count.
# Output is fp32 (out_ptr0); indices: x1 = channel, x4 = (batch, channel).
triton_poi_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_116 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[16777216], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp16', 4: '*fp16', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: '*fp32', 9: '*fp32', 10: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10), equal_to_1=())]})
@triton.jit
def triton_poi_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_116(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, in_ptr6, in_ptr7, in_ptr8, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 15654912
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x3 = xindex
x4 = (xindex // 196)
x1 = (xindex // 196) % 624
tmp0 = tl.load(in_ptr0 + (x3), None).to(tl.float32)
tmp1 = tl.load(in_ptr1 + (x4), None).to(tl.float32)
tmp4 = tl.load(in_ptr2 + (x4), None).to(tl.float32)
tmp8 = tl.load(in_ptr3 + (x3), None).to(tl.float32)
tmp17 = tl.load(in_ptr4 + (x3), None).to(tl.float32)
tmp19 = tl.load(in_ptr5 + (x1), None)
tmp21 = tl.load(in_ptr6 + (x1), None)
tmp24 = tl.load(in_ptr7 + (x1), None)
tmp29 = tl.load(in_ptr8 + (x1), None)
tmp2 = tl.sigmoid(tmp1)
tmp3 = tmp0 * tmp2
tmp5 = 196.0
tmp6 = tmp4 / tmp5
tmp7 = tmp3 + tmp6
tmp9 = tl.sigmoid(tmp8)
tmp10 = 1.0
tmp11 = tmp10 - tmp9
tmp12 = tmp8 * tmp11
tmp13 = tmp12 + tmp10
tmp14 = tmp9 * tmp13
tmp15 = tmp7 * tmp14
tmp16 = tmp15.to(tl.float32)
tmp18 = tmp17.to(tl.float32)
tmp20 = tmp18 - tmp19
tmp22 = 3.985969387755102e-05
tmp23 = tmp21 * tmp22
tmp25 = tmp24 * tmp24
tmp26 = tmp23 * tmp25
tmp27 = tmp20 * tmp26
tmp28 = tmp16 - tmp27
tmp30 = tmp29 * tmp22
tmp31 = tmp28 - tmp30
tl.store(out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp31, None)
def get_args():
arg_0 = rand_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 624, 1, 1), (624, 1, 1, 1), device='cuda:0', dtype=torch.float16)
arg_2 = rand_strided((128, 624, 1, 1), (624, 1, 1, 1), device='cuda:0', dtype=torch.float16)
arg_3 = rand_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_4 = rand_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_5 = rand_strided((1, 624, 1, 1), (624, 1, 1, 1), device='cuda:0', dtype=torch.float32)
arg_6 = rand_strided((624,), (1,), device='cuda:0', dtype=torch.float32)
arg_7 = rand_strided((624,), (1,), device='cuda:0', dtype=torch.float32)
arg_8 = rand_strided((624,), (1,), device='cuda:0', dtype=torch.float32)
arg_9 = rand_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7, arg_8, arg_9,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_116.run(*args, 15654912, grid=grid(15654912), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_116.benchmark_all_configs(*args, 15654912, grid=grid(15654912))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/v7/cv7sl67sz2xaf4l6bv2yby2sj22h2574hqzu7lld6chl75bgzok7.py
# Original ATen: aten.convolution_backward
# aten.convolution_backward => convolution_backward_75
# NOTE(review): Inductor-generated kernel source string, compiled lazily by
# AsyncCompile; do not edit the string body.
# Semantics: extracts channels 468..623 of a (128, 624, 14, 14) fp32 gradient
# (input offset 91728 = 468*196, per-channel param offset +468), scales each
# element by the product of two per-channel fp32 vectors (in_ptr1 * in_ptr2),
# and stores the result as a contiguous (128, 156, 14, 14) fp16 tensor — the
# grad slice fed into one group's convolution backward.
triton_poi_fused_convolution_backward_117 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp32', 2: '*fp32', 3: '*fp16', 4: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4), equal_to_1=())]})
@triton.jit
def triton_poi_fused_convolution_backward_117(in_ptr0, in_ptr1, in_ptr2, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 3913728
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x2 = (xindex // 30576)
x3 = xindex % 30576
x1 = (xindex // 196) % 156
x4 = xindex
tmp0 = tl.load(in_ptr0 + (91728 + x3 + (122304*x2)), None)
tmp1 = tl.load(in_ptr1 + (468 + x1), None)
tmp2 = tl.load(in_ptr2 + (468 + x1), None)
tmp3 = tmp1 * tmp2
tmp4 = tmp0 * tmp3
tmp5 = tmp4.to(tl.float32)
tl.store(out_ptr0 + (x4 + tl.zeros([XBLOCK], tl.int32)), tmp5, None)
def get_args():
arg_0 = rand_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((624,), (1,), device='cuda:0', dtype=torch.float32)
arg_2 = rand_strided((624,), (1,), device='cuda:0', dtype=torch.float32)
arg_3 = rand_strided((128, 156, 14, 14), (30576, 196, 14, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1, arg_2, arg_3,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_convolution_backward_117.run(*args, 3913728, grid=grid(3913728), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_convolution_backward_117.benchmark_all_configs(*args, 3913728, grid=grid(3913728))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/sn/csnxk2aehntwbtuxvtyjfxxx5pcnnxto57kxv3m57bgn74rxyzdo.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_605
# NOTE(review): Inductor-generated kernel source string, compiled lazily by
# AsyncCompile; do not edit the string body.
# Semantics: elementwise fp16 -> fp32 cast of 12636 values (get_args: a
# (156, 1, 9, 9) tensor — presumably a depthwise 9x9 conv weight, matching
# the 156-channel grad slices produced nearby; confirm against the caller).
triton_poi_fused__to_copy_118 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[16384], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_118(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 12636
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((156, 1, 9, 9), (81, 81, 9, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((156, 1, 9, 9), (81, 81, 9, 1), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_118.run(*args, 12636, grid=grid(12636), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_118.benchmark_all_configs(*args, 12636, grid=grid(12636))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/b2/cb2zxjnunwnxsrlbgr5rnqihgpyuvgs44v74hreykirhvteg3imr.py
# Original ATen: aten.convolution_backward
# aten.convolution_backward => convolution_backward_76
# NOTE(review): Inductor-generated kernel source string, compiled lazily by
# AsyncCompile; do not edit the string body.
# Semantics: same pattern as kernel 117 but for channels 312..467 of the
# (128, 624, 14, 14) fp32 gradient (input offset 61152 = 312*196, per-channel
# param offset +312): scale by in_ptr1*in_ptr2 per channel, cast to fp16, and
# write a contiguous (128, 156, 14, 14) slice for a group's conv backward.
triton_poi_fused_convolution_backward_119 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp32', 2: '*fp32', 3: '*fp16', 4: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4), equal_to_1=())]})
@triton.jit
def triton_poi_fused_convolution_backward_119(in_ptr0, in_ptr1, in_ptr2, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 3913728
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x2 = (xindex // 30576)
x3 = xindex % 30576
x1 = (xindex // 196) % 156
x4 = xindex
tmp0 = tl.load(in_ptr0 + (61152 + x3 + (122304*x2)), None)
tmp1 = tl.load(in_ptr1 + (312 + x1), None)
tmp2 = tl.load(in_ptr2 + (312 + x1), None)
tmp3 = tmp1 * tmp2
tmp4 = tmp0 * tmp3
tmp5 = tmp4.to(tl.float32)
tl.store(out_ptr0 + (x4 + tl.zeros([XBLOCK], tl.int32)), tmp5, None)
def get_args():
arg_0 = rand_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((624,), (1,), device='cuda:0', dtype=torch.float32)
arg_2 = rand_strided((624,), (1,), device='cuda:0', dtype=torch.float32)
arg_3 = rand_strided((128, 156, 14, 14), (30576, 196, 14, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1, arg_2, arg_3,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_convolution_backward_119.run(*args, 3913728, grid=grid(3913728), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_convolution_backward_119.benchmark_all_configs(*args, 3913728, grid=grid(3913728))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/mq/cmqf6tn342yiwkdctvgld7rocxgxgv3azqztra5gqfjonbr5lfkj.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_606
# NOTE(review): Inductor-generated kernel source string, compiled lazily by
# AsyncCompile; do not edit the string body.
# Semantics: elementwise fp16 -> fp32 cast of 7644 values (get_args: a
# (156, 1, 7, 7) tensor — presumably the 7x7 depthwise conv weight of the
# same mixed-kernel group as kernels 118/122/124; confirm against the caller).
triton_poi_fused__to_copy_120 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[8192], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_120(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 7644
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((156, 1, 7, 7), (49, 49, 7, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((156, 1, 7, 7), (49, 49, 7, 1), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_120.run(*args, 7644, grid=grid(7644), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_120.benchmark_all_configs(*args, 7644, grid=grid(7644))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/c6/cc6x33tgwrw2tamqam7mfhx5ek6rlk7b3nnl7psxxxxvdr23nguf.py
# Original ATen: aten.convolution_backward
# aten.convolution_backward => convolution_backward_77
# NOTE(review): Inductor-generated kernel source string, compiled lazily by
# AsyncCompile; do not edit the string body.
# Semantics: same pattern as kernels 117/119 but for channels 156..311 of the
# (128, 624, 14, 14) fp32 gradient (input offset 30576 = 156*196, per-channel
# param offset +156): scale by in_ptr1*in_ptr2 per channel, cast to fp16, and
# write a contiguous (128, 156, 14, 14) slice for a group's conv backward.
triton_poi_fused_convolution_backward_121 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp32', 2: '*fp32', 3: '*fp16', 4: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4), equal_to_1=())]})
@triton.jit
def triton_poi_fused_convolution_backward_121(in_ptr0, in_ptr1, in_ptr2, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 3913728
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x2 = (xindex // 30576)
x3 = xindex % 30576
x1 = (xindex // 196) % 156
x4 = xindex
tmp0 = tl.load(in_ptr0 + (30576 + x3 + (122304*x2)), None)
tmp1 = tl.load(in_ptr1 + (156 + x1), None)
tmp2 = tl.load(in_ptr2 + (156 + x1), None)
tmp3 = tmp1 * tmp2
tmp4 = tmp0 * tmp3
tmp5 = tmp4.to(tl.float32)
tl.store(out_ptr0 + (x4 + tl.zeros([XBLOCK], tl.int32)), tmp5, None)
def get_args():
arg_0 = rand_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((624,), (1,), device='cuda:0', dtype=torch.float32)
arg_2 = rand_strided((624,), (1,), device='cuda:0', dtype=torch.float32)
arg_3 = rand_strided((128, 156, 14, 14), (30576, 196, 14, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1, arg_2, arg_3,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_convolution_backward_121.run(*args, 3913728, grid=grid(3913728), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_convolution_backward_121.benchmark_all_configs(*args, 3913728, grid=grid(3913728))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/bo/cbod67uuatalyrpqcdu3oy7sbucan6ecl2ia4zk35wtdojhhxonj.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_607
# NOTE(review): Inductor-generated kernel source string, compiled lazily by
# AsyncCompile; do not edit the string body.
# Semantics: elementwise fp16 -> fp32 cast of 3900 values (get_args: a
# (156, 1, 5, 5) tensor — presumably the 5x5 depthwise conv weight of the
# same mixed-kernel group as kernels 118/120/124; confirm against the caller).
triton_poi_fused__to_copy_122 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4096], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_122(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 3900
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((156, 1, 5, 5), (25, 25, 5, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((156, 1, 5, 5), (25, 25, 5, 1), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_122.run(*args, 3900, grid=grid(3900), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_122.benchmark_all_configs(*args, 3900, grid=grid(3900))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/rc/crclip5jefet3ymndwzega4qujbberu5gvtz4mjx4deqjbqd4sg3.py
# Original ATen: aten.convolution_backward
# aten.convolution_backward => convolution_backward_78
# NOTE(review): Inductor-generated kernel source string, compiled lazily by
# AsyncCompile; do not edit the string body.
# Semantics: same pattern as kernels 117/119/121 but for the first slice,
# channels 0..155 of the (128, 624, 14, 14) fp32 gradient (no input or
# per-channel offset): scale by in_ptr1*in_ptr2 per channel, cast to fp16,
# and write a contiguous (128, 156, 14, 14) slice for a group's conv backward.
triton_poi_fused_convolution_backward_123 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp32', 2: '*fp32', 3: '*fp16', 4: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4), equal_to_1=())]})
@triton.jit
def triton_poi_fused_convolution_backward_123(in_ptr0, in_ptr1, in_ptr2, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 3913728
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x2 = (xindex // 30576)
x3 = xindex % 30576
x1 = (xindex // 196) % 156
x4 = xindex
tmp0 = tl.load(in_ptr0 + (x3 + (122304*x2)), None)
tmp1 = tl.load(in_ptr1 + (x1), None)
tmp2 = tl.load(in_ptr2 + (x1), None)
tmp3 = tmp1 * tmp2
tmp4 = tmp0 * tmp3
tmp5 = tmp4.to(tl.float32)
tl.store(out_ptr0 + (x4 + tl.zeros([XBLOCK], tl.int32)), tmp5, None)
def get_args():
arg_0 = rand_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((624,), (1,), device='cuda:0', dtype=torch.float32)
arg_2 = rand_strided((624,), (1,), device='cuda:0', dtype=torch.float32)
arg_3 = rand_strided((128, 156, 14, 14), (30576, 196, 14, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1, arg_2, arg_3,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_convolution_backward_123.run(*args, 3913728, grid=grid(3913728), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_convolution_backward_123.benchmark_all_configs(*args, 3913728, grid=grid(3913728))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/eu/ceukfvbcwohfoz2lavfjfb2ycp454er54mrt4pq2hnohxkqujrr3.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_608
# NOTE(review): Inductor-generated kernel source string, compiled lazily by
# AsyncCompile; do not edit the string body.
# Semantics: elementwise fp16 -> fp32 cast of 1404 values (get_args: a
# (156, 1, 3, 3) tensor — presumably the 3x3 depthwise conv weight of the
# same mixed-kernel group as kernels 118/120/122; confirm against the caller).
triton_poi_fused__to_copy_124 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[2048], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_124(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 1404
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((156, 1, 3, 3), (9, 9, 3, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((156, 1, 3, 3), (9, 9, 3, 1), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_124.run(*args, 1404, grid=grid(1404), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_124.benchmark_all_configs(*args, 1404, grid=grid(1404))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/mj/cmjohiwxqtsa56w5lbbo3aubkq7zy3joom3fttzn7hijt3v2ims3.py
# Original ATen: aten.cat
# aten.cat => cat_58
# NOTE(review): Inductor-generated kernel source string, compiled lazily by
# AsyncCompile; do not edit the string body.
# Semantics: copies a contiguous (128, 156, 14, 14) fp16 tensor into a larger
# buffer whose per-batch stride is 122304 (= 624*196), writing at offset 0 of
# each batch row — i.e. it fills the first 156-channel slot of a 624-channel
# destination; together with similar kernels writing the other slots this
# realizes the aten.cat along the channel dimension.
triton_poi_fused_cat_125 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_cat_125(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 3913728
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x2 = xindex
x0 = xindex % 30576
x1 = (xindex // 30576)
tmp0 = tl.load(in_ptr0 + (x2), None).to(tl.float32)
tl.store(out_ptr0 + (x0 + (122304*x1) + tl.zeros([XBLOCK], tl.int32)), tmp0, None)
def get_args():
arg_0 = rand_strided((128, 156, 14, 14), (30576, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 156, 14, 14), (122304, 196, 14, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_cat_125.run(*args, 3913728, grid=grid(3913728), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_cat_125.benchmark_all_configs(*args, 3913728, grid=grid(3913728))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/rn/crn2e5wpsvghyj2fewhvokcjgyf5y4tcls4p5a3zgbxt3jo63qp5.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_612
triton_poi_fused__to_copy_126 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[16384], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_126(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 16224
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((312, 52, 1, 1), (52, 1, 1, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((312, 52, 1, 1), (52, 1, 1, 1), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_126.run(*args, 16224, grid=grid(16224), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_126.benchmark_all_configs(*args, 16224, grid=grid(16224))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/if/cifaxtccprphcf3f7mkgsgusdq7va54imzb4ozr5t5rqz26r4xz3.py
# Original ATen: aten.cat
# aten.cat => cat_59
triton_poi_fused_cat_127 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[2097152], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_cat_127(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 1304576
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x2 = xindex
x0 = xindex % 10192
x1 = (xindex // 10192)
tmp0 = tl.load(in_ptr0 + (x2), None).to(tl.float32)
tl.store(out_ptr0 + (x0 + (20384*x1) + tl.zeros([XBLOCK], tl.int32)), tmp0, None)
def get_args():
arg_0 = rand_strided((128, 52, 14, 14), (10192, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 52, 14, 14), (20384, 196, 14, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_cat_127.run(*args, 1304576, grid=grid(1304576), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_cat_127.benchmark_all_configs(*args, 1304576, grid=grid(1304576))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/qf/cqfc2p7t5thx2bnsyx6ihjsqv65lizzzkgfndnkunfne6ahlxrbx.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.add, aten.native_batch_norm_backward
# aten._native_batch_norm_legit_functional => convert_element_type_189
# aten.add => add_346
# aten.native_batch_norm_backward => convert_element_type_614, mul_839, sub_206, sum_85, sum_86
triton_red_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_128 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@reduction(
size_hints=[512, 8192],
reduction_hint=ReductionHint.INNER,
filename=__file__,
meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp32', 4: '*fp32', 5: '*fp32', 6: 'i32', 7: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_128(in_ptr0, in_ptr1, in_ptr2, in_ptr3, out_ptr0, out_ptr1, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
xnumel = 416
rnumel = 6272
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
xmask = xindex < xnumel
rbase = tl.arange(0, RBLOCK)[None, :]
x0 = xindex % 104
x1 = (xindex // 104)
_tmp4 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
x3 = xindex
tmp7 = tl.load(in_ptr3 + (x0), xmask)
_tmp10 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
for roffset in range(0, rnumel, RBLOCK):
rindex = roffset + rbase
rmask = rindex < rnumel
r2 = rindex
tmp0 = tl.load(in_ptr0 + ((196*x0) + (20384*(r2 // 196)) + (652288*x1) + (r2 % 196)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp1 = tl.load(in_ptr1 + ((196*x0) + (20384*(r2 // 196)) + (652288*x1) + (r2 % 196)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp5 = tl.load(in_ptr2 + ((196*x0) + (20384*(r2 // 196)) + (652288*x1) + (r2 % 196)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp2 = tmp0 + tmp1
tmp3 = tmp2.to(tl.float32)
_tmp4 = tl.where(rmask & xmask, _tmp4 + tmp3, _tmp4)
tmp6 = tmp5.to(tl.float32)
tmp8 = tmp6 - tmp7
tmp9 = tmp3 * tmp8
_tmp10 = tl.where(rmask & xmask, _tmp10 + tmp9, _tmp10)
tmp4 = tl.sum(_tmp4, 1)[:, None]
tl.store(out_ptr0 + x3, tmp4, xmask)
tmp10 = tl.sum(_tmp10, 1)[:, None]
tl.store(out_ptr1 + x3, tmp10, xmask)
def get_args():
arg_0 = rand_strided((128, 104, 14, 14), (20384, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 104, 14, 14), (20384, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_2 = rand_strided((128, 104, 14, 14), (20384, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_3 = rand_strided((1, 104, 1, 1), (104, 1, 1, 1), device='cuda:0', dtype=torch.float32)
arg_4 = rand_strided((104, 4), (1, 104), device='cuda:0', dtype=torch.float32)
arg_5 = rand_strided((104, 4), (1, 104), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_red_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_128.run(*args, 416, 6272, grid=grid(416), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_red_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_128.benchmark_all_configs(*args, 416, 6272, grid=grid(416))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/55/c55duaucwd5mqi2jcpis5psliz6udgc3zunl4l6gsysumyukuodb.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.add, aten.native_batch_norm_backward
# aten._native_batch_norm_legit_functional => convert_element_type_189
# aten.add => add_346
# aten.native_batch_norm_backward => convert_element_type_614, convert_element_type_616, mul_845, mul_846, sub_206, sub_208, sub_209
triton_poi_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_129 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp32', 4: '*fp32', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: '*fp16', 9: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9), equal_to_1=())]})
@triton.jit
def triton_poi_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_129(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, in_ptr6, in_ptr7, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 2609152
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x3 = xindex
x1 = (xindex // 196) % 104
tmp0 = tl.load(in_ptr0 + (x3), None).to(tl.float32)
tmp1 = tl.load(in_ptr1 + (x3), None).to(tl.float32)
tmp4 = tl.load(in_ptr2 + (x3), None).to(tl.float32)
tmp6 = tl.load(in_ptr3 + (x1), None)
tmp8 = tl.load(in_ptr4 + (x1), None)
tmp11 = tl.load(in_ptr5 + (x1), None)
tmp16 = tl.load(in_ptr6 + (x1), None)
tmp19 = tl.load(in_ptr7 + (x1), None)
tmp2 = tmp0 + tmp1
tmp3 = tmp2.to(tl.float32)
tmp5 = tmp4.to(tl.float32)
tmp7 = tmp5 - tmp6
tmp9 = 3.985969387755102e-05
tmp10 = tmp8 * tmp9
tmp12 = tmp11 * tmp11
tmp13 = tmp10 * tmp12
tmp14 = tmp7 * tmp13
tmp15 = tmp3 - tmp14
tmp17 = tmp16 * tmp9
tmp18 = tmp15 - tmp17
tmp20 = tmp11 * tmp19
tmp21 = tmp18 * tmp20
tmp22 = tmp21.to(tl.float32)
tl.store(out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp22, None)
def get_args():
arg_0 = rand_strided((128, 104, 14, 14), (20384, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 104, 14, 14), (20384, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_2 = rand_strided((128, 104, 14, 14), (20384, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_3 = rand_strided((1, 104, 1, 1), (104, 1, 1, 1), device='cuda:0', dtype=torch.float32)
arg_4 = rand_strided((104,), (1,), device='cuda:0', dtype=torch.float32)
arg_5 = rand_strided((104,), (1,), device='cuda:0', dtype=torch.float32)
arg_6 = rand_strided((104,), (1,), device='cuda:0', dtype=torch.float32)
arg_7 = rand_strided((104,), (1,), device='cuda:0', dtype=torch.float32)
arg_8 = rand_strided((128, 104, 14, 14), (20384, 196, 14, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7, arg_8,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_129.run(*args, 2609152, grid=grid(2609152), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_129.benchmark_all_configs(*args, 2609152, grid=grid(2609152))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/gi/cgidg4l74j3mxppzfsjtpi42hq4ihs47f6epftki5i5ywvfqu6wv.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.add, aten.native_batch_norm_backward
# aten._native_batch_norm_legit_functional => convert_element_type_165
# aten.add => add_346, add_351
# aten.native_batch_norm_backward => convert_element_type_638, mul_879, sub_222, sum_94, sum_95
triton_red_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_130 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@reduction(
size_hints=[512, 8192],
reduction_hint=ReductionHint.INNER,
filename=__file__,
meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp16', 4: '*fp32', 5: '*fp32', 6: '*fp32', 7: 'i32', 8: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_130(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, out_ptr0, out_ptr1, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
xnumel = 416
rnumel = 6272
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
xmask = xindex < xnumel
rbase = tl.arange(0, RBLOCK)[None, :]
x0 = xindex % 104
x1 = (xindex // 104)
_tmp6 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
x3 = xindex
tmp9 = tl.load(in_ptr4 + (x0), xmask)
_tmp12 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
for roffset in range(0, rnumel, RBLOCK):
rindex = roffset + rbase
rmask = rindex < rnumel
r2 = rindex
tmp0 = tl.load(in_ptr0 + ((196*x0) + (20384*(r2 // 196)) + (652288*x1) + (r2 % 196)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp1 = tl.load(in_ptr1 + ((196*x0) + (20384*(r2 // 196)) + (652288*x1) + (r2 % 196)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp3 = tl.load(in_ptr2 + ((196*x0) + (20384*(r2 // 196)) + (652288*x1) + (r2 % 196)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp7 = tl.load(in_ptr3 + ((196*x0) + (20384*(r2 // 196)) + (652288*x1) + (r2 % 196)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp2 = tmp0 + tmp1
tmp4 = tmp2 + tmp3
tmp5 = tmp4.to(tl.float32)
_tmp6 = tl.where(rmask & xmask, _tmp6 + tmp5, _tmp6)
tmp8 = tmp7.to(tl.float32)
tmp10 = tmp8 - tmp9
tmp11 = tmp5 * tmp10
_tmp12 = tl.where(rmask & xmask, _tmp12 + tmp11, _tmp12)
tmp6 = tl.sum(_tmp6, 1)[:, None]
tl.store(out_ptr0 + x3, tmp6, xmask)
tmp12 = tl.sum(_tmp12, 1)[:, None]
tl.store(out_ptr1 + x3, tmp12, xmask)
def get_args():
arg_0 = rand_strided((128, 104, 14, 14), (20384, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 104, 14, 14), (20384, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_2 = rand_strided((128, 104, 14, 14), (20384, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_3 = rand_strided((128, 104, 14, 14), (20384, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_4 = rand_strided((1, 104, 1, 1), (104, 1, 1, 1), device='cuda:0', dtype=torch.float32)
arg_5 = rand_strided((104, 4), (1, 104), device='cuda:0', dtype=torch.float32)
arg_6 = rand_strided((104, 4), (1, 104), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_red_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_130.run(*args, 416, 6272, grid=grid(416), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_red_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_130.benchmark_all_configs(*args, 416, 6272, grid=grid(416))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/nf/cnfybinrqgwpzwkognoekbtqzfldbno4q6jduhaywjxls5lwqbsd.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.add, aten.native_batch_norm_backward
# aten._native_batch_norm_legit_functional => convert_element_type_165
# aten.add => add_346, add_351
# aten.native_batch_norm_backward => convert_element_type_638, mul_885, mul_886, sub_222, sub_224, sub_225
triton_poi_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_131 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp16', 4: '*fp32', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: '*fp32', 9: '*fp32', 10: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10), equal_to_1=())]})
@triton.jit
def triton_poi_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_131(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, in_ptr6, in_ptr7, in_ptr8, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 2609152
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x3 = xindex
x1 = (xindex // 196) % 104
tmp0 = tl.load(in_ptr0 + (x3), None).to(tl.float32)
tmp1 = tl.load(in_ptr1 + (x3), None).to(tl.float32)
tmp3 = tl.load(in_ptr2 + (x3), None).to(tl.float32)
tmp6 = tl.load(in_ptr3 + (x3), None).to(tl.float32)
tmp8 = tl.load(in_ptr4 + (x1), None)
tmp10 = tl.load(in_ptr5 + (x1), None)
tmp13 = tl.load(in_ptr6 + (x1), None)
tmp18 = tl.load(in_ptr7 + (x1), None)
tmp21 = tl.load(in_ptr8 + (x1), None)
tmp2 = tmp0 + tmp1
tmp4 = tmp2 + tmp3
tmp5 = tmp4.to(tl.float32)
tmp7 = tmp6.to(tl.float32)
tmp9 = tmp7 - tmp8
tmp11 = 3.985969387755102e-05
tmp12 = tmp10 * tmp11
tmp14 = tmp13 * tmp13
tmp15 = tmp12 * tmp14
tmp16 = tmp9 * tmp15
tmp17 = tmp5 - tmp16
tmp19 = tmp18 * tmp11
tmp20 = tmp17 - tmp19
tmp22 = tmp13 * tmp21
tmp23 = tmp20 * tmp22
tl.store(out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp23, None)
def get_args():
arg_0 = rand_strided((128, 104, 14, 14), (20384, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 104, 14, 14), (20384, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_2 = rand_strided((128, 104, 14, 14), (20384, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_3 = rand_strided((128, 104, 14, 14), (20384, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_4 = rand_strided((1, 104, 1, 1), (104, 1, 1, 1), device='cuda:0', dtype=torch.float32)
arg_5 = rand_strided((104,), (1,), device='cuda:0', dtype=torch.float32)
arg_6 = rand_strided((104,), (1,), device='cuda:0', dtype=torch.float32)
arg_7 = rand_strided((104,), (1,), device='cuda:0', dtype=torch.float32)
arg_8 = rand_strided((104,), (1,), device='cuda:0', dtype=torch.float32)
arg_9 = rand_strided((128, 104, 14, 14), (20384, 196, 14, 1), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7, arg_8, arg_9,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_131.run(*args, 2609152, grid=grid(2609152), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_131.benchmark_all_configs(*args, 2609152, grid=grid(2609152))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/e6/ce6moc5iqgt57e7eqjuf5zhxhh5tsl3aixs4uw2xn3bhmzxi4yko.py
# Original ATen: aten.convolution_backward
# aten.convolution_backward => convolution_backward_91
triton_poi_fused_convolution_backward_132 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[2097152], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_convolution_backward_132(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 1304576
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex % 10192
x1 = (xindex // 10192)
x2 = xindex
tmp0 = tl.load(in_ptr0 + (10192 + x0 + (20384*x1)), None)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp1, None)
def get_args():
arg_0 = rand_strided((128, 104, 14, 14), (20384, 196, 14, 1), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((128, 52, 14, 14), (10192, 196, 14, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_convolution_backward_132.run(*args, 1304576, grid=grid(1304576), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_convolution_backward_132.benchmark_all_configs(*args, 1304576, grid=grid(1304576))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/dk/cdkheqxbj4lk4wbn4mfiw2m45ufmxv76zkupgnoqujmqz7cf3flk.py
# Original ATen: aten.convolution_backward
# aten.convolution_backward => convolution_backward_92
triton_poi_fused_convolution_backward_133 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[2097152], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_convolution_backward_133(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 1304576
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex % 10192
x1 = (xindex // 10192)
x2 = xindex
tmp0 = tl.load(in_ptr0 + (x0 + (20384*x1)), None)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp1, None)
def get_args():
arg_0 = rand_strided((128, 104, 14, 14), (20384, 196, 14, 1), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((128, 52, 14, 14), (10192, 196, 14, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_convolution_backward_133.run(*args, 1304576, grid=grid(1304576), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_convolution_backward_133.benchmark_all_configs(*args, 1304576, grid=grid(1304576))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/w3/cw3zbb3x7kiu4ifygfhy3eaz6tyephtwgkxcx7rkmsrsvtrbrcix.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.add, aten.native_batch_norm_backward
# aten._native_batch_norm_legit_functional => convert_element_type_141
# aten.add => add_346, add_351, add_356
# aten.native_batch_norm_backward => convert_element_type_662, mul_919, sub_238, sum_103, sum_104
triton_red_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_134 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@reduction(
size_hints=[512, 8192],
reduction_hint=ReductionHint.INNER,
filename=__file__,
meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp16', 4: '*fp16', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: 'i32', 9: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_134(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, out_ptr0, out_ptr1, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
xnumel = 416
rnumel = 6272
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
xmask = xindex < xnumel
rbase = tl.arange(0, RBLOCK)[None, :]
x0 = xindex % 104
x1 = (xindex // 104)
_tmp8 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
x3 = xindex
tmp11 = tl.load(in_ptr5 + (x0), xmask)
_tmp14 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
for roffset in range(0, rnumel, RBLOCK):
rindex = roffset + rbase
rmask = rindex < rnumel
r2 = rindex
tmp0 = tl.load(in_ptr0 + ((196*x0) + (20384*(r2 // 196)) + (652288*x1) + (r2 % 196)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp1 = tl.load(in_ptr1 + ((196*x0) + (20384*(r2 // 196)) + (652288*x1) + (r2 % 196)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp3 = tl.load(in_ptr2 + ((196*x0) + (20384*(r2 // 196)) + (652288*x1) + (r2 % 196)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp5 = tl.load(in_ptr3 + ((196*x0) + (20384*(r2 // 196)) + (652288*x1) + (r2 % 196)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp9 = tl.load(in_ptr4 + ((196*x0) + (20384*(r2 // 196)) + (652288*x1) + (r2 % 196)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp2 = tmp0 + tmp1
tmp4 = tmp2 + tmp3
tmp6 = tmp4 + tmp5
tmp7 = tmp6.to(tl.float32)
_tmp8 = tl.where(rmask & xmask, _tmp8 + tmp7, _tmp8)
tmp10 = tmp9.to(tl.float32)
tmp12 = tmp10 - tmp11
tmp13 = tmp7 * tmp12
_tmp14 = tl.where(rmask & xmask, _tmp14 + tmp13, _tmp14)
tmp8 = tl.sum(_tmp8, 1)[:, None]
tl.store(out_ptr0 + x3, tmp8, xmask)
tmp14 = tl.sum(_tmp14, 1)[:, None]
tl.store(out_ptr1 + x3, tmp14, xmask)
def get_args():
arg_0 = rand_strided((128, 104, 14, 14), (20384, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 104, 14, 14), (20384, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_2 = rand_strided((128, 104, 14, 14), (20384, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_3 = rand_strided((128, 104, 14, 14), (20384, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_4 = rand_strided((128, 104, 14, 14), (20384, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_5 = rand_strided((1, 104, 1, 1), (104, 1, 1, 1), device='cuda:0', dtype=torch.float32)
arg_6 = rand_strided((104, 4), (1, 104), device='cuda:0', dtype=torch.float32)
arg_7 = rand_strided((104, 4), (1, 104), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_red_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_134.run(*args, 416, 6272, grid=grid(416), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_red_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_134.benchmark_all_configs(*args, 416, 6272, grid=grid(416))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/ej/cejoyk2b6v7darhrxgzp23ovzotirkownmjt7wscn7uwlctg5oxk.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.add, aten.convolution_backward, aten.native_batch_norm_backward
# aten._native_batch_norm_legit_functional => convert_element_type_141
# aten.add => add_346, add_351, add_356
# aten.convolution_backward => convolution_backward_101
# aten.native_batch_norm_backward => convert_element_type_662, convert_element_type_664, mul_925, mul_926, sub_238, sub_240, sub_241
triton_poi_fused__native_batch_norm_legit_functional_add_convolution_backward_native_batch_norm_backward_135 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp16', 4: '*fp16', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: '*fp32', 9: '*fp32', 10: '*fp16', 11: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11), equal_to_1=())]})
@triton.jit
def triton_poi_fused__native_batch_norm_legit_functional_add_convolution_backward_native_batch_norm_backward_135(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, in_ptr6, in_ptr7, in_ptr8, in_ptr9, out_ptr1, xnumel, XBLOCK : tl.constexpr):
xnumel = 2609152
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x3 = xindex
x1 = (xindex // 196) % 104
tmp0 = tl.load(in_ptr0 + (x3), None).to(tl.float32)
tmp1 = tl.load(in_ptr1 + (x3), None).to(tl.float32)
tmp3 = tl.load(in_ptr2 + (x3), None).to(tl.float32)
tmp5 = tl.load(in_ptr3 + (x3), None).to(tl.float32)
tmp8 = tl.load(in_ptr4 + (x3), None).to(tl.float32)
tmp10 = tl.load(in_ptr5 + (x1), None)
tmp12 = tl.load(in_ptr6 + (x1), None)
tmp15 = tl.load(in_ptr7 + (x1), None)
tmp20 = tl.load(in_ptr8 + (x1), None)
tmp23 = tl.load(in_ptr9 + (x1), None)
tmp2 = tmp0 + tmp1
tmp4 = tmp2 + tmp3
tmp6 = tmp4 + tmp5
tmp7 = tmp6.to(tl.float32)
tmp9 = tmp8.to(tl.float32)
tmp11 = tmp9 - tmp10
tmp13 = 3.985969387755102e-05
tmp14 = tmp12 * tmp13
tmp16 = tmp15 * tmp15
tmp17 = tmp14 * tmp16
tmp18 = tmp11 * tmp17
tmp19 = tmp7 - tmp18
tmp21 = tmp20 * tmp13
tmp22 = tmp19 - tmp21
tmp24 = tmp15 * tmp23
tmp25 = tmp22 * tmp24
tmp26 = tmp25.to(tl.float32)
tl.store(out_ptr1 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp26, None)
def get_args():
arg_0 = rand_strided((128, 104, 14, 14), (20384, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 104, 14, 14), (20384, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_2 = rand_strided((128, 104, 14, 14), (20384, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_3 = rand_strided((128, 104, 14, 14), (20384, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_4 = rand_strided((128, 104, 14, 14), (20384, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_5 = rand_strided((1, 104, 1, 1), (104, 1, 1, 1), device='cuda:0', dtype=torch.float32)
arg_6 = rand_strided((104,), (1,), device='cuda:0', dtype=torch.float32)
arg_7 = rand_strided((104,), (1,), device='cuda:0', dtype=torch.float32)
arg_8 = rand_strided((104,), (1,), device='cuda:0', dtype=torch.float32)
arg_9 = rand_strided((104,), (1,), device='cuda:0', dtype=torch.float32)
arg_10 = rand_strided((128, 104, 14, 14), (20384, 196, 14, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7, arg_8, arg_9, arg_10,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__native_batch_norm_legit_functional_add_convolution_backward_native_batch_norm_backward_135.run(*args, 2609152, grid=grid(2609152), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__native_batch_norm_legit_functional_add_convolution_backward_native_batch_norm_backward_135.benchmark_all_configs(*args, 2609152, grid=grid(2609152))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/5h/c5hpdahtkfn4l3dhqtwm7jlr542h6muhvdgkqrxb44tec4smg4fm.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_665
triton_poi_fused__to_copy_136 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[65536], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_136(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 34944
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((104, 336, 1, 1), (336, 1, 1, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((104, 336, 1, 1), (336, 1, 1, 1), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_136.run(*args, 34944, grid=grid(34944), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_136.benchmark_all_configs(*args, 34944, grid=grid(34944))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/bc/cbch6pezjexqee4mvwdnar33u6mvs47umwhwas6d2blv7733qelg.py
# Original ATen: aten.mul, aten.sigmoid, aten.sigmoid_backward, aten.silu, aten.sum
# aten.mul => mul_928
# aten.sigmoid => sigmoid_19
# aten.sigmoid_backward => convert_element_type_666, convert_element_type_667, convert_element_type_668, mul_930, mul_931, sub_242
# aten.silu => convert_element_type_132, convert_element_type_133, mul_178, sigmoid_17
# aten.sum => sum_105
triton_per_fused_mul_sigmoid_sigmoid_backward_silu_sum_137 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import persistent_reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@persistent_reduction(
size_hints=[65536, 256],
reduction_hint=ReductionHint.INNER,
filename=__file__,
meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp16', 4: 'i32', 5: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4), equal_to_1=())]}
)
@triton.jit
def triton_per_fused_mul_sigmoid_sigmoid_backward_silu_sum_137(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, xnumel, rnumel, XBLOCK : tl.constexpr):
xnumel = 43008
rnumel = 196
RBLOCK: tl.constexpr = 256
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
xmask = xindex < xnumel
rindex = tl.arange(0, RBLOCK)[None, :]
rmask = rindex < rnumel
r1 = rindex
x0 = xindex
tmp0 = tl.load(in_ptr0 + (r1 + (196*x0)), rmask, other=0).to(tl.float32)
tmp1 = tl.load(in_ptr1 + (r1 + (196*x0)), rmask, other=0).to(tl.float32)
tmp11 = tl.load(in_ptr2 + (x0), None).to(tl.float32)
tmp2 = tmp1.to(tl.float32)
tmp3 = tl.sigmoid(tmp2)
tmp4 = tmp2 * tmp3
tmp5 = tmp4.to(tl.float32)
tmp6 = tmp0 * tmp5
tmp8 = tl.where(rmask, tmp6, 0)
tmp9 = tl.sum(tmp8, 1)[:, None]
tmp10 = tmp9.to(tl.float32)
tmp12 = tl.sigmoid(tmp11)
tmp13 = tmp12.to(tl.float32)
tmp14 = 1.0
tmp15 = tmp14 - tmp13
tmp16 = tmp13 * tmp15
tmp17 = tmp10 * tmp16
tmp18 = tmp17.to(tl.float32)
tl.store(in_out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp18, None)
def get_args():
arg_0 = rand_strided((128, 336, 1, 1), (336, 1, 1, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 336, 14, 14), (65856, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_2 = rand_strided((128, 336, 14, 14), (65856, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_3 = rand_strided((128, 336, 1, 1), (336, 1, 1, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1, arg_2, arg_3,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_per_fused_mul_sigmoid_sigmoid_backward_silu_sum_137.run(*args, 43008, 196, grid=grid(43008), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_per_fused_mul_sigmoid_sigmoid_backward_silu_sum_137.benchmark_all_configs(*args, 43008, 196, grid=grid(43008))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/lw/clwggjp2glb6lt7ovt2lc3pzpsocchmesscnc7d3a6zsnanmt43m.py
# Original ATen: aten._to_copy, aten.convolution_backward
# aten._to_copy => convert_element_type_670
# aten.convolution_backward => sum_106
triton_per_fused__to_copy_convolution_backward_138 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import persistent_reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@persistent_reduction(
size_hints=[512, 128],
reduction_hint=ReductionHint.INNER,
filename=__file__,
meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32', 3: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3), equal_to_1=())]}
)
@triton.jit
def triton_per_fused__to_copy_convolution_backward_138(in_ptr0, out_ptr1, xnumel, rnumel, XBLOCK : tl.constexpr):
xnumel = 336
rnumel = 128
RBLOCK: tl.constexpr = 128
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
xmask = xindex < xnumel
rindex = tl.arange(0, RBLOCK)[None, :]
rmask = rindex < rnumel
r1 = rindex
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0 + (336*r1)), rmask & xmask, other=0).to(tl.float32)
tmp2 = tl.where(rmask & xmask, tmp0, 0)
tmp3 = tl.sum(tmp2, 1)[:, None]
tmp4 = tmp3.to(tl.float32)
tl.store(out_ptr1 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp4, xmask)
def get_args():
arg_0 = rand_strided((128, 336, 1, 1), (336, 1, 1, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_per_fused__to_copy_convolution_backward_138.run(*args, 336, 128, grid=grid(336), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_per_fused__to_copy_convolution_backward_138.benchmark_all_configs(*args, 336, 128, grid=grid(336))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/ny/cnyioznughbctzhfqmivqbazouomcdvadukxtk2rxq6q22lk25nc.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_669
triton_poi_fused__to_copy_139 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[8192], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_139(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 4704
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((336, 14, 1, 1), (14, 1, 1, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((336, 14, 1, 1), (14, 1, 1, 1), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_139.run(*args, 4704, grid=grid(4704), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_139.benchmark_all_configs(*args, 4704, grid=grid(4704))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/xy/cxy6onpt27zwbftqiiz3rmsdtwpb6ifqj55hnehr2kljkwmm2una.py
# Original ATen: aten.add, aten.clone, aten.fill, aten.mul, aten.sigmoid, aten.sub
# aten.add => add_357
# aten.clone => clone_14
# aten.fill => full_like_33
# aten.mul => mul_932, mul_933, mul_934
# aten.sigmoid => sigmoid_97
# aten.sub => sub_243
triton_poi_fused_add_clone_fill_mul_sigmoid_sub_140 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[2048], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_add_clone_fill_mul_sigmoid_sub_140(in_out_ptr0, in_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 1792
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_out_ptr0 + (x0), xmask).to(tl.float32)
tmp1 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
tmp2 = tl.sigmoid(tmp1)
tmp3 = 1.0
tmp4 = tmp3 - tmp2
tmp5 = tmp1 * tmp4
tmp6 = tmp5 + tmp3
tmp7 = tmp2 * tmp6
tmp8 = tmp0 * tmp7
tl.store(in_out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp8, xmask)
def get_args():
arg_0 = rand_strided((128, 14, 1, 1), (14, 1, 1, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 14, 1, 1), (14, 1, 1, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_add_clone_fill_mul_sigmoid_sub_140.run(*args, 1792, grid=grid(1792), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_add_clone_fill_mul_sigmoid_sub_140.benchmark_all_configs(*args, 1792, grid=grid(1792))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/af/cafrvpfoaadhdlxhldphg5leeyz2enwfb64ifixaudfchrcwkgns.py
# Original ATen: aten._to_copy, aten.convolution_backward
# aten._to_copy => convert_element_type_672
# aten.convolution_backward => sum_107
triton_per_fused__to_copy_convolution_backward_141 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import persistent_reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@persistent_reduction(
size_hints=[16, 128],
reduction_hint=ReductionHint.INNER,
filename=__file__,
meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32', 3: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 3), equal_to_1=())]}
)
@triton.jit
def triton_per_fused__to_copy_convolution_backward_141(in_ptr0, out_ptr1, xnumel, rnumel, XBLOCK : tl.constexpr):
xnumel = 14
rnumel = 128
RBLOCK: tl.constexpr = 128
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
xmask = xindex < xnumel
rindex = tl.arange(0, RBLOCK)[None, :]
rmask = rindex < rnumel
r1 = rindex
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0 + (14*r1)), rmask & xmask, other=0).to(tl.float32)
tmp2 = tl.where(rmask & xmask, tmp0, 0)
tmp3 = tl.sum(tmp2, 1)[:, None]
tmp4 = tmp3.to(tl.float32)
tl.store(out_ptr1 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp4, xmask)
def get_args():
arg_0 = rand_strided((128, 14, 1, 1), (14, 1, 1, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((14,), (1,), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_per_fused__to_copy_convolution_backward_141.run(*args, 14, 128, grid=grid(14), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_per_fused__to_copy_convolution_backward_141.benchmark_all_configs(*args, 14, 128, grid=grid(14))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/ub/cubt2wopxldea56edxrzorl54nnvgwjendwqgv23wj6it7xjbynx.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_671
triton_poi_fused__to_copy_142 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[8192], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_142(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 4704
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((14, 336, 1, 1), (336, 1, 1, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((14, 336, 1, 1), (336, 1, 1, 1), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_142.run(*args, 4704, grid=grid(4704), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_142.benchmark_all_configs(*args, 4704, grid=grid(4704))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/pw/cpwcwf6uhmujsg7jk3lvfx67kggnjin7oj2locjyrnecqe2jebap.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.add, aten.clone, aten.div, aten.fill, aten.mul, aten.native_batch_norm_backward, aten.sigmoid, aten.sub
# aten._native_batch_norm_legit_functional => convert_element_type_130
# aten.add => add_358, add_359
# aten.clone => clone_13
# aten.div => div_12
# aten.fill => full_like_34
# aten.mul => mul_929, mul_935, mul_936, mul_937
# aten.native_batch_norm_backward => convert_element_type_673, mul_938, mul_946, sub_245, sum_108, sum_109
# aten.sigmoid => sigmoid_19, sigmoid_98
# aten.sub => sub_244
triton_red_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_143 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@reduction(
size_hints=[512, 32768],
reduction_hint=ReductionHint.INNER,
filename=__file__,
meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp16', 4: '*fp16', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: '*fp32', 9: '*fp32', 10: 'i32', 11: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_143(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, in_ptr6, out_ptr0, out_ptr1, out_ptr2, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
xnumel = 336
rnumel = 25088
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
xmask = xindex < xnumel
rbase = tl.arange(0, RBLOCK)[None, :]
x0 = xindex
_tmp17 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
tmp20 = tl.load(in_ptr5 + (x0), xmask)
_tmp23 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
for roffset in range(0, rnumel, RBLOCK):
rindex = roffset + rbase
rmask = rindex < rnumel
r1 = rindex % 196
r2 = (rindex // 196)
tmp0 = tl.load(in_ptr0 + (r1 + (196*x0) + (65856*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp1 = tl.load(in_ptr1 + (x0 + (336*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp4 = tl.load(in_ptr2 + (x0 + (336*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp8 = tl.load(in_ptr3 + (r1 + (196*x0) + (65856*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp18 = tl.load(in_ptr4 + (r1 + (196*x0) + (65856*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp2 = tl.sigmoid(tmp1)
tmp3 = tmp0 * tmp2
tmp5 = 196.0
tmp6 = tmp4 / tmp5
tmp7 = tmp3 + tmp6
tmp9 = tl.sigmoid(tmp8)
tmp10 = 1.0
tmp11 = tmp10 - tmp9
tmp12 = tmp8 * tmp11
tmp13 = tmp12 + tmp10
tmp14 = tmp9 * tmp13
tmp15 = tmp7 * tmp14
tmp16 = tmp15.to(tl.float32)
_tmp17 = tl.where(rmask & xmask, _tmp17 + tmp16, _tmp17)
tmp19 = tmp18.to(tl.float32)
tmp21 = tmp19 - tmp20
tmp22 = tmp16 * tmp21
_tmp23 = tl.where(rmask & xmask, _tmp23 + tmp22, _tmp23)
tmp17 = tl.sum(_tmp17, 1)[:, None]
tl.store(out_ptr0 + x0, tmp17, xmask)
tmp23 = tl.sum(_tmp23, 1)[:, None]
tl.store(out_ptr1 + x0, tmp23, xmask)
tmp24 = tl.load(in_ptr6 + (x0), xmask)
tmp25 = tmp23 * tmp24
tl.store(out_ptr2 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp25, xmask)
def get_args():
arg_0 = rand_strided((128, 336, 14, 14), (65856, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 336, 1, 1), (336, 1, 1, 1), device='cuda:0', dtype=torch.float16)
arg_2 = rand_strided((128, 336, 1, 1), (336, 1, 1, 1), device='cuda:0', dtype=torch.float16)
arg_3 = rand_strided((128, 336, 14, 14), (65856, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_4 = rand_strided((128, 336, 14, 14), (65856, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_5 = rand_strided((1, 336, 1, 1), (336, 1, 1, 1), device='cuda:0', dtype=torch.float32)
arg_6 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32)
arg_7 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32)
arg_8 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32)
arg_9 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7, arg_8, arg_9,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_red_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_143.run(*args, 336, 25088, grid=grid(336), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_red_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_143.benchmark_all_configs(*args, 336, 25088, grid=grid(336))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/7p/c7psulr34da3bohr63ebglp7ja7h35fmmewyuanlwy56v67tyr3o.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.add, aten.clone, aten.div, aten.fill, aten.mul, aten.native_batch_norm_backward, aten.sigmoid, aten.sub
# aten._native_batch_norm_legit_functional => convert_element_type_130
# aten.add => add_358, add_359
# aten.clone => clone_13
# aten.div => div_12
# aten.fill => full_like_34
# aten.mul => mul_929, mul_935, mul_936, mul_937
# aten.native_batch_norm_backward => convert_element_type_673, mul_944, sub_245, sub_247, sub_248
# aten.sigmoid => sigmoid_19, sigmoid_98
# aten.sub => sub_244
triton_poi_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_144 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[16777216], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp16', 4: '*fp16', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: '*fp32', 9: '*fp32', 10: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10), equal_to_1=())]})
@triton.jit
def triton_poi_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_144(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, in_ptr6, in_ptr7, in_ptr8, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 8429568
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x3 = xindex
x4 = (xindex // 196)
x1 = (xindex // 196) % 336
tmp0 = tl.load(in_ptr0 + (x3), None).to(tl.float32)
tmp1 = tl.load(in_ptr1 + (x4), None).to(tl.float32)
tmp4 = tl.load(in_ptr2 + (x4), None).to(tl.float32)
tmp8 = tl.load(in_ptr3 + (x3), None).to(tl.float32)
tmp17 = tl.load(in_ptr4 + (x3), None).to(tl.float32)
tmp19 = tl.load(in_ptr5 + (x1), None)
tmp21 = tl.load(in_ptr6 + (x1), None)
tmp24 = tl.load(in_ptr7 + (x1), None)
tmp29 = tl.load(in_ptr8 + (x1), None)
tmp2 = tl.sigmoid(tmp1)
tmp3 = tmp0 * tmp2
tmp5 = 196.0
tmp6 = tmp4 / tmp5
tmp7 = tmp3 + tmp6
tmp9 = tl.sigmoid(tmp8)
tmp10 = 1.0
tmp11 = tmp10 - tmp9
tmp12 = tmp8 * tmp11
tmp13 = tmp12 + tmp10
tmp14 = tmp9 * tmp13
tmp15 = tmp7 * tmp14
tmp16 = tmp15.to(tl.float32)
tmp18 = tmp17.to(tl.float32)
tmp20 = tmp18 - tmp19
tmp22 = 3.985969387755102e-05
tmp23 = tmp21 * tmp22
tmp25 = tmp24 * tmp24
tmp26 = tmp23 * tmp25
tmp27 = tmp20 * tmp26
tmp28 = tmp16 - tmp27
tmp30 = tmp29 * tmp22
tmp31 = tmp28 - tmp30
tl.store(out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp31, None)
def get_args():
arg_0 = rand_strided((128, 336, 14, 14), (65856, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 336, 1, 1), (336, 1, 1, 1), device='cuda:0', dtype=torch.float16)
arg_2 = rand_strided((128, 336, 1, 1), (336, 1, 1, 1), device='cuda:0', dtype=torch.float16)
arg_3 = rand_strided((128, 336, 14, 14), (65856, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_4 = rand_strided((128, 336, 14, 14), (65856, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_5 = rand_strided((1, 336, 1, 1), (336, 1, 1, 1), device='cuda:0', dtype=torch.float32)
arg_6 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32)
arg_7 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32)
arg_8 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32)
arg_9 = rand_strided((128, 336, 14, 14), (65856, 196, 14, 1), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7, arg_8, arg_9,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_144.run(*args, 8429568, grid=grid(8429568), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_144.benchmark_all_configs(*args, 8429568, grid=grid(8429568))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/id/cidvqsl7tausuxdym44zowgtyle4qgcqwkmvhutx6ld6q33q6idp.py
# Original ATen: aten.convolution_backward
# aten.convolution_backward => convolution_backward_104
triton_poi_fused_convolution_backward_145 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp32', 2: '*fp32', 3: '*fp16', 4: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4), equal_to_1=())]})
@triton.jit
def triton_poi_fused_convolution_backward_145(in_ptr0, in_ptr1, in_ptr2, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 2809856
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x2 = (xindex // 21952)
x3 = xindex % 21952
x1 = (xindex // 196) % 112
x4 = xindex
tmp0 = tl.load(in_ptr0 + (43904 + x3 + (65856*x2)), None)
tmp1 = tl.load(in_ptr1 + (224 + x1), None)
tmp2 = tl.load(in_ptr2 + (224 + x1), None)
tmp3 = tmp1 * tmp2
tmp4 = tmp0 * tmp3
tmp5 = tmp4.to(tl.float32)
tl.store(out_ptr0 + (x4 + tl.zeros([XBLOCK], tl.int32)), tmp5, None)
def get_args():
arg_0 = rand_strided((128, 336, 14, 14), (65856, 196, 14, 1), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32)
arg_2 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32)
arg_3 = rand_strided((128, 112, 14, 14), (21952, 196, 14, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1, arg_2, arg_3,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_convolution_backward_145.run(*args, 2809856, grid=grid(2809856), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_convolution_backward_145.benchmark_all_configs(*args, 2809856, grid=grid(2809856))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/fd/cfdopkndut6mz6wq6bvbwuuulhnrmhdrv5rzyqs6re2kufkzwbyi.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_676
triton_poi_fused__to_copy_146 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[8192], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_146(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 5488
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((112, 1, 7, 7), (49, 49, 7, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((112, 1, 7, 7), (49, 49, 7, 1), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_146.run(*args, 5488, grid=grid(5488), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_146.benchmark_all_configs(*args, 5488, grid=grid(5488))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/th/cthuea2iprfudqgjdcjb4l6mtcfp43ogq3yml625vwzkim7u4s2l.py
# Original ATen: aten.convolution_backward
# aten.convolution_backward => convolution_backward_105
triton_poi_fused_convolution_backward_147 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp32', 2: '*fp32', 3: '*fp16', 4: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4), equal_to_1=())]})
@triton.jit
def triton_poi_fused_convolution_backward_147(in_ptr0, in_ptr1, in_ptr2, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 2809856
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x2 = (xindex // 21952)
x3 = xindex % 21952
x1 = (xindex // 196) % 112
x4 = xindex
tmp0 = tl.load(in_ptr0 + (21952 + x3 + (65856*x2)), None)
tmp1 = tl.load(in_ptr1 + (112 + x1), None)
tmp2 = tl.load(in_ptr2 + (112 + x1), None)
tmp3 = tmp1 * tmp2
tmp4 = tmp0 * tmp3
tmp5 = tmp4.to(tl.float32)
tl.store(out_ptr0 + (x4 + tl.zeros([XBLOCK], tl.int32)), tmp5, None)
def get_args():
arg_0 = rand_strided((128, 336, 14, 14), (65856, 196, 14, 1), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32)
arg_2 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32)
arg_3 = rand_strided((128, 112, 14, 14), (21952, 196, 14, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1, arg_2, arg_3,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_convolution_backward_147.run(*args, 2809856, grid=grid(2809856), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_convolution_backward_147.benchmark_all_configs(*args, 2809856, grid=grid(2809856))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/3x/c3xs7plvafsbnfj7oqnbllamz4ftcowca37hxufdgln6b4kvegly.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_677
triton_poi_fused__to_copy_148 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4096], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_148(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 2800
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((112, 1, 5, 5), (25, 25, 5, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((112, 1, 5, 5), (25, 25, 5, 1), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_148.run(*args, 2800, grid=grid(2800), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_148.benchmark_all_configs(*args, 2800, grid=grid(2800))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/zr/czrwimvnigaum3atz3dnqh3vpqefjj27el4vvkm52xud7pku2qdu.py
# Original ATen: aten.convolution_backward
# aten.convolution_backward => convolution_backward_106
triton_poi_fused_convolution_backward_149 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp32', 2: '*fp32', 3: '*fp16', 4: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4), equal_to_1=())]})
@triton.jit
def triton_poi_fused_convolution_backward_149(in_ptr0, in_ptr1, in_ptr2, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 2809856
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x2 = (xindex // 21952)
x3 = xindex % 21952
x1 = (xindex // 196) % 112
x4 = xindex
tmp0 = tl.load(in_ptr0 + (x3 + (65856*x2)), None)
tmp1 = tl.load(in_ptr1 + (x1), None)
tmp2 = tl.load(in_ptr2 + (x1), None)
tmp3 = tmp1 * tmp2
tmp4 = tmp0 * tmp3
tmp5 = tmp4.to(tl.float32)
tl.store(out_ptr0 + (x4 + tl.zeros([XBLOCK], tl.int32)), tmp5, None)
def get_args():
arg_0 = rand_strided((128, 336, 14, 14), (65856, 196, 14, 1), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32)
arg_2 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32)
arg_3 = rand_strided((128, 112, 14, 14), (21952, 196, 14, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1, arg_2, arg_3,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_convolution_backward_149.run(*args, 2809856, grid=grid(2809856), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_convolution_backward_149.benchmark_all_configs(*args, 2809856, grid=grid(2809856))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/2v/c2volwa6uwcetttnruu5jjo66mjrobvup6mcg6k7n4aknrx3iggt.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_678
triton_poi_fused__to_copy_150 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[1024], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_150(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 1008
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((112, 1, 3, 3), (9, 9, 3, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((112, 1, 3, 3), (9, 9, 3, 1), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_150.run(*args, 1008, grid=grid(1008), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_150.benchmark_all_configs(*args, 1008, grid=grid(1008))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/we/cwefssxf5bvvy2y5bt7gv2qzizpcib7seqbqn2voklitwa4hshmd.py
# Original ATen: aten.cat
# aten.cat => cat_66
triton_poi_fused_cat_151 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[16777216], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_cat_151(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 11239424
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x2 = xindex
x0 = xindex % 87808
x1 = (xindex // 87808)
tmp0 = tl.load(in_ptr0 + (x2), None).to(tl.float32)
tl.store(out_ptr0 + (x0 + (263424*x1) + tl.zeros([XBLOCK], tl.int32)), tmp0, None)
def get_args():
arg_0 = rand_strided((128, 112, 28, 28), (87808, 784, 28, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 112, 28, 28), (263424, 784, 28, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_cat_151.run(*args, 11239424, grid=grid(11239424), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_cat_151.benchmark_all_configs(*args, 11239424, grid=grid(11239424))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/qf/cqfo4ck6nf3trupktl4ixfnhezf4tcvnlhrfn63fekgvbdyrr6da.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.mul, aten.native_batch_norm_backward
# aten._native_batch_norm_legit_functional => convert_element_type_123
# aten.mul => mul_949
# aten.native_batch_norm_backward => convert_element_type_679, mul_950, mul_958, sub_250, sum_110, sum_111
triton_red_fused__native_batch_norm_legit_functional_mul_native_batch_norm_backward_152 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@reduction(
size_hints=[512, 131072],
reduction_hint=ReductionHint.INNER,
filename=__file__,
meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp32', 4: '*fp32', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: 'i32', 9: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_mul_native_batch_norm_backward_152(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, out_ptr0, out_ptr1, out_ptr2, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
xnumel = 336
rnumel = 100352
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
xmask = xindex < xnumel
rbase = tl.arange(0, RBLOCK)[None, :]
x0 = xindex
_tmp4 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
tmp7 = tl.load(in_ptr3 + (x0), xmask)
_tmp10 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
for roffset in range(0, rnumel, RBLOCK):
rindex = roffset + rbase
rmask = rindex < rnumel
r1 = rindex % 784
r2 = (rindex // 784)
tmp0 = tl.load(in_ptr0 + (r1 + (784*x0) + (263424*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp1 = tl.load(in_ptr1 + (r1 + (784*x0) + (263424*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp5 = tl.load(in_ptr2 + (r1 + (784*x0) + (263424*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp2 = tmp0 * tmp1
tmp3 = tmp2.to(tl.float32)
_tmp4 = tl.where(rmask & xmask, _tmp4 + tmp3, _tmp4)
tmp6 = tmp5.to(tl.float32)
tmp8 = tmp6 - tmp7
tmp9 = tmp3 * tmp8
_tmp10 = tl.where(rmask & xmask, _tmp10 + tmp9, _tmp10)
tmp4 = tl.sum(_tmp4, 1)[:, None]
tl.store(out_ptr0 + x0, tmp4, xmask)
tmp10 = tl.sum(_tmp10, 1)[:, None]
tl.store(out_ptr1 + x0, tmp10, xmask)
tmp11 = tl.load(in_ptr4 + (x0), xmask)
tmp12 = tmp10 * tmp11
tl.store(out_ptr2 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp12, xmask)
def get_args():
arg_0 = rand_strided((128, 336, 28, 28), (263424, 784, 28, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 336, 28, 28), (263424, 784, 28, 1), device='cuda:0', dtype=torch.float16)
arg_2 = rand_strided((128, 336, 28, 28), (263424, 784, 28, 1), device='cuda:0', dtype=torch.float16)
arg_3 = rand_strided((1, 336, 1, 1), (336, 1, 1, 1), device='cuda:0', dtype=torch.float32)
arg_4 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32)
arg_5 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32)
arg_6 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32)
arg_7 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_red_fused__native_batch_norm_legit_functional_mul_native_batch_norm_backward_152.run(*args, 336, 100352, grid=grid(336), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_red_fused__native_batch_norm_legit_functional_mul_native_batch_norm_backward_152.benchmark_all_configs(*args, 336, 100352, grid=grid(336))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/qc/cqcwwcxya4t45mgfdz46w2lkm3l3dt6ws7mawrdyrpvgbmqxgblv.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.convolution_backward, aten.mul, aten.native_batch_norm_backward
# aten._native_batch_norm_legit_functional => convert_element_type_123
# aten.convolution_backward => convolution_backward_107
# aten.mul => mul_949
# aten.native_batch_norm_backward => convert_element_type_679, convert_element_type_681, mul_956, mul_957, sub_250, sub_252, sub_253
triton_poi_fused__native_batch_norm_legit_functional_convolution_backward_mul_native_batch_norm_backward_153 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[67108864], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp32', 4: '*fp32', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8), equal_to_1=())]})
@triton.jit
def triton_poi_fused__native_batch_norm_legit_functional_convolution_backward_mul_native_batch_norm_backward_153(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, in_ptr6, xnumel, XBLOCK : tl.constexpr):
xnumel = 33718272
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x3 = xindex
x1 = (xindex // 784) % 336
tmp0 = tl.load(in_out_ptr0 + (x3), None).to(tl.float32)
tmp1 = tl.load(in_ptr0 + (x3), None).to(tl.float32)
tmp4 = tl.load(in_ptr1 + (x3), None).to(tl.float32)
tmp6 = tl.load(in_ptr2 + (x1), None)
tmp8 = tl.load(in_ptr3 + (x1), None)
tmp11 = tl.load(in_ptr4 + (x1), None)
tmp16 = tl.load(in_ptr5 + (x1), None)
tmp19 = tl.load(in_ptr6 + (x1), None)
tmp2 = tmp0 * tmp1
tmp3 = tmp2.to(tl.float32)
tmp5 = tmp4.to(tl.float32)
tmp7 = tmp5 - tmp6
tmp9 = 9.964923469387754e-06
tmp10 = tmp8 * tmp9
tmp12 = tmp11 * tmp11
tmp13 = tmp10 * tmp12
tmp14 = tmp7 * tmp13
tmp15 = tmp3 - tmp14
tmp17 = tmp16 * tmp9
tmp18 = tmp15 - tmp17
tmp20 = tmp11 * tmp19
tmp21 = tmp18 * tmp20
tmp22 = tmp21.to(tl.float32)
tl.store(in_out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp22, None)
def get_args():
arg_0 = rand_strided((128, 336, 28, 28), (263424, 784, 28, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 336, 28, 28), (263424, 784, 28, 1), device='cuda:0', dtype=torch.float16)
arg_2 = rand_strided((128, 336, 28, 28), (263424, 784, 28, 1), device='cuda:0', dtype=torch.float16)
arg_3 = rand_strided((1, 336, 1, 1), (336, 1, 1, 1), device='cuda:0', dtype=torch.float32)
arg_4 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32)
arg_5 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32)
arg_6 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32)
arg_7 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__native_batch_norm_legit_functional_convolution_backward_mul_native_batch_norm_backward_153.run(*args, 33718272, grid=grid(33718272), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__native_batch_norm_legit_functional_convolution_backward_mul_native_batch_norm_backward_153.benchmark_all_configs(*args, 33718272, grid=grid(33718272))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/xp/cxpyb2d7tgj3ajjcfk5ruyhc5u2uszkeh76gef4njgyf5kz5ukth.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_682
triton_poi_fused__to_copy_154 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[32768], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_154(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 18816
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((336, 56, 1, 1), (56, 1, 1, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((336, 56, 1, 1), (56, 1, 1, 1), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_154.run(*args, 18816, grid=grid(18816), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_154.benchmark_all_configs(*args, 18816, grid=grid(18816))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/ku/cku7ek7gtwfrqcc34nfk6i7wvwqtzc57tln743w3jpx2qsiyexs7.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.native_batch_norm_backward
# aten._native_batch_norm_legit_functional => convert_element_type_120
# aten.native_batch_norm_backward => convert_element_type_683, mul_959, sub_254, sum_112, sum_113
triton_red_fused__native_batch_norm_legit_functional_native_batch_norm_backward_155 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@reduction(
size_hints=[1024, 8192],
reduction_hint=ReductionHint.INNER,
filename=__file__,
meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: 'i32', 6: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_native_batch_norm_backward_155(in_ptr0, in_ptr1, in_ptr2, out_ptr0, out_ptr1, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
xnumel = 728
rnumel = 7720
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
xmask = xindex < xnumel
rbase = tl.arange(0, RBLOCK)[None, :]
x1 = (xindex // 56)
x0 = xindex % 56
_tmp6 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
x3 = xindex
_tmp13 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
for roffset in range(0, rnumel, RBLOCK):
rindex = roffset + rbase
rmask = rindex < rnumel
r2 = rindex
tmp0 = r2 + (7720*x1)
tmp1 = 100352
tmp2 = tmp0 < tmp1
tmp3 = tl.load(in_ptr0 + ((784*x0) + (43904*(((r2 + (7720*x1)) // 784) % 128)) + ((r2 + (7720*x1)) % 784) + tl.zeros([XBLOCK, RBLOCK], tl.int32)), rmask & tmp2 & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp4 = tmp3.to(tl.float32)
tmp5 = tl.where(tmp2, tmp4, 0)
_tmp6 = tl.where(rmask & xmask, _tmp6 + tmp5, _tmp6)
tmp7 = tl.load(in_ptr1 + ((784*x0) + (43904*(((r2 + (7720*x1)) // 784) % 128)) + ((r2 + (7720*x1)) % 784) + tl.zeros([XBLOCK, RBLOCK], tl.int32)), rmask & tmp2 & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp8 = tmp7.to(tl.float32)
tmp9 = tl.load(in_ptr2 + (x0 + tl.zeros([XBLOCK, RBLOCK], tl.int32)), rmask & tmp2 & xmask, eviction_policy='evict_last', other=0)
tmp10 = tmp8 - tmp9
tmp11 = tmp4 * tmp10
tmp12 = tl.where(tmp2, tmp11, 0)
_tmp13 = tl.where(rmask & xmask, _tmp13 + tmp12, _tmp13)
tmp6 = tl.sum(_tmp6, 1)[:, None]
tl.store(out_ptr0 + x3, tmp6, xmask)
tmp13 = tl.sum(_tmp13, 1)[:, None]
tl.store(out_ptr1 + x3, tmp13, xmask)
def get_args():
arg_0 = rand_strided((128, 56, 28, 28), (43904, 784, 28, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 56, 28, 28), (43904, 784, 28, 1), device='cuda:0', dtype=torch.float16)
arg_2 = rand_strided((1, 56, 1, 1), (56, 1, 1, 1), device='cuda:0', dtype=torch.float32)
arg_3 = rand_strided((56, 13), (1, 56), device='cuda:0', dtype=torch.float32)
arg_4 = rand_strided((56, 13), (1, 56), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1, arg_2, arg_3, arg_4,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_red_fused__native_batch_norm_legit_functional_native_batch_norm_backward_155.run(*args, 728, 7720, grid=grid(728), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_red_fused__native_batch_norm_legit_functional_native_batch_norm_backward_155.benchmark_all_configs(*args, 728, 7720, grid=grid(728))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/uy/cuyabiwct4xwna22y54u6ef6hgcysgudgvw53xgcxhoetvhdud3c.py
# Original ATen: aten.native_batch_norm_backward
# aten.native_batch_norm_backward => convert_element_type_683, sum_112
#
# NOTE(review): Inductor-generated second-stage (persistent) reduction for
# batch-norm backward.  The embedded Triton kernel reduces a (56, 13) fp32
# matrix of per-channel partial sums — produced by a prior split reduction —
# along its 13-column axis into a (56,) fp32 vector (see get_args()).
# The triton source string is compiled verbatim by async_compile.triton(),
# so it must stay byte-identical; comments are added only out here.
triton_per_fused_native_batch_norm_backward_156 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import persistent_reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@persistent_reduction(
size_hints=[64, 16],
reduction_hint=ReductionHint.INNER,
filename=__file__,
meta={'signature': {0: '*fp32', 1: '*fp32', 2: 'i32', 3: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1), equal_to_1=())]}
)
@triton.jit
def triton_per_fused_native_batch_norm_backward_156(in_ptr0, out_ptr0, xnumel, rnumel, XBLOCK : tl.constexpr):
xnumel = 56
rnumel = 13
RBLOCK: tl.constexpr = 16
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
xmask = xindex < xnumel
rindex = tl.arange(0, RBLOCK)[None, :]
rmask = rindex < rnumel
r1 = rindex
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0 + (56*r1)), rmask & xmask, other=0)
tmp2 = tl.where(rmask & xmask, tmp0, 0)
tmp3 = tl.sum(tmp2, 1)[:, None]
tl.store(out_ptr0 + x0, tmp3, xmask)
def get_args():
arg_0 = rand_strided((56, 13), (1, 56), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((56,), (1,), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_per_fused_native_batch_norm_backward_156.run(*args, 56, 13, grid=grid(56), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_per_fused_native_batch_norm_backward_156.benchmark_all_configs(*args, 56, 13, grid=grid(56))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/on/convc5p74ddudqk46e52onvmx3rxjfktebgzjuhxneits4eo7nyc.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.native_batch_norm_backward
# aten._native_batch_norm_legit_functional => convert_element_type_120
# aten.native_batch_norm_backward => convert_element_type_683, mul_959, mul_967, sub_254, sum_113
#
# NOTE(review): second-stage reduction like kernel 156, but with an extra
# per-channel scale: it sums the (56, 13) fp32 partial-sum matrix over the
# 13-column axis (out_ptr0), then multiplies the row sum by in_ptr1[x0]
# (presumably inv_std or a weight-derived factor — confirm against caller)
# and stores that product to out_ptr1.  The string is compiled verbatim by
# async_compile.triton() and must remain byte-identical.
triton_per_fused__native_batch_norm_legit_functional_native_batch_norm_backward_157 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import persistent_reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@persistent_reduction(
size_hints=[64, 16],
reduction_hint=ReductionHint.INNER,
filename=__file__,
meta={'signature': {0: '*fp32', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: 'i32', 5: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3), equal_to_1=())]}
)
@triton.jit
def triton_per_fused__native_batch_norm_legit_functional_native_batch_norm_backward_157(in_ptr0, in_ptr1, out_ptr0, out_ptr1, xnumel, rnumel, XBLOCK : tl.constexpr):
xnumel = 56
rnumel = 13
RBLOCK: tl.constexpr = 16
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
xmask = xindex < xnumel
rindex = tl.arange(0, RBLOCK)[None, :]
rmask = rindex < rnumel
r1 = rindex
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0 + (56*r1)), rmask & xmask, other=0)
tmp4 = tl.load(in_ptr1 + (x0), xmask)
tmp2 = tl.where(rmask & xmask, tmp0, 0)
tmp3 = tl.sum(tmp2, 1)[:, None]
tmp5 = tmp3 * tmp4
tl.store(out_ptr1 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp5, xmask)
tl.store(out_ptr0 + x0, tmp3, xmask)
def get_args():
arg_0 = rand_strided((56, 13), (1, 56), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((56,), (1,), device='cuda:0', dtype=torch.float32)
arg_2 = rand_strided((56,), (1,), device='cuda:0', dtype=torch.float32)
arg_3 = rand_strided((56,), (1,), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1, arg_2, arg_3,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_per_fused__native_batch_norm_legit_functional_native_batch_norm_backward_157.run(*args, 56, 13, grid=grid(56), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_per_fused__native_batch_norm_legit_functional_native_batch_norm_backward_157.benchmark_all_configs(*args, 56, 13, grid=grid(56))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/bj/cbjvyothjvn62eovj2qvbnhqdnxtnyarsgyu3rshwon5wrxfm6me.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.native_batch_norm_backward
# aten._native_batch_norm_legit_functional => convert_element_type_120
# aten.native_batch_norm_backward => convert_element_type_683, convert_element_type_685, mul_965, mul_966, sub_254, sub_256, sub_257
#
# NOTE(review): elementwise grad_input stage of batch-norm backward over a
# (128, 56, 28, 28) fp16 activation (5619712 = 128*56*28*28 elements, fp32
# math internally).  The constant 9.964923469387754e-06 is 1/100352, i.e.
# 1/(128*28*28) — the reciprocal of the per-channel reduction count.
# in_ptr2..in_ptr6 are per-channel (56,) statistics indexed by x1.
# String compiled verbatim by async_compile.triton(); kept byte-identical.
triton_poi_fused__native_batch_norm_legit_functional_native_batch_norm_backward_158 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[8388608], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp32', 6: '*fp32', 7: '*fp16', 8: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8), equal_to_1=())]})
@triton.jit
def triton_poi_fused__native_batch_norm_legit_functional_native_batch_norm_backward_158(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, in_ptr6, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 5619712
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x3 = xindex
x1 = (xindex // 784) % 56
tmp0 = tl.load(in_ptr0 + (x3), None).to(tl.float32)
tmp2 = tl.load(in_ptr1 + (x3), None).to(tl.float32)
tmp4 = tl.load(in_ptr2 + (x1), None)
tmp6 = tl.load(in_ptr3 + (x1), None)
tmp9 = tl.load(in_ptr4 + (x1), None)
tmp14 = tl.load(in_ptr5 + (x1), None)
tmp17 = tl.load(in_ptr6 + (x1), None)
tmp1 = tmp0.to(tl.float32)
tmp3 = tmp2.to(tl.float32)
tmp5 = tmp3 - tmp4
tmp7 = 9.964923469387754e-06
tmp8 = tmp6 * tmp7
tmp10 = tmp9 * tmp9
tmp11 = tmp8 * tmp10
tmp12 = tmp5 * tmp11
tmp13 = tmp1 - tmp12
tmp15 = tmp14 * tmp7
tmp16 = tmp13 - tmp15
tmp18 = tmp9 * tmp17
tmp19 = tmp16 * tmp18
tmp20 = tmp19.to(tl.float32)
tl.store(out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp20, None)
def get_args():
arg_0 = rand_strided((128, 56, 28, 28), (43904, 784, 28, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 56, 28, 28), (43904, 784, 28, 1), device='cuda:0', dtype=torch.float16)
arg_2 = rand_strided((1, 56, 1, 1), (56, 1, 1, 1), device='cuda:0', dtype=torch.float32)
arg_3 = rand_strided((56,), (1,), device='cuda:0', dtype=torch.float32)
arg_4 = rand_strided((56,), (1,), device='cuda:0', dtype=torch.float32)
arg_5 = rand_strided((56,), (1,), device='cuda:0', dtype=torch.float32)
arg_6 = rand_strided((56,), (1,), device='cuda:0', dtype=torch.float32)
arg_7 = rand_strided((128, 56, 28, 28), (43904, 784, 28, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__native_batch_norm_legit_functional_native_batch_norm_backward_158.run(*args, 5619712, grid=grid(5619712), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__native_batch_norm_legit_functional_native_batch_norm_backward_158.benchmark_all_configs(*args, 5619712, grid=grid(5619712))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/ip/cipdkjyk2onxl3m3jynzjqksmm63qjqd3qbbbq47kgdhlig5ktev.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_686
#
# NOTE(review): simple dtype-cast kernel: copies a 4704-element
# (28, 168, 1, 1) tensor from fp16 to fp32 elementwise (likely a conv
# weight being upcast for convolution_backward — confirm against caller).
# String compiled verbatim by async_compile.triton(); kept byte-identical.
triton_poi_fused__to_copy_159 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[8192], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_159(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 4704
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((28, 168, 1, 1), (168, 1, 1, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((28, 168, 1, 1), (168, 1, 1, 1), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_159.run(*args, 4704, grid=grid(4704), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_159.benchmark_all_configs(*args, 4704, grid=grid(4704))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/hk/chko7h4vzts7m5h2r3fcixp646xti7wwu6pppgto4mg3xqc3ieno.py
# Original ATen: aten.cat
# aten.cat => cat_67
#
# NOTE(review): one half of a channel concatenation.  Copies a contiguous
# (128, 168, 28, 28) fp16 tensor (131712 elements per batch) into an output
# whose batch stride is 263424 = 2 * 131712, i.e. each batch's 168-channel
# slab lands at offset 263424*batch within a 336-channel destination.  The
# other half of the cat is presumably written by a sibling kernel.
# String compiled verbatim by async_compile.triton(); kept byte-identical.
triton_poi_fused_cat_160 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[33554432], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_cat_160(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 16859136
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x2 = xindex
x0 = xindex % 131712
x1 = (xindex // 131712)
tmp0 = tl.load(in_ptr0 + (x2), None).to(tl.float32)
tl.store(out_ptr0 + (x0 + (263424*x1) + tl.zeros([XBLOCK], tl.int32)), tmp0, None)
def get_args():
arg_0 = rand_strided((128, 168, 28, 28), (131712, 784, 28, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 168, 28, 28), (263424, 784, 28, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_cat_160.run(*args, 16859136, grid=grid(16859136), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_cat_160.benchmark_all_configs(*args, 16859136, grid=grid(16859136))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/wf/cwfwcfiqllapezhy4wsh7mb3i7a2codkip2cnbeie26v347spvpq.py
# Original ATen: aten.mul, aten.sigmoid, aten.sigmoid_backward, aten.silu, aten.sum
# aten.mul => mul_968
# aten.sigmoid => sigmoid_15
# aten.sigmoid_backward => convert_element_type_688, convert_element_type_689, convert_element_type_690, mul_970, mul_971, sub_258
# aten.silu => convert_element_type_110, convert_element_type_111, mul_153, sigmoid_13
# aten.sum => sum_114
#
# NOTE(review): squeeze-excite gate backward.  For each of the 43008 =
# 128*336 (batch, channel) rows it sums grad * silu(x) over the 784 spatial
# positions (fp32 math), then multiplies by s*(1-s) where s = sigmoid of the
# per-row gate logit in in_ptr2 (the sigmoid-backward factor), and writes
# the fp16 result in place into in_out_ptr0 of shape (128, 336, 1, 1).
# String compiled verbatim by async_compile.triton(); kept byte-identical.
triton_per_fused_mul_sigmoid_sigmoid_backward_silu_sum_161 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import persistent_reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@persistent_reduction(
size_hints=[65536, 1024],
reduction_hint=ReductionHint.INNER,
filename=__file__,
meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp16', 4: 'i32', 5: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5), equal_to_1=())]}
)
@triton.jit
def triton_per_fused_mul_sigmoid_sigmoid_backward_silu_sum_161(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, xnumel, rnumel, XBLOCK : tl.constexpr):
xnumel = 43008
rnumel = 784
RBLOCK: tl.constexpr = 1024
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
xmask = xindex < xnumel
rindex = tl.arange(0, RBLOCK)[None, :]
rmask = rindex < rnumel
r1 = rindex
x0 = xindex
tmp0 = tl.load(in_ptr0 + (r1 + (784*x0)), rmask, other=0).to(tl.float32)
tmp1 = tl.load(in_ptr1 + (r1 + (784*x0)), rmask, other=0).to(tl.float32)
tmp11 = tl.load(in_ptr2 + (x0), None).to(tl.float32)
tmp2 = tmp1.to(tl.float32)
tmp3 = tl.sigmoid(tmp2)
tmp4 = tmp2 * tmp3
tmp5 = tmp4.to(tl.float32)
tmp6 = tmp0 * tmp5
tmp8 = tl.where(rmask, tmp6, 0)
tmp9 = tl.sum(tmp8, 1)[:, None]
tmp10 = tmp9.to(tl.float32)
tmp12 = tl.sigmoid(tmp11)
tmp13 = tmp12.to(tl.float32)
tmp14 = 1.0
tmp15 = tmp14 - tmp13
tmp16 = tmp13 * tmp15
tmp17 = tmp10 * tmp16
tmp18 = tmp17.to(tl.float32)
tl.store(in_out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp18, None)
def get_args():
arg_0 = rand_strided((128, 336, 1, 1), (336, 1, 1, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 336, 28, 28), (263424, 784, 28, 1), device='cuda:0', dtype=torch.float16)
arg_2 = rand_strided((128, 336, 28, 28), (263424, 784, 28, 1), device='cuda:0', dtype=torch.float16)
arg_3 = rand_strided((128, 336, 1, 1), (336, 1, 1, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1, arg_2, arg_3,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_per_fused_mul_sigmoid_sigmoid_backward_silu_sum_161.run(*args, 43008, 784, grid=grid(43008), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_per_fused_mul_sigmoid_sigmoid_backward_silu_sum_161.benchmark_all_configs(*args, 43008, 784, grid=grid(43008))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/h6/ch66mav6cpspgg34bc2ckyogaizco43c7rwo2n4rkshr5666vpid.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_691
#
# NOTE(review): dtype-cast kernel: copies a 9408-element (336, 28, 1, 1)
# tensor from fp16 to fp32 elementwise (same pattern as kernel 159).
# String compiled verbatim by async_compile.triton(); kept byte-identical.
triton_poi_fused__to_copy_162 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[16384], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_162(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 9408
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((336, 28, 1, 1), (28, 1, 1, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((336, 28, 1, 1), (28, 1, 1, 1), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_162.run(*args, 9408, grid=grid(9408), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_162.benchmark_all_configs(*args, 9408, grid=grid(9408))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/fi/cfibawhd6jznqxl7q2atcugp7zx43kt5jtkntbghy42ljr2ytlun.py
# Original ATen: aten.add, aten.clone, aten.fill, aten.mul, aten.sigmoid, aten.sub
# aten.add => add_361
# aten.clone => clone_11
# aten.fill => full_like_36
# aten.mul => mul_972, mul_973, mul_974
# aten.sigmoid => sigmoid_100
# aten.sub => sub_259
#
# NOTE(review): in-place elementwise SiLU backward on a (128, 28, 1, 1)
# fp16 buffer: multiplies the incoming gradient by the silu derivative
# sigmoid(x) * (1 + x * (1 - sigmoid(x))) with x taken from in_ptr0.
# String compiled verbatim by async_compile.triton(); kept byte-identical.
triton_poi_fused_add_clone_fill_mul_sigmoid_sub_163 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4096], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_add_clone_fill_mul_sigmoid_sub_163(in_out_ptr0, in_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 3584
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_out_ptr0 + (x0), xmask).to(tl.float32)
tmp1 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
tmp2 = tl.sigmoid(tmp1)
tmp3 = 1.0
tmp4 = tmp3 - tmp2
tmp5 = tmp1 * tmp4
tmp6 = tmp5 + tmp3
tmp7 = tmp2 * tmp6
tmp8 = tmp0 * tmp7
tl.store(in_out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp8, xmask)
def get_args():
arg_0 = rand_strided((128, 28, 1, 1), (28, 1, 1, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 28, 1, 1), (28, 1, 1, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_add_clone_fill_mul_sigmoid_sub_163.run(*args, 3584, grid=grid(3584), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_add_clone_fill_mul_sigmoid_sub_163.benchmark_all_configs(*args, 3584, grid=grid(3584))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/yv/cyvint5v6fejjcxz54v5axwxdzytfpjmq7eoqsppyu45zsqim3l2.py
# Original ATen: aten._to_copy, aten.convolution_backward
# aten._to_copy => convert_element_type_694
# aten.convolution_backward => sum_116
#
# NOTE(review): bias-gradient reduction for convolution backward: sums a
# (128, 28, 1, 1) fp16 gradient over the 128-batch axis, casting the
# result to fp32 and writing a (28,) vector.
# String compiled verbatim by async_compile.triton(); kept byte-identical.
triton_per_fused__to_copy_convolution_backward_164 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import persistent_reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@persistent_reduction(
size_hints=[32, 128],
reduction_hint=ReductionHint.INNER,
filename=__file__,
meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32', 3: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 3), equal_to_1=())]}
)
@triton.jit
def triton_per_fused__to_copy_convolution_backward_164(in_ptr0, out_ptr1, xnumel, rnumel, XBLOCK : tl.constexpr):
xnumel = 28
rnumel = 128
RBLOCK: tl.constexpr = 128
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
xmask = xindex < xnumel
rindex = tl.arange(0, RBLOCK)[None, :]
rmask = rindex < rnumel
r1 = rindex
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0 + (28*r1)), rmask & xmask, other=0).to(tl.float32)
tmp2 = tl.where(rmask & xmask, tmp0, 0)
tmp3 = tl.sum(tmp2, 1)[:, None]
tmp4 = tmp3.to(tl.float32)
tl.store(out_ptr1 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp4, xmask)
def get_args():
arg_0 = rand_strided((128, 28, 1, 1), (28, 1, 1, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((28,), (1,), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_per_fused__to_copy_convolution_backward_164.run(*args, 28, 128, grid=grid(28), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_per_fused__to_copy_convolution_backward_164.benchmark_all_configs(*args, 28, 128, grid=grid(28))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/pk/cpkfpj7moawggcqkv5ok3cwxbz2nv6myxthuduj53xnw6qw3dwig.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_693
#
# NOTE(review): dtype-cast kernel: copies a 9408-element (28, 336, 1, 1)
# tensor from fp16 to fp32 elementwise (transposed-shape counterpart of
# kernel 162).  String compiled verbatim by async_compile.triton();
# kept byte-identical.
triton_poi_fused__to_copy_165 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[16384], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_165(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 9408
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((28, 336, 1, 1), (336, 1, 1, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((28, 336, 1, 1), (336, 1, 1, 1), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_165.run(*args, 9408, grid=grid(9408), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_165.benchmark_all_configs(*args, 9408, grid=grid(9408))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/q7/cq7zl54kvzo64w2eq2dfuf5wrsv6grvd6fo3lgq77a5kcc7cs6ek.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.add, aten.clone, aten.div, aten.fill, aten.mul, aten.native_batch_norm_backward, aten.sigmoid, aten.sub
# aten._native_batch_norm_legit_functional => convert_element_type_108
# aten.add => add_362, add_363
# aten.clone => clone_10
# aten.div => div_13
# aten.fill => full_like_37
# aten.mul => mul_969, mul_975, mul_976, mul_977
# aten.native_batch_norm_backward => convert_element_type_695, mul_978, mul_986, sub_261, sum_117, sum_118
# aten.sigmoid => sigmoid_101, sigmoid_15
# aten.sub => sub_260
#
# NOTE(review): fused SE + SiLU + batch-norm backward reduction.  Per
# channel x0 (336 channels), reducing over 100352 = 128 batches * 784
# spatial positions, the loop body:
#   1. recombines the upstream grad:  grad*sigmoid(gate) + pooled_grad/784
#      (the /784.0 redistributes the avg-pool gradient over the spatial dim);
#   2. multiplies by the silu derivative sigmoid(x)*(1 + x*(1-sigmoid(x)));
#   3. accumulates that grad into _tmp17 (-> out_ptr0, grad-sum / bias term)
#      and grad*(x - mean[in_ptr5]) into _tmp23 (-> out_ptr1);
#   4. after the loop, out_ptr2 = out_ptr1 * in_ptr6 (per-channel scale,
#      presumably inv_std — confirm against the consuming kernel).
# String compiled verbatim by async_compile.triton(); kept byte-identical.
triton_red_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_166 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@reduction(
size_hints=[512, 131072],
reduction_hint=ReductionHint.INNER,
filename=__file__,
meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp16', 4: '*fp16', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: '*fp32', 9: '*fp32', 10: 'i32', 11: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_166(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, in_ptr6, out_ptr0, out_ptr1, out_ptr2, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
xnumel = 336
rnumel = 100352
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
xmask = xindex < xnumel
rbase = tl.arange(0, RBLOCK)[None, :]
x0 = xindex
_tmp17 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
tmp20 = tl.load(in_ptr5 + (x0), xmask)
_tmp23 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
for roffset in range(0, rnumel, RBLOCK):
rindex = roffset + rbase
rmask = rindex < rnumel
r1 = rindex % 784
r2 = (rindex // 784)
tmp0 = tl.load(in_ptr0 + (r1 + (784*x0) + (263424*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp1 = tl.load(in_ptr1 + (x0 + (336*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp4 = tl.load(in_ptr2 + (x0 + (336*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp8 = tl.load(in_ptr3 + (r1 + (784*x0) + (263424*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp18 = tl.load(in_ptr4 + (r1 + (784*x0) + (263424*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp2 = tl.sigmoid(tmp1)
tmp3 = tmp0 * tmp2
tmp5 = 784.0
tmp6 = tmp4 / tmp5
tmp7 = tmp3 + tmp6
tmp9 = tl.sigmoid(tmp8)
tmp10 = 1.0
tmp11 = tmp10 - tmp9
tmp12 = tmp8 * tmp11
tmp13 = tmp12 + tmp10
tmp14 = tmp9 * tmp13
tmp15 = tmp7 * tmp14
tmp16 = tmp15.to(tl.float32)
_tmp17 = tl.where(rmask & xmask, _tmp17 + tmp16, _tmp17)
tmp19 = tmp18.to(tl.float32)
tmp21 = tmp19 - tmp20
tmp22 = tmp16 * tmp21
_tmp23 = tl.where(rmask & xmask, _tmp23 + tmp22, _tmp23)
tmp17 = tl.sum(_tmp17, 1)[:, None]
tl.store(out_ptr0 + x0, tmp17, xmask)
tmp23 = tl.sum(_tmp23, 1)[:, None]
tl.store(out_ptr1 + x0, tmp23, xmask)
tmp24 = tl.load(in_ptr6 + (x0), xmask)
tmp25 = tmp23 * tmp24
tl.store(out_ptr2 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp25, xmask)
def get_args():
arg_0 = rand_strided((128, 336, 28, 28), (263424, 784, 28, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 336, 1, 1), (336, 1, 1, 1), device='cuda:0', dtype=torch.float16)
arg_2 = rand_strided((128, 336, 1, 1), (336, 1, 1, 1), device='cuda:0', dtype=torch.float16)
arg_3 = rand_strided((128, 336, 28, 28), (263424, 784, 28, 1), device='cuda:0', dtype=torch.float16)
arg_4 = rand_strided((128, 336, 28, 28), (263424, 784, 28, 1), device='cuda:0', dtype=torch.float16)
arg_5 = rand_strided((1, 336, 1, 1), (336, 1, 1, 1), device='cuda:0', dtype=torch.float32)
arg_6 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32)
arg_7 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32)
arg_8 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32)
arg_9 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7, arg_8, arg_9,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_red_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_166.run(*args, 336, 100352, grid=grid(336), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_red_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_166.benchmark_all_configs(*args, 336, 100352, grid=grid(336))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/rc/crcc3il2qglpofbslbuwlevrp5dihafxcr4wxlvjfxaasqpi56hu.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.add, aten.clone, aten.div, aten.fill, aten.mul, aten.native_batch_norm_backward, aten.sigmoid, aten.sub
# aten._native_batch_norm_legit_functional => convert_element_type_108
# aten.add => add_362, add_363
# aten.clone => clone_10
# aten.div => div_13
# aten.fill => full_like_37
# aten.mul => mul_969, mul_975, mul_976, mul_977
# aten.native_batch_norm_backward => convert_element_type_695, mul_984, sub_261, sub_263, sub_264
# aten.sigmoid => sigmoid_101, sigmoid_15
# aten.sub => sub_260
triton_poi_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_167 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[67108864], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp16', 4: '*fp16', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: '*fp32', 9: '*fp32', 10: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10), equal_to_1=())]})
@triton.jit
def triton_poi_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_167(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, in_ptr6, in_ptr7, in_ptr8, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 33718272
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x3 = xindex
x4 = (xindex // 784)
x1 = (xindex // 784) % 336
tmp0 = tl.load(in_ptr0 + (x3), None).to(tl.float32)
tmp1 = tl.load(in_ptr1 + (x4), None).to(tl.float32)
tmp4 = tl.load(in_ptr2 + (x4), None).to(tl.float32)
tmp8 = tl.load(in_ptr3 + (x3), None).to(tl.float32)
tmp17 = tl.load(in_ptr4 + (x3), None).to(tl.float32)
tmp19 = tl.load(in_ptr5 + (x1), None)
tmp21 = tl.load(in_ptr6 + (x1), None)
tmp24 = tl.load(in_ptr7 + (x1), None)
tmp29 = tl.load(in_ptr8 + (x1), None)
tmp2 = tl.sigmoid(tmp1)
tmp3 = tmp0 * tmp2
tmp5 = 784.0
tmp6 = tmp4 / tmp5
tmp7 = tmp3 + tmp6
tmp9 = tl.sigmoid(tmp8)
tmp10 = 1.0
tmp11 = tmp10 - tmp9
tmp12 = tmp8 * tmp11
tmp13 = tmp12 + tmp10
tmp14 = tmp9 * tmp13
tmp15 = tmp7 * tmp14
tmp16 = tmp15.to(tl.float32)
tmp18 = tmp17.to(tl.float32)
tmp20 = tmp18 - tmp19
tmp22 = 9.964923469387754e-06
tmp23 = tmp21 * tmp22
tmp25 = tmp24 * tmp24
tmp26 = tmp23 * tmp25
tmp27 = tmp20 * tmp26
tmp28 = tmp16 - tmp27
tmp30 = tmp29 * tmp22
tmp31 = tmp28 - tmp30
tl.store(out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp31, None)
def get_args():
arg_0 = rand_strided((128, 336, 28, 28), (263424, 784, 28, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 336, 1, 1), (336, 1, 1, 1), device='cuda:0', dtype=torch.float16)
arg_2 = rand_strided((128, 336, 1, 1), (336, 1, 1, 1), device='cuda:0', dtype=torch.float16)
arg_3 = rand_strided((128, 336, 28, 28), (263424, 784, 28, 1), device='cuda:0', dtype=torch.float16)
arg_4 = rand_strided((128, 336, 28, 28), (263424, 784, 28, 1), device='cuda:0', dtype=torch.float16)
arg_5 = rand_strided((1, 336, 1, 1), (336, 1, 1, 1), device='cuda:0', dtype=torch.float32)
arg_6 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32)
arg_7 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32)
arg_8 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32)
arg_9 = rand_strided((128, 336, 28, 28), (263424, 784, 28, 1), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7, arg_8, arg_9,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_167.run(*args, 33718272, grid=grid(33718272), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_167.benchmark_all_configs(*args, 33718272, grid=grid(33718272))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/7y/c7y3vo54x6g4hqnvyjz4k3ngokc66wmd3u7zsbjrmhorsy53o4sb.py
# Original ATen: aten.convolution_backward
# aten.convolution_backward => convolution_backward_112
triton_poi_fused_convolution_backward_168 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[33554432], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp32', 2: '*fp32', 3: '*fp16', 4: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4), equal_to_1=())]})
@triton.jit
def triton_poi_fused_convolution_backward_168(in_ptr0, in_ptr1, in_ptr2, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 16859136
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x2 = (xindex // 131712)
x3 = xindex % 131712
x1 = (xindex // 784) % 168
x4 = xindex
tmp0 = tl.load(in_ptr0 + (131712 + x3 + (263424*x2)), None)
tmp1 = tl.load(in_ptr1 + (168 + x1), None)
tmp2 = tl.load(in_ptr2 + (168 + x1), None)
tmp3 = tmp1 * tmp2
tmp4 = tmp0 * tmp3
tmp5 = tmp4.to(tl.float32)
tl.store(out_ptr0 + (x4 + tl.zeros([XBLOCK], tl.int32)), tmp5, None)
def get_args():
arg_0 = rand_strided((128, 336, 28, 28), (263424, 784, 28, 1), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32)
arg_2 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32)
arg_3 = rand_strided((128, 168, 28, 28), (131712, 784, 28, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1, arg_2, arg_3,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_convolution_backward_168.run(*args, 16859136, grid=grid(16859136), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_convolution_backward_168.benchmark_all_configs(*args, 16859136, grid=grid(16859136))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/6n/c6nredjp54hkor7tbo72xhpqqat3vpewekmkr4kkyypw6cgqm6ai.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_698
triton_poi_fused__to_copy_169 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[8192], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_169(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 4200
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((168, 1, 5, 5), (25, 25, 5, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((168, 1, 5, 5), (25, 25, 5, 1), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_169.run(*args, 4200, grid=grid(4200), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_169.benchmark_all_configs(*args, 4200, grid=grid(4200))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/ia/ciabi2inw7dvbjd5nathz3xfqaodfqjylvjwgsdeimxatolwwqks.py
# Original ATen: aten.convolution_backward
# aten.convolution_backward => convolution_backward_113
triton_poi_fused_convolution_backward_170 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[33554432], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp32', 2: '*fp32', 3: '*fp16', 4: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4), equal_to_1=())]})
@triton.jit
def triton_poi_fused_convolution_backward_170(in_ptr0, in_ptr1, in_ptr2, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 16859136
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x2 = (xindex // 131712)
x3 = xindex % 131712
x1 = (xindex // 784) % 168
x4 = xindex
tmp0 = tl.load(in_ptr0 + (x3 + (263424*x2)), None)
tmp1 = tl.load(in_ptr1 + (x1), None)
tmp2 = tl.load(in_ptr2 + (x1), None)
tmp3 = tmp1 * tmp2
tmp4 = tmp0 * tmp3
tmp5 = tmp4.to(tl.float32)
tl.store(out_ptr0 + (x4 + tl.zeros([XBLOCK], tl.int32)), tmp5, None)
def get_args():
arg_0 = rand_strided((128, 336, 28, 28), (263424, 784, 28, 1), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32)
arg_2 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32)
arg_3 = rand_strided((128, 168, 28, 28), (131712, 784, 28, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1, arg_2, arg_3,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_convolution_backward_170.run(*args, 16859136, grid=grid(16859136), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_convolution_backward_170.benchmark_all_configs(*args, 16859136, grid=grid(16859136))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/c4/cc4n2v6wqwhmmrx6e37v63wrmsqntez34qcciaujxy6fxsa2gk3o.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_699
triton_poi_fused__to_copy_171 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[2048], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_171(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 1512
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((168, 1, 3, 3), (9, 9, 3, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((168, 1, 3, 3), (9, 9, 3, 1), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_171.run(*args, 1512, grid=grid(1512), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_171.benchmark_all_configs(*args, 1512, grid=grid(1512))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/ak/cakbaw2uk6hhpcroelbhhfo57dmqlxwcgkogm2m6ds3sv57t2vqz.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_703
triton_poi_fused__to_copy_172 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[8192], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_172(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 4704
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((168, 28, 1, 1), (28, 1, 1, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((168, 28, 1, 1), (28, 1, 1, 1), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_172.run(*args, 4704, grid=grid(4704), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_172.benchmark_all_configs(*args, 4704, grid=grid(4704))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/ow/cowu2kdmjh4mnuy3o2jt5hyxyjhq6tnzxq4b3gah2vdhgpih4okk.py
# Original ATen: aten.cat
# aten.cat => cat_69
triton_poi_fused_cat_173 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_cat_173(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 2809856
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x2 = xindex
x0 = xindex % 21952
x1 = (xindex // 21952)
tmp0 = tl.load(in_ptr0 + (x2), None).to(tl.float32)
tl.store(out_ptr0 + (x0 + (43904*x1) + tl.zeros([XBLOCK], tl.int32)), tmp0, None)
def get_args():
arg_0 = rand_strided((128, 28, 28, 28), (21952, 784, 28, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 28, 28, 28), (43904, 784, 28, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_cat_173.run(*args, 2809856, grid=grid(2809856), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_cat_173.benchmark_all_configs(*args, 2809856, grid=grid(2809856))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/tu/ctuwkh7mmjcoxpqhlozf53j6kx7nu6pcnpqqmwor4d57vrqdsosn.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.add, aten.native_batch_norm_backward
# aten._native_batch_norm_legit_functional => convert_element_type_98
# aten.add => add_365
# aten.native_batch_norm_backward => convert_element_type_705, mul_999, sub_270, sum_121, sum_122
triton_red_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_174 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@reduction(
size_hints=[1024, 8192],
reduction_hint=ReductionHint.INNER,
filename=__file__,
meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp32', 4: '*fp32', 5: '*fp32', 6: 'i32', 7: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_174(in_ptr0, in_ptr1, in_ptr2, in_ptr3, out_ptr0, out_ptr1, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
xnumel = 728
rnumel = 7720
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
xmask = xindex < xnumel
rbase = tl.arange(0, RBLOCK)[None, :]
x1 = (xindex // 56)
x0 = xindex % 56
_tmp8 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
x3 = xindex
_tmp15 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
for roffset in range(0, rnumel, RBLOCK):
rindex = roffset + rbase
rmask = rindex < rnumel
r2 = rindex
tmp0 = r2 + (7720*x1)
tmp1 = 100352
tmp2 = tmp0 < tmp1
tmp3 = tl.load(in_ptr0 + ((784*x0) + (43904*(((r2 + (7720*x1)) // 784) % 128)) + ((r2 + (7720*x1)) % 784) + tl.zeros([XBLOCK, RBLOCK], tl.int32)), rmask & tmp2 & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp4 = tl.load(in_ptr1 + ((784*x0) + (43904*(((r2 + (7720*x1)) // 784) % 128)) + ((r2 + (7720*x1)) % 784) + tl.zeros([XBLOCK, RBLOCK], tl.int32)), rmask & tmp2 & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp5 = tmp3 + tmp4
tmp6 = tmp5.to(tl.float32)
tmp7 = tl.where(tmp2, tmp6, 0)
_tmp8 = tl.where(rmask & xmask, _tmp8 + tmp7, _tmp8)
tmp9 = tl.load(in_ptr2 + ((784*x0) + (43904*(((r2 + (7720*x1)) // 784) % 128)) + ((r2 + (7720*x1)) % 784) + tl.zeros([XBLOCK, RBLOCK], tl.int32)), rmask & tmp2 & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp10 = tmp9.to(tl.float32)
tmp11 = tl.load(in_ptr3 + (x0 + tl.zeros([XBLOCK, RBLOCK], tl.int32)), rmask & tmp2 & xmask, eviction_policy='evict_last', other=0)
tmp12 = tmp10 - tmp11
tmp13 = tmp6 * tmp12
tmp14 = tl.where(tmp2, tmp13, 0)
_tmp15 = tl.where(rmask & xmask, _tmp15 + tmp14, _tmp15)
tmp8 = tl.sum(_tmp8, 1)[:, None]
tl.store(out_ptr0 + x3, tmp8, xmask)
tmp15 = tl.sum(_tmp15, 1)[:, None]
tl.store(out_ptr1 + x3, tmp15, xmask)
def get_args():
arg_0 = rand_strided((128, 56, 28, 28), (43904, 784, 28, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 56, 28, 28), (43904, 784, 28, 1), device='cuda:0', dtype=torch.float16)
arg_2 = rand_strided((128, 56, 28, 28), (43904, 784, 28, 1), device='cuda:0', dtype=torch.float16)
arg_3 = rand_strided((1, 56, 1, 1), (56, 1, 1, 1), device='cuda:0', dtype=torch.float32)
arg_4 = rand_strided((56, 13), (1, 56), device='cuda:0', dtype=torch.float32)
arg_5 = rand_strided((56, 13), (1, 56), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_red_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_174.run(*args, 728, 7720, grid=grid(728), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_red_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_174.benchmark_all_configs(*args, 728, 7720, grid=grid(728))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/3q/c3qu2jno53qez6kspx5ta57fq4vavwnpqfdqbctsz2h5gfllev2o.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.add, aten.native_batch_norm_backward
# aten._native_batch_norm_legit_functional => convert_element_type_98
# aten.add => add_365
# aten.native_batch_norm_backward => convert_element_type_705, convert_element_type_707, mul_1005, mul_1006, sub_270, sub_272, sub_273
triton_poi_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_175 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[8388608], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp32', 4: '*fp32', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: '*fp16', 9: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9), equal_to_1=())]})
@triton.jit
def triton_poi_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_175(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, in_ptr6, in_ptr7, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 5619712
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x3 = xindex
x1 = (xindex // 784) % 56
tmp0 = tl.load(in_ptr0 + (x3), None).to(tl.float32)
tmp1 = tl.load(in_ptr1 + (x3), None).to(tl.float32)
tmp4 = tl.load(in_ptr2 + (x3), None).to(tl.float32)
tmp6 = tl.load(in_ptr3 + (x1), None)
tmp8 = tl.load(in_ptr4 + (x1), None)
tmp11 = tl.load(in_ptr5 + (x1), None)
tmp16 = tl.load(in_ptr6 + (x1), None)
tmp19 = tl.load(in_ptr7 + (x1), None)
tmp2 = tmp0 + tmp1
tmp3 = tmp2.to(tl.float32)
tmp5 = tmp4.to(tl.float32)
tmp7 = tmp5 - tmp6
tmp9 = 9.964923469387754e-06
tmp10 = tmp8 * tmp9
tmp12 = tmp11 * tmp11
tmp13 = tmp10 * tmp12
tmp14 = tmp7 * tmp13
tmp15 = tmp3 - tmp14
tmp17 = tmp16 * tmp9
tmp18 = tmp15 - tmp17
tmp20 = tmp11 * tmp19
tmp21 = tmp18 * tmp20
tmp22 = tmp21.to(tl.float32)
tl.store(out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp22, None)
def get_args():
arg_0 = rand_strided((128, 56, 28, 28), (43904, 784, 28, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 56, 28, 28), (43904, 784, 28, 1), device='cuda:0', dtype=torch.float16)
arg_2 = rand_strided((128, 56, 28, 28), (43904, 784, 28, 1), device='cuda:0', dtype=torch.float16)
arg_3 = rand_strided((1, 56, 1, 1), (56, 1, 1, 1), device='cuda:0', dtype=torch.float32)
arg_4 = rand_strided((56,), (1,), device='cuda:0', dtype=torch.float32)
arg_5 = rand_strided((56,), (1,), device='cuda:0', dtype=torch.float32)
arg_6 = rand_strided((56,), (1,), device='cuda:0', dtype=torch.float32)
arg_7 = rand_strided((56,), (1,), device='cuda:0', dtype=torch.float32)
arg_8 = rand_strided((128, 56, 28, 28), (43904, 784, 28, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7, arg_8,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_175.run(*args, 5619712, grid=grid(5619712), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_175.benchmark_all_configs(*args, 5619712, grid=grid(5619712))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/he/cheuhrmgcxe3wl2kq52bt7crzek5t4ji6b56jrybsdffzrqao7el.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.add, aten.native_batch_norm_backward
# aten._native_batch_norm_legit_functional => convert_element_type_76
# aten.add => add_365, add_370
# aten.native_batch_norm_backward => convert_element_type_727, mul_1039, sub_286, sum_130, sum_131
triton_red_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_176 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@reduction(
size_hints=[1024, 8192],
reduction_hint=ReductionHint.INNER,
filename=__file__,
meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp16', 4: '*fp32', 5: '*fp32', 6: '*fp32', 7: 'i32', 8: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_176(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, out_ptr0, out_ptr1, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
xnumel = 728
rnumel = 7720
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
xmask = xindex < xnumel
rbase = tl.arange(0, RBLOCK)[None, :]
x1 = (xindex // 56)
x0 = xindex % 56
_tmp10 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
x3 = xindex
_tmp17 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
for roffset in range(0, rnumel, RBLOCK):
rindex = roffset + rbase
rmask = rindex < rnumel
r2 = rindex
tmp0 = r2 + (7720*x1)
tmp1 = 100352
tmp2 = tmp0 < tmp1
tmp3 = tl.load(in_ptr0 + ((784*x0) + (43904*(((r2 + (7720*x1)) // 784) % 128)) + ((r2 + (7720*x1)) % 784) + tl.zeros([XBLOCK, RBLOCK], tl.int32)), rmask & tmp2 & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp4 = tl.load(in_ptr1 + ((784*x0) + (43904*(((r2 + (7720*x1)) // 784) % 128)) + ((r2 + (7720*x1)) % 784) + tl.zeros([XBLOCK, RBLOCK], tl.int32)), rmask & tmp2 & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp5 = tmp3 + tmp4
tmp6 = tl.load(in_ptr2 + ((784*x0) + (43904*(((r2 + (7720*x1)) // 784) % 128)) + ((r2 + (7720*x1)) % 784) + tl.zeros([XBLOCK, RBLOCK], tl.int32)), rmask & tmp2 & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp7 = tmp5 + tmp6
tmp8 = tmp7.to(tl.float32)
tmp9 = tl.where(tmp2, tmp8, 0)
_tmp10 = tl.where(rmask & xmask, _tmp10 + tmp9, _tmp10)
tmp11 = tl.load(in_ptr3 + ((784*x0) + (43904*(((r2 + (7720*x1)) // 784) % 128)) + ((r2 + (7720*x1)) % 784) + tl.zeros([XBLOCK, RBLOCK], tl.int32)), rmask & tmp2 & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp12 = tmp11.to(tl.float32)
tmp13 = tl.load(in_ptr4 + (x0 + tl.zeros([XBLOCK, RBLOCK], tl.int32)), rmask & tmp2 & xmask, eviction_policy='evict_last', other=0)
tmp14 = tmp12 - tmp13
tmp15 = tmp8 * tmp14
tmp16 = tl.where(tmp2, tmp15, 0)
_tmp17 = tl.where(rmask & xmask, _tmp17 + tmp16, _tmp17)
tmp10 = tl.sum(_tmp10, 1)[:, None]
tl.store(out_ptr0 + x3, tmp10, xmask)
tmp17 = tl.sum(_tmp17, 1)[:, None]
tl.store(out_ptr1 + x3, tmp17, xmask)
def get_args():
arg_0 = rand_strided((128, 56, 28, 28), (43904, 784, 28, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 56, 28, 28), (43904, 784, 28, 1), device='cuda:0', dtype=torch.float16)
arg_2 = rand_strided((128, 56, 28, 28), (43904, 784, 28, 1), device='cuda:0', dtype=torch.float16)
arg_3 = rand_strided((128, 56, 28, 28), (43904, 784, 28, 1), device='cuda:0', dtype=torch.float16)
arg_4 = rand_strided((1, 56, 1, 1), (56, 1, 1, 1), device='cuda:0', dtype=torch.float32)
arg_5 = rand_strided((56, 13), (1, 56), device='cuda:0', dtype=torch.float32)
arg_6 = rand_strided((56, 13), (1, 56), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_red_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_176.run(*args, 728, 7720, grid=grid(728), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_red_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_176.benchmark_all_configs(*args, 728, 7720, grid=grid(728))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/6w/c6wk3ggkhqc4dpm4svd2nw2x7bsbl6xzqqaoct6gtlyubuviyxj6.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.add, aten.native_batch_norm_backward
# aten._native_batch_norm_legit_functional => convert_element_type_76
# aten.add => add_365, add_370
# aten.native_batch_norm_backward => convert_element_type_727, mul_1045, mul_1046, sub_286, sub_288, sub_289
triton_poi_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_177 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[8388608], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp16', 4: '*fp32', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: '*fp32', 9: '*fp32', 10: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10), equal_to_1=())]})
@triton.jit
def triton_poi_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_177(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, in_ptr6, in_ptr7, in_ptr8, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 5619712
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x3 = xindex
x1 = (xindex // 784) % 56
tmp0 = tl.load(in_ptr0 + (x3), None).to(tl.float32)
tmp1 = tl.load(in_ptr1 + (x3), None).to(tl.float32)
tmp3 = tl.load(in_ptr2 + (x3), None).to(tl.float32)
tmp6 = tl.load(in_ptr3 + (x3), None).to(tl.float32)
tmp8 = tl.load(in_ptr4 + (x1), None)
tmp10 = tl.load(in_ptr5 + (x1), None)
tmp13 = tl.load(in_ptr6 + (x1), None)
tmp18 = tl.load(in_ptr7 + (x1), None)
tmp21 = tl.load(in_ptr8 + (x1), None)
tmp2 = tmp0 + tmp1
tmp4 = tmp2 + tmp3
tmp5 = tmp4.to(tl.float32)
tmp7 = tmp6.to(tl.float32)
tmp9 = tmp7 - tmp8
tmp11 = 9.964923469387754e-06
tmp12 = tmp10 * tmp11
tmp14 = tmp13 * tmp13
tmp15 = tmp12 * tmp14
tmp16 = tmp9 * tmp15
tmp17 = tmp5 - tmp16
tmp19 = tmp18 * tmp11
tmp20 = tmp17 - tmp19
tmp22 = tmp13 * tmp21
tmp23 = tmp20 * tmp22
tl.store(out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp23, None)
def get_args():
arg_0 = rand_strided((128, 56, 28, 28), (43904, 784, 28, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 56, 28, 28), (43904, 784, 28, 1), device='cuda:0', dtype=torch.float16)
arg_2 = rand_strided((128, 56, 28, 28), (43904, 784, 28, 1), device='cuda:0', dtype=torch.float16)
arg_3 = rand_strided((128, 56, 28, 28), (43904, 784, 28, 1), device='cuda:0', dtype=torch.float16)
arg_4 = rand_strided((1, 56, 1, 1), (56, 1, 1, 1), device='cuda:0', dtype=torch.float32)
arg_5 = rand_strided((56,), (1,), device='cuda:0', dtype=torch.float32)
arg_6 = rand_strided((56,), (1,), device='cuda:0', dtype=torch.float32)
arg_7 = rand_strided((56,), (1,), device='cuda:0', dtype=torch.float32)
arg_8 = rand_strided((56,), (1,), device='cuda:0', dtype=torch.float32)
arg_9 = rand_strided((128, 56, 28, 28), (43904, 784, 28, 1), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7, arg_8, arg_9,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_177.run(*args, 5619712, grid=grid(5619712), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_177.benchmark_all_configs(*args, 5619712, grid=grid(5619712))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/za/czavvoeeqv63efdv32prg5yimwonjxhjdak5qwnkzrdkukuwv7ls.py
# Original ATen: aten.convolution_backward
# aten.convolution_backward => convolution_backward_124
triton_poi_fused_convolution_backward_178 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_convolution_backward_178(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 2809856
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex % 21952
x1 = (xindex // 21952)
x2 = xindex
tmp0 = tl.load(in_ptr0 + (21952 + x0 + (43904*x1)), None)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp1, None)
def get_args():
arg_0 = rand_strided((128, 56, 28, 28), (43904, 784, 28, 1), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((128, 28, 28, 28), (21952, 784, 28, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_convolution_backward_178.run(*args, 2809856, grid=grid(2809856), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_convolution_backward_178.benchmark_all_configs(*args, 2809856, grid=grid(2809856))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/4j/c4jmqpkqr4fnrmxwsxtkllpzeruqpmftklalesh626aid62ltfs5.py
# Original ATen: aten.convolution_backward
# aten.convolution_backward => convolution_backward_125
triton_poi_fused_convolution_backward_179 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_convolution_backward_179(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 2809856
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex % 21952
x1 = (xindex // 21952)
x2 = xindex
tmp0 = tl.load(in_ptr0 + (x0 + (43904*x1)), None)
tmp1 = tmp0.to(tl.float32)