Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Select an option

  • Save shunting314/48efc83b12ec3ead950052e4a0220b10 to your computer and use it in GitHub Desktop.

Select an option

Save shunting314/48efc83b12ec3ead950052e4a0220b10 to your computer and use it in GitHub Desktop.
This file has been truncated, but you can view the full file.
# --- TorchInductor-generated wrapper preamble ---
# NOTE(review): this file is machine-generated output from torch._inductor
# (saved from a gist); it is not meant to be hand-edited.
from ctypes import c_void_p, c_long
import torch
import math
import random
import os
import tempfile
from torch._inductor.utils import maybe_profile
from torch import empty_strided, as_strided, device
from torch._inductor.codecache import AsyncCompile
from torch._inductor.select_algorithm import extern_kernels
# Shorthand for calling ATen ops directly from generated code.
aten = torch.ops.aten
# Runtime guard helper that validates tensor sizes/strides.
assert_size_stride = torch._C._dynamo.guards.assert_size_stride
# Compiles the Triton kernel source strings below asynchronously/lazily.
async_compile = AsyncCompile()
import triton
import triton.language as tl
from torch._inductor.triton_heuristics import grid, start_graph, end_graph
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
# kernel path: /tmp/torchinductor_shunting/qd/cqdbopslmbarnc23s33ybyghfqf2olis2okrueqdtwzk66npkoci.py
# Original ATen: aten._to_copy, aten.sum
# aten._to_copy => convert_element_type_403
# aten.sum => sum_1
triton_red_fused__to_copy_sum_0 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@reduction(
size_hints=[1024, 128],
reduction_hint=ReductionHint.OUTER,
filename=__file__,
meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32', 3: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 3), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__to_copy_sum_0(in_ptr0, out_ptr1, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
xnumel = 1000
rnumel = 128
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
xmask = xindex < xnumel
rbase = tl.arange(0, RBLOCK)[None, :]
x0 = xindex
_tmp1 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
for roffset in range(0, rnumel, RBLOCK):
rindex = roffset + rbase
rmask = rindex < rnumel
r1 = rindex
tmp0 = tl.load(in_ptr0 + (x0 + (1000*r1)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
_tmp1 = tl.where(rmask & xmask, _tmp1 + tmp0, _tmp1)
tmp1 = tl.sum(_tmp1, 1)[:, None]
tmp2 = tmp1.to(tl.float32)
tl.store(out_ptr1 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp2, xmask)
def get_args():
arg_0 = rand_strided((128, 1000), (1000, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((1000,), (1,), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_red_fused__to_copy_sum_0.run(*args, 1000, 128, grid=grid(1000), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_red_fused__to_copy_sum_0.benchmark_all_configs(*args, 1000, 128, grid=grid(1000))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/v4/cv4lm4gdpfxwjwbfwkrkjjepglxb33ctw5byvwc2ujd3ynyqgfob.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_402
triton_poi_fused__to_copy_1 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[2097152], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_1(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 1536000
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), None).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, None)
def get_args():
arg_0 = rand_strided((1000, 1536), (1536, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((1000, 1536), (1536, 1), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_1.run(*args, 1536000, grid=grid(1536000), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_1.benchmark_all_configs(*args, 1536000, grid=grid(1536000))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/gw/cgw4eilm5jaevvb4jlwkfuyolkwuloinsrcuxsoiruu5pucw2qoo.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.div, aten.native_batch_norm_backward, aten.threshold_backward
# aten._native_batch_norm_legit_functional => convert_element_type_398
# aten.div => div
# aten.native_batch_norm_backward => convert_element_type_404, mul_470, mul_478, sub_58, sum_2, sum_3
# aten.threshold_backward => scalar_tensor, where
triton_red_fused__native_batch_norm_legit_functional_div_native_batch_norm_backward_threshold_backward_2 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@reduction(
size_hints=[2048, 8192],
reduction_hint=ReductionHint.INNER,
filename=__file__,
meta={'signature': {0: '*i1', 1: '*fp16', 2: '*fp16', 3: '*fp32', 4: '*fp32', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: 'i32', 9: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_div_native_batch_norm_backward_threshold_backward_2(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, out_ptr0, out_ptr1, out_ptr2, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
xnumel = 1536
rnumel = 6272
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
xmask = xindex < xnumel
rbase = tl.arange(0, RBLOCK)[None, :]
x0 = xindex
_tmp7 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
tmp10 = tl.load(in_ptr3 + (x0), xmask)
_tmp13 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
for roffset in range(0, rnumel, RBLOCK):
rindex = roffset + rbase
rmask = rindex < rnumel
r1 = rindex % 49
r2 = (rindex // 49)
tmp0 = tl.load(in_ptr0 + (r1 + (49*x0) + (75264*r2)), rmask & xmask, eviction_policy='evict_last')
tmp2 = tl.load(in_ptr1 + (x0 + (1536*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp8 = tl.load(in_ptr2 + (r1 + (49*x0) + (75264*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp1 = 0.0
tmp3 = 49.0
tmp4 = tmp2 / tmp3
tmp5 = tl.where(tmp0, tmp1, tmp4)
tmp6 = tmp5.to(tl.float32)
_tmp7 = tl.where(rmask & xmask, _tmp7 + tmp6, _tmp7)
tmp9 = tmp8.to(tl.float32)
tmp11 = tmp9 - tmp10
tmp12 = tmp6 * tmp11
_tmp13 = tl.where(rmask & xmask, _tmp13 + tmp12, _tmp13)
tmp7 = tl.sum(_tmp7, 1)[:, None]
tl.store(out_ptr0 + x0, tmp7, xmask)
tmp13 = tl.sum(_tmp13, 1)[:, None]
tl.store(out_ptr1 + x0, tmp13, xmask)
tmp14 = tl.load(in_ptr4 + (x0), xmask)
tmp15 = tmp13 * tmp14
tl.store(out_ptr2 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp15, xmask)
def get_args():
arg_0 = rand_strided((128, 1536, 7, 7), (75264, 49, 7, 1), device='cuda:0', dtype=torch.bool)
arg_1 = rand_strided((128, 1536), (1536, 1), device='cuda:0', dtype=torch.float16)
arg_2 = rand_strided((128, 1536, 7, 7), (75264, 49, 7, 1), device='cuda:0', dtype=torch.float16)
arg_3 = rand_strided((1, 1536, 1, 1), (1536, 1, 1, 1), device='cuda:0', dtype=torch.float32)
arg_4 = rand_strided((1536,), (1,), device='cuda:0', dtype=torch.float32)
arg_5 = rand_strided((1536,), (1,), device='cuda:0', dtype=torch.float32)
arg_6 = rand_strided((1536,), (1,), device='cuda:0', dtype=torch.float32)
arg_7 = rand_strided((1536,), (1,), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_red_fused__native_batch_norm_legit_functional_div_native_batch_norm_backward_threshold_backward_2.run(*args, 1536, 6272, grid=grid(1536), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_red_fused__native_batch_norm_legit_functional_div_native_batch_norm_backward_threshold_backward_2.benchmark_all_configs(*args, 1536, 6272, grid=grid(1536))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/kf/ckfaohmrzufzdx2fsmml5navfg2efvvxodj5sf5pujefshnggoot.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.convolution_backward, aten.div, aten.native_batch_norm_backward, aten.threshold_backward
# aten._native_batch_norm_legit_functional => convert_element_type_398
# aten.convolution_backward => convolution_backward
# aten.div => div
# aten.native_batch_norm_backward => convert_element_type_404, convert_element_type_406, mul_476, mul_477, sub_58, sub_60, sub_61
# aten.threshold_backward => scalar_tensor, where
triton_poi_fused__native_batch_norm_legit_functional_convolution_backward_div_native_batch_norm_backward_threshold_backward_3 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[16777216], filename=__file__, meta={'signature': {0: '*i1', 1: '*fp16', 2: '*fp16', 3: '*fp32', 4: '*fp32', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: '*fp16', 9: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9), equal_to_1=())]})
@triton.jit
def triton_poi_fused__native_batch_norm_legit_functional_convolution_backward_div_native_batch_norm_backward_threshold_backward_3(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, in_ptr6, in_ptr7, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 9633792
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x3 = xindex
x4 = (xindex // 49)
x1 = (xindex // 49) % 1536
tmp0 = tl.load(in_ptr0 + (x3), None)
tmp2 = tl.load(in_ptr1 + (x4), None).to(tl.float32)
tmp7 = tl.load(in_ptr2 + (x3), None).to(tl.float32)
tmp9 = tl.load(in_ptr3 + (x1), None)
tmp11 = tl.load(in_ptr4 + (x1), None)
tmp14 = tl.load(in_ptr5 + (x1), None)
tmp19 = tl.load(in_ptr6 + (x1), None)
tmp22 = tl.load(in_ptr7 + (x1), None)
tmp1 = 0.0
tmp3 = 49.0
tmp4 = tmp2 / tmp3
tmp5 = tl.where(tmp0, tmp1, tmp4)
tmp6 = tmp5.to(tl.float32)
tmp8 = tmp7.to(tl.float32)
tmp10 = tmp8 - tmp9
tmp12 = 0.00015943877551020407
tmp13 = tmp11 * tmp12
tmp15 = tmp14 * tmp14
tmp16 = tmp13 * tmp15
tmp17 = tmp10 * tmp16
tmp18 = tmp6 - tmp17
tmp20 = tmp19 * tmp12
tmp21 = tmp18 - tmp20
tmp23 = tmp14 * tmp22
tmp24 = tmp21 * tmp23
tmp25 = tmp24.to(tl.float32)
tl.store(out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp25, None)
def get_args():
arg_0 = rand_strided((128, 1536, 7, 7), (75264, 49, 7, 1), device='cuda:0', dtype=torch.bool)
arg_1 = rand_strided((128, 1536), (1536, 1), device='cuda:0', dtype=torch.float16)
arg_2 = rand_strided((128, 1536, 7, 7), (75264, 49, 7, 1), device='cuda:0', dtype=torch.float16)
arg_3 = rand_strided((1, 1536, 1, 1), (1536, 1, 1, 1), device='cuda:0', dtype=torch.float32)
arg_4 = rand_strided((1536,), (1,), device='cuda:0', dtype=torch.float32)
arg_5 = rand_strided((1536,), (1,), device='cuda:0', dtype=torch.float32)
arg_6 = rand_strided((1536,), (1,), device='cuda:0', dtype=torch.float32)
arg_7 = rand_strided((1536,), (1,), device='cuda:0', dtype=torch.float32)
arg_8 = rand_strided((128, 1536, 7, 7), (75264, 49, 7, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7, arg_8,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__native_batch_norm_legit_functional_convolution_backward_div_native_batch_norm_backward_threshold_backward_3.run(*args, 9633792, grid=grid(9633792), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__native_batch_norm_legit_functional_convolution_backward_div_native_batch_norm_backward_threshold_backward_3.benchmark_all_configs(*args, 9633792, grid=grid(9633792))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/on/consb5x4mndq63g7yk4eommoi2zfnpkemgbkcrodivgrodwe4iek.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_407
triton_poi_fused__to_copy_4 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[524288], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_4(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 405504
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), None).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, None)
def get_args():
arg_0 = rand_strided((1536, 264, 1, 1), (264, 1, 1, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((1536, 264, 1, 1), (264, 1, 1, 1), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_4.run(*args, 405504, grid=grid(405504), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_4.benchmark_all_configs(*args, 405504, grid=grid(405504))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/aw/cawn5o3lch25tbuhghexsvhkzdksvz4ofqrpjmzgs2y6ldb5vypm.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.native_batch_norm_backward
# aten._native_batch_norm_legit_functional => convert_element_type_395
# aten.native_batch_norm_backward => convert_element_type_408, mul_479, mul_487, sub_62, sum_4, sum_5
triton_red_fused__native_batch_norm_legit_functional_native_batch_norm_backward_5 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@reduction(
size_hints=[512, 8192],
reduction_hint=ReductionHint.INNER,
filename=__file__,
meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp32', 6: '*fp32', 7: 'i32', 8: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 8), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_native_batch_norm_backward_5(in_ptr0, in_ptr1, in_ptr2, in_ptr3, out_ptr0, out_ptr1, out_ptr2, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
xnumel = 264
rnumel = 6272
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
xmask = xindex < xnumel
rbase = tl.arange(0, RBLOCK)[None, :]
x0 = xindex
_tmp2 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
tmp5 = tl.load(in_ptr2 + (x0), xmask)
_tmp8 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
for roffset in range(0, rnumel, RBLOCK):
rindex = roffset + rbase
rmask = rindex < rnumel
r1 = rindex % 49
r2 = (rindex // 49)
tmp0 = tl.load(in_ptr0 + (r1 + (49*x0) + (12936*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp3 = tl.load(in_ptr1 + (r1 + (49*x0) + (12936*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
_tmp2 = tl.where(rmask & xmask, _tmp2 + tmp1, _tmp2)
tmp4 = tmp3.to(tl.float32)
tmp6 = tmp4 - tmp5
tmp7 = tmp1 * tmp6
_tmp8 = tl.where(rmask & xmask, _tmp8 + tmp7, _tmp8)
tmp2 = tl.sum(_tmp2, 1)[:, None]
tl.store(out_ptr0 + x0, tmp2, xmask)
tmp8 = tl.sum(_tmp8, 1)[:, None]
tl.store(out_ptr1 + x0, tmp8, xmask)
tmp9 = tl.load(in_ptr3 + (x0), xmask)
tmp10 = tmp8 * tmp9
tl.store(out_ptr2 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp10, xmask)
def get_args():
arg_0 = rand_strided((128, 264, 7, 7), (12936, 49, 7, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 264, 7, 7), (12936, 49, 7, 1), device='cuda:0', dtype=torch.float16)
arg_2 = rand_strided((1, 264, 1, 1), (264, 1, 1, 1), device='cuda:0', dtype=torch.float32)
arg_3 = rand_strided((264,), (1,), device='cuda:0', dtype=torch.float32)
arg_4 = rand_strided((264,), (1,), device='cuda:0', dtype=torch.float32)
arg_5 = rand_strided((264,), (1,), device='cuda:0', dtype=torch.float32)
arg_6 = rand_strided((264,), (1,), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_red_fused__native_batch_norm_legit_functional_native_batch_norm_backward_5.run(*args, 264, 6272, grid=grid(264), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_red_fused__native_batch_norm_legit_functional_native_batch_norm_backward_5.benchmark_all_configs(*args, 264, 6272, grid=grid(264))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/rk/crkzi7f3ev6knrr3kmx5lxzwin43giwhtaru4yx4tnkfj3r3ngqj.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.native_batch_norm_backward
# aten._native_batch_norm_legit_functional => convert_element_type_395
# aten.native_batch_norm_backward => convert_element_type_408, convert_element_type_410, mul_485, mul_486, sub_62, sub_64, sub_65
triton_poi_fused__native_batch_norm_legit_functional_native_batch_norm_backward_6 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[2097152], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp32', 6: '*fp32', 7: '*fp16', 8: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8), equal_to_1=())]})
@triton.jit
def triton_poi_fused__native_batch_norm_legit_functional_native_batch_norm_backward_6(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, in_ptr6, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 1655808
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x3 = xindex
x1 = (xindex // 49) % 264
tmp0 = tl.load(in_ptr0 + (x3), xmask).to(tl.float32)
tmp2 = tl.load(in_ptr1 + (x3), xmask).to(tl.float32)
tmp4 = tl.load(in_ptr2 + (x1), xmask)
tmp6 = tl.load(in_ptr3 + (x1), xmask)
tmp9 = tl.load(in_ptr4 + (x1), xmask)
tmp14 = tl.load(in_ptr5 + (x1), xmask)
tmp17 = tl.load(in_ptr6 + (x1), xmask)
tmp1 = tmp0.to(tl.float32)
tmp3 = tmp2.to(tl.float32)
tmp5 = tmp3 - tmp4
tmp7 = 0.00015943877551020407
tmp8 = tmp6 * tmp7
tmp10 = tmp9 * tmp9
tmp11 = tmp8 * tmp10
tmp12 = tmp5 * tmp11
tmp13 = tmp1 - tmp12
tmp15 = tmp14 * tmp7
tmp16 = tmp13 - tmp15
tmp18 = tmp9 * tmp17
tmp19 = tmp16 * tmp18
tmp20 = tmp19.to(tl.float32)
tl.store(out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp20, xmask)
def get_args():
arg_0 = rand_strided((128, 264, 7, 7), (12936, 49, 7, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 264, 7, 7), (12936, 49, 7, 1), device='cuda:0', dtype=torch.float16)
arg_2 = rand_strided((1, 264, 1, 1), (264, 1, 1, 1), device='cuda:0', dtype=torch.float32)
arg_3 = rand_strided((264,), (1,), device='cuda:0', dtype=torch.float32)
arg_4 = rand_strided((264,), (1,), device='cuda:0', dtype=torch.float32)
arg_5 = rand_strided((264,), (1,), device='cuda:0', dtype=torch.float32)
arg_6 = rand_strided((264,), (1,), device='cuda:0', dtype=torch.float32)
arg_7 = rand_strided((128, 264, 7, 7), (12936, 49, 7, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__native_batch_norm_legit_functional_native_batch_norm_backward_6.run(*args, 1655808, grid=grid(1655808), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__native_batch_norm_legit_functional_native_batch_norm_backward_6.benchmark_all_configs(*args, 1655808, grid=grid(1655808))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/sf/csf7df3ryvp5mqalyhiqr2oqixspuwgupmdvdghhbdv4hoavgvib.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_411
triton_poi_fused__to_copy_7 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[131072], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_7(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 104544
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((132, 792, 1, 1), (792, 1, 1, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((132, 792, 1, 1), (792, 1, 1, 1), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_7.run(*args, 104544, grid=grid(104544), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_7.benchmark_all_configs(*args, 104544, grid=grid(104544))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/ph/cphopjqs73kagdjs5yonh2gxou75xcoyv4vg5cyr3nkxcgkuz4lp.py
# Original ATen: aten.cat
# aten.cat => cat_41
triton_poi_fused_cat_8 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[8388608], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_cat_8(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 4967424
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x2 = xindex
x0 = xindex % 38808
x1 = (xindex // 38808)
tmp0 = tl.load(in_ptr0 + (x2), xmask).to(tl.float32)
tl.store(out_ptr0 + (x0 + (77616*x1) + tl.zeros([XBLOCK], tl.int32)), tmp0, xmask)
def get_args():
arg_0 = rand_strided((128, 792, 7, 7), (38808, 49, 7, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 792, 7, 7), (77616, 49, 7, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_cat_8.run(*args, 4967424, grid=grid(4967424), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_cat_8.benchmark_all_configs(*args, 4967424, grid=grid(4967424))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/cd/ccdy3uu6zwicm5gp4nqvsu7scphal4x56e6djga5sd2lge4kiyge.py
# Original ATen: aten.cat
# aten.cat => cat_41
triton_poi_fused_cat_9 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[8388608], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_cat_9(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 4967424
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x2 = xindex
x0 = xindex % 38808
x1 = (xindex // 38808)
tmp0 = tl.load(in_ptr0 + (x2), xmask).to(tl.float32)
tl.store(out_ptr0 + (x0 + (77616*x1) + tl.zeros([XBLOCK], tl.int32)), tmp0, xmask)
def get_args():
arg_0 = rand_strided((128, 792, 7, 7), (38808, 49, 7, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 792, 7, 7), (77616, 49, 7, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_cat_9.run(*args, 4967424, grid=grid(4967424), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_cat_9.benchmark_all_configs(*args, 4967424, grid=grid(4967424))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/nt/cnt4hzkf7uueins7in5idjrq44bsdsgn6e67gfrc7gdtwfyotkdg.py
# Original ATen: aten.mul, aten.sigmoid, aten.sigmoid_backward, aten.silu, aten.sum
# aten.mul => mul_488
# aten.sigmoid => sigmoid_63
# aten.sigmoid_backward => convert_element_type_413, convert_element_type_414, convert_element_type_415, mul_490, mul_491, sub_66
# aten.silu => convert_element_type_385, convert_element_type_386, mul_453, sigmoid_61
# aten.sum => sum_6
triton_per_fused_mul_sigmoid_sigmoid_backward_silu_sum_10 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import persistent_reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@persistent_reduction(
size_hints=[262144, 64],
reduction_hint=ReductionHint.INNER,
filename=__file__,
meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp16', 4: 'i32', 5: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4), equal_to_1=())]}
)
@triton.jit
def triton_per_fused_mul_sigmoid_sigmoid_backward_silu_sum_10(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, xnumel, rnumel, XBLOCK : tl.constexpr):
xnumel = 202752
rnumel = 49
RBLOCK: tl.constexpr = 64
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
xmask = xindex < xnumel
rindex = tl.arange(0, RBLOCK)[None, :]
rmask = rindex < rnumel
r1 = rindex
x0 = xindex
tmp0 = tl.load(in_ptr0 + (r1 + (49*x0)), rmask, other=0).to(tl.float32)
tmp1 = tl.load(in_ptr1 + (r1 + (49*x0)), rmask, other=0).to(tl.float32)
tmp11 = tl.load(in_ptr2 + (x0), None).to(tl.float32)
tmp2 = tmp1.to(tl.float32)
tmp3 = tl.sigmoid(tmp2)
tmp4 = tmp2 * tmp3
tmp5 = tmp4.to(tl.float32)
tmp6 = tmp0 * tmp5
tmp8 = tl.where(rmask, tmp6, 0)
tmp9 = tl.sum(tmp8, 1)[:, None]
tmp10 = tmp9.to(tl.float32)
tmp12 = tl.sigmoid(tmp11)
tmp13 = tmp12.to(tl.float32)
tmp14 = 1.0
tmp15 = tmp14 - tmp13
tmp16 = tmp13 * tmp15
tmp17 = tmp10 * tmp16
tmp18 = tmp17.to(tl.float32)
tl.store(in_out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp18, None)
def get_args():
arg_0 = rand_strided((128, 1584, 1, 1), (1584, 1, 1, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 1584, 7, 7), (77616, 49, 7, 1), device='cuda:0', dtype=torch.float16)
arg_2 = rand_strided((128, 1584, 7, 7), (77616, 49, 7, 1), device='cuda:0', dtype=torch.float16)
arg_3 = rand_strided((128, 1584, 1, 1), (1584, 1, 1, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1, arg_2, arg_3,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_per_fused_mul_sigmoid_sigmoid_backward_silu_sum_10.run(*args, 202752, 49, grid=grid(202752), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_per_fused_mul_sigmoid_sigmoid_backward_silu_sum_10.benchmark_all_configs(*args, 202752, 49, grid=grid(202752))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/wr/cwr445l7k4eo6gcjw7l27usycvmf5jtqzbzk4btgcg2wqemciphj.py
# Original ATen: aten._to_copy, aten.convolution_backward
# aten._to_copy => convert_element_type_417
# aten.convolution_backward => sum_7
triton_per_fused__to_copy_convolution_backward_11 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import persistent_reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@persistent_reduction(
size_hints=[2048, 128],
reduction_hint=ReductionHint.INNER,
filename=__file__,
meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32', 3: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3), equal_to_1=())]}
)
@triton.jit
def triton_per_fused__to_copy_convolution_backward_11(in_ptr0, out_ptr1, xnumel, rnumel, XBLOCK : tl.constexpr):
xnumel = 1584
rnumel = 128
RBLOCK: tl.constexpr = 128
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
xmask = xindex < xnumel
rindex = tl.arange(0, RBLOCK)[None, :]
rmask = rindex < rnumel
r1 = rindex
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0 + (1584*r1)), rmask & xmask, other=0).to(tl.float32)
tmp2 = tl.where(rmask & xmask, tmp0, 0)
tmp3 = tl.sum(tmp2, 1)[:, None]
tmp4 = tmp3.to(tl.float32)
tl.store(out_ptr1 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp4, xmask)
def get_args():
arg_0 = rand_strided((128, 1584, 1, 1), (1584, 1, 1, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((1584,), (1,), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_per_fused__to_copy_convolution_backward_11.run(*args, 1584, 128, grid=grid(1584), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_per_fused__to_copy_convolution_backward_11.benchmark_all_configs(*args, 1584, 128, grid=grid(1584))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/vy/cvylenxoapd6e7xfcegwowbaxf7tmjjits4rifowpwoo5xytchjc.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_416
triton_poi_fused__to_copy_12 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[262144], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_12(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 209088
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((1584, 132, 1, 1), (132, 1, 1, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((1584, 132, 1, 1), (132, 1, 1, 1), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_12.run(*args, 209088, grid=grid(209088), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_12.benchmark_all_configs(*args, 209088, grid=grid(209088))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/ku/ckublkguiu5gw5vxkulgbwk62mp5fhr6syry2wrk56p3zdpzupzp.py
# Original ATen: aten.add, aten.clone, aten.fill, aten.mul, aten.sigmoid, aten.sub
# aten.add => add_304
# aten.clone => clone_47
# aten.fill => full_like
# aten.mul => mul_492, mul_493, mul_494
# aten.sigmoid => sigmoid_64
# aten.sub => sub_67
triton_poi_fused_add_clone_fill_mul_sigmoid_sub_13 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[32768], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_add_clone_fill_mul_sigmoid_sub_13(in_out_ptr0, in_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 16896
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_out_ptr0 + (x0), xmask).to(tl.float32)
tmp1 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
tmp2 = tl.sigmoid(tmp1)
tmp3 = 1.0
tmp4 = tmp3 - tmp2
tmp5 = tmp1 * tmp4
tmp6 = tmp5 + tmp3
tmp7 = tmp2 * tmp6
tmp8 = tmp0 * tmp7
tl.store(in_out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp8, xmask)
def get_args():
arg_0 = rand_strided((128, 132, 1, 1), (132, 1, 1, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 132, 1, 1), (132, 1, 1, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_add_clone_fill_mul_sigmoid_sub_13.run(*args, 16896, grid=grid(16896), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_add_clone_fill_mul_sigmoid_sub_13.benchmark_all_configs(*args, 16896, grid=grid(16896))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/4b/c4bhcaxi2hruo5hil4xfrkdmmucojrb5nsitzxo2oy2a6qcdg3ti.py
# Original ATen: aten._to_copy, aten.convolution_backward
# aten._to_copy => convert_element_type_419
# aten.convolution_backward => sum_8
triton_per_fused__to_copy_convolution_backward_14 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import persistent_reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@persistent_reduction(
size_hints=[256, 128],
reduction_hint=ReductionHint.INNER,
filename=__file__,
meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32', 3: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 3), equal_to_1=())]}
)
@triton.jit
def triton_per_fused__to_copy_convolution_backward_14(in_ptr0, out_ptr1, xnumel, rnumel, XBLOCK : tl.constexpr):
xnumel = 132
rnumel = 128
RBLOCK: tl.constexpr = 128
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
xmask = xindex < xnumel
rindex = tl.arange(0, RBLOCK)[None, :]
rmask = rindex < rnumel
r1 = rindex
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0 + (132*r1)), rmask & xmask, other=0).to(tl.float32)
tmp2 = tl.where(rmask & xmask, tmp0, 0)
tmp3 = tl.sum(tmp2, 1)[:, None]
tmp4 = tmp3.to(tl.float32)
tl.store(out_ptr1 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp4, xmask)
def get_args():
arg_0 = rand_strided((128, 132, 1, 1), (132, 1, 1, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((132,), (1,), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_per_fused__to_copy_convolution_backward_14.run(*args, 132, 128, grid=grid(132), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_per_fused__to_copy_convolution_backward_14.benchmark_all_configs(*args, 132, 128, grid=grid(132))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/mo/cmoc7y64jtpxuncjb73voe54cmkijbltrpgf5p7stvr4mny7avka.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_418
triton_poi_fused__to_copy_15 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[262144], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_15(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 209088
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((132, 1584, 1, 1), (1584, 1, 1, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((132, 1584, 1, 1), (1584, 1, 1, 1), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_15.run(*args, 209088, grid=grid(209088), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_15.benchmark_all_configs(*args, 209088, grid=grid(209088))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/et/cetgnuzvhwgtkuxuh7tdewyrx6qabbxsldzd2qnxluqsqq7zwwfh.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.add, aten.clone, aten.div, aten.fill, aten.mul, aten.native_batch_norm_backward, aten.sigmoid, aten.sub
# aten._native_batch_norm_legit_functional => convert_element_type_383
# aten.add => add_305, add_306
# aten.clone => clone_46
# aten.div => div_1
# aten.fill => full_like_1
# aten.mul => mul_489, mul_495, mul_496, mul_497
# aten.native_batch_norm_backward => convert_element_type_420, mul_498, mul_506, sub_69, sum_10, sum_9
# aten.sigmoid => sigmoid_63, sigmoid_65
# aten.sub => sub_68
triton_red_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_16 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@reduction(
size_hints=[2048, 8192],
reduction_hint=ReductionHint.INNER,
filename=__file__,
meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp16', 4: '*fp16', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: '*fp32', 9: '*fp32', 10: 'i32', 11: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_16(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, in_ptr6, out_ptr0, out_ptr1, out_ptr2, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
xnumel = 1584
rnumel = 6272
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
xmask = xindex < xnumel
rbase = tl.arange(0, RBLOCK)[None, :]
x0 = xindex
_tmp17 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
tmp20 = tl.load(in_ptr5 + (x0), xmask)
_tmp23 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
for roffset in range(0, rnumel, RBLOCK):
rindex = roffset + rbase
rmask = rindex < rnumel
r1 = rindex % 49
r2 = (rindex // 49)
tmp0 = tl.load(in_ptr0 + (r1 + (49*x0) + (77616*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp1 = tl.load(in_ptr1 + (x0 + (1584*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp4 = tl.load(in_ptr2 + (x0 + (1584*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp8 = tl.load(in_ptr3 + (r1 + (49*x0) + (77616*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp18 = tl.load(in_ptr4 + (r1 + (49*x0) + (77616*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp2 = tl.sigmoid(tmp1)
tmp3 = tmp0 * tmp2
tmp5 = 49.0
tmp6 = tmp4 / tmp5
tmp7 = tmp3 + tmp6
tmp9 = tl.sigmoid(tmp8)
tmp10 = 1.0
tmp11 = tmp10 - tmp9
tmp12 = tmp8 * tmp11
tmp13 = tmp12 + tmp10
tmp14 = tmp9 * tmp13
tmp15 = tmp7 * tmp14
tmp16 = tmp15.to(tl.float32)
_tmp17 = tl.where(rmask & xmask, _tmp17 + tmp16, _tmp17)
tmp19 = tmp18.to(tl.float32)
tmp21 = tmp19 - tmp20
tmp22 = tmp16 * tmp21
_tmp23 = tl.where(rmask & xmask, _tmp23 + tmp22, _tmp23)
tmp17 = tl.sum(_tmp17, 1)[:, None]
tl.store(out_ptr0 + x0, tmp17, xmask)
tmp23 = tl.sum(_tmp23, 1)[:, None]
tl.store(out_ptr1 + x0, tmp23, xmask)
tmp24 = tl.load(in_ptr6 + (x0), xmask)
tmp25 = tmp23 * tmp24
tl.store(out_ptr2 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp25, xmask)
def get_args():
arg_0 = rand_strided((128, 1584, 7, 7), (77616, 49, 7, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 1584, 1, 1), (1584, 1, 1, 1), device='cuda:0', dtype=torch.float16)
arg_2 = rand_strided((128, 1584, 1, 1), (1584, 1, 1, 1), device='cuda:0', dtype=torch.float16)
arg_3 = rand_strided((128, 1584, 7, 7), (77616, 49, 7, 1), device='cuda:0', dtype=torch.float16)
arg_4 = rand_strided((128, 1584, 7, 7), (77616, 49, 7, 1), device='cuda:0', dtype=torch.float16)
arg_5 = rand_strided((1, 1584, 1, 1), (1584, 1, 1, 1), device='cuda:0', dtype=torch.float32)
arg_6 = rand_strided((1584,), (1,), device='cuda:0', dtype=torch.float32)
arg_7 = rand_strided((1584,), (1,), device='cuda:0', dtype=torch.float32)
arg_8 = rand_strided((1584,), (1,), device='cuda:0', dtype=torch.float32)
arg_9 = rand_strided((1584,), (1,), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7, arg_8, arg_9,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_red_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_16.run(*args, 1584, 6272, grid=grid(1584), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_red_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_16.benchmark_all_configs(*args, 1584, 6272, grid=grid(1584))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/vq/cvqzgmyak74w6zf55f32ognvdbwijbc4xvxna4rdk6flp32c4rx3.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.add, aten.clone, aten.div, aten.fill, aten.mul, aten.native_batch_norm_backward, aten.sigmoid, aten.sub
# aten._native_batch_norm_legit_functional => convert_element_type_383
# aten.add => add_305, add_306
# aten.clone => clone_46
# aten.div => div_1
# aten.fill => full_like_1
# aten.mul => mul_489, mul_495, mul_496, mul_497
# aten.native_batch_norm_backward => convert_element_type_420, mul_504, sub_69, sub_71, sub_72
# aten.sigmoid => sigmoid_63, sigmoid_65
# aten.sub => sub_68
triton_poi_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_17 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[16777216], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp16', 4: '*fp16', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: '*fp32', 9: '*fp32', 10: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10), equal_to_1=())]})
@triton.jit
def triton_poi_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_17(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, in_ptr6, in_ptr7, in_ptr8, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 9934848
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x3 = xindex
x4 = (xindex // 49)
x1 = (xindex // 49) % 1584
tmp0 = tl.load(in_ptr0 + (x3), None).to(tl.float32)
tmp1 = tl.load(in_ptr1 + (x4), None).to(tl.float32)
tmp4 = tl.load(in_ptr2 + (x4), None).to(tl.float32)
tmp8 = tl.load(in_ptr3 + (x3), None).to(tl.float32)
tmp17 = tl.load(in_ptr4 + (x3), None).to(tl.float32)
tmp19 = tl.load(in_ptr5 + (x1), None)
tmp21 = tl.load(in_ptr6 + (x1), None)
tmp24 = tl.load(in_ptr7 + (x1), None)
tmp29 = tl.load(in_ptr8 + (x1), None)
tmp2 = tl.sigmoid(tmp1)
tmp3 = tmp0 * tmp2
tmp5 = 49.0
tmp6 = tmp4 / tmp5
tmp7 = tmp3 + tmp6
tmp9 = tl.sigmoid(tmp8)
tmp10 = 1.0
tmp11 = tmp10 - tmp9
tmp12 = tmp8 * tmp11
tmp13 = tmp12 + tmp10
tmp14 = tmp9 * tmp13
tmp15 = tmp7 * tmp14
tmp16 = tmp15.to(tl.float32)
tmp18 = tmp17.to(tl.float32)
tmp20 = tmp18 - tmp19
tmp22 = 0.00015943877551020407
tmp23 = tmp21 * tmp22
tmp25 = tmp24 * tmp24
tmp26 = tmp23 * tmp25
tmp27 = tmp20 * tmp26
tmp28 = tmp16 - tmp27
tmp30 = tmp29 * tmp22
tmp31 = tmp28 - tmp30
tl.store(out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp31, None)
def get_args():
arg_0 = rand_strided((128, 1584, 7, 7), (77616, 49, 7, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 1584, 1, 1), (1584, 1, 1, 1), device='cuda:0', dtype=torch.float16)
arg_2 = rand_strided((128, 1584, 1, 1), (1584, 1, 1, 1), device='cuda:0', dtype=torch.float16)
arg_3 = rand_strided((128, 1584, 7, 7), (77616, 49, 7, 1), device='cuda:0', dtype=torch.float16)
arg_4 = rand_strided((128, 1584, 7, 7), (77616, 49, 7, 1), device='cuda:0', dtype=torch.float16)
arg_5 = rand_strided((1, 1584, 1, 1), (1584, 1, 1, 1), device='cuda:0', dtype=torch.float32)
arg_6 = rand_strided((1584,), (1,), device='cuda:0', dtype=torch.float32)
arg_7 = rand_strided((1584,), (1,), device='cuda:0', dtype=torch.float32)
arg_8 = rand_strided((1584,), (1,), device='cuda:0', dtype=torch.float32)
arg_9 = rand_strided((128, 1584, 7, 7), (77616, 49, 7, 1), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7, arg_8, arg_9,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_17.run(*args, 9934848, grid=grid(9934848), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_17.benchmark_all_configs(*args, 9934848, grid=grid(9934848))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/4k/c4k5hrio7ng5ql4zfsckty7aijjdguidt3u7uu2czayedkp6afcd.py
# Original ATen: aten.convolution_backward
# aten.convolution_backward => convolution_backward_5
triton_poi_fused_convolution_backward_18 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp32', 2: '*fp32', 3: '*fp16', 4: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4), equal_to_1=())]})
@triton.jit
def triton_poi_fused_convolution_backward_18(in_ptr0, in_ptr1, in_ptr2, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 2483712
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x2 = (xindex // 19404)
x3 = xindex % 19404
x1 = (xindex // 49) % 396
x4 = xindex
tmp0 = tl.load(in_ptr0 + (58212 + x3 + (77616*x2)), xmask)
tmp1 = tl.load(in_ptr1 + (1188 + x1), xmask)
tmp2 = tl.load(in_ptr2 + (1188 + x1), xmask)
tmp3 = tmp1 * tmp2
tmp4 = tmp0 * tmp3
tmp5 = tmp4.to(tl.float32)
tl.store(out_ptr0 + (x4 + tl.zeros([XBLOCK], tl.int32)), tmp5, xmask)
def get_args():
arg_0 = rand_strided((128, 1584, 7, 7), (77616, 49, 7, 1), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((1584,), (1,), device='cuda:0', dtype=torch.float32)
arg_2 = rand_strided((1584,), (1,), device='cuda:0', dtype=torch.float32)
arg_3 = rand_strided((128, 396, 7, 7), (19404, 49, 7, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1, arg_2, arg_3,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_convolution_backward_18.run(*args, 2483712, grid=grid(2483712), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_convolution_backward_18.benchmark_all_configs(*args, 2483712, grid=grid(2483712))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/le/clevrgmaajodoycxcxjtmbwkkhfu2zyqxjmzkxois2uwwej5n47c.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_423
triton_poi_fused__to_copy_19 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[32768], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_19(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 32076
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((396, 1, 9, 9), (81, 81, 9, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((396, 1, 9, 9), (81, 81, 9, 1), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_19.run(*args, 32076, grid=grid(32076), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_19.benchmark_all_configs(*args, 32076, grid=grid(32076))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/yx/cyxchtoyxooc3k4ah4othvbsdd77dfkbeykdlyzawbwqv4zeq227.py
# Original ATen: aten.convolution_backward
# aten.convolution_backward => convolution_backward_6
triton_poi_fused_convolution_backward_20 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp32', 2: '*fp32', 3: '*fp16', 4: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4), equal_to_1=())]})
@triton.jit
def triton_poi_fused_convolution_backward_20(in_ptr0, in_ptr1, in_ptr2, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 2483712
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x2 = (xindex // 19404)
x3 = xindex % 19404
x1 = (xindex // 49) % 396
x4 = xindex
tmp0 = tl.load(in_ptr0 + (38808 + x3 + (77616*x2)), xmask)
tmp1 = tl.load(in_ptr1 + (792 + x1), xmask)
tmp2 = tl.load(in_ptr2 + (792 + x1), xmask)
tmp3 = tmp1 * tmp2
tmp4 = tmp0 * tmp3
tmp5 = tmp4.to(tl.float32)
tl.store(out_ptr0 + (x4 + tl.zeros([XBLOCK], tl.int32)), tmp5, xmask)
def get_args():
arg_0 = rand_strided((128, 1584, 7, 7), (77616, 49, 7, 1), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((1584,), (1,), device='cuda:0', dtype=torch.float32)
arg_2 = rand_strided((1584,), (1,), device='cuda:0', dtype=torch.float32)
arg_3 = rand_strided((128, 396, 7, 7), (19404, 49, 7, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1, arg_2, arg_3,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_convolution_backward_20.run(*args, 2483712, grid=grid(2483712), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_convolution_backward_20.benchmark_all_configs(*args, 2483712, grid=grid(2483712))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/eb/cebicyjdxyev2ctnv4eqoljt3ynagd7ac2xafkomzznzw2akhxue.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_424
triton_poi_fused__to_copy_21 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[32768], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_21(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 19404
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((396, 1, 7, 7), (49, 49, 7, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((396, 1, 7, 7), (49, 49, 7, 1), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_21.run(*args, 19404, grid=grid(19404), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_21.benchmark_all_configs(*args, 19404, grid=grid(19404))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/n6/cn6zlzicefkme4u5z2u7vgzu7zs36znck62asy3a53dhtgwtnfey.py
# Original ATen: aten.convolution_backward
# aten.convolution_backward => convolution_backward_7
triton_poi_fused_convolution_backward_22 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp32', 2: '*fp32', 3: '*fp16', 4: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4), equal_to_1=())]})
@triton.jit
def triton_poi_fused_convolution_backward_22(in_ptr0, in_ptr1, in_ptr2, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 2483712
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x2 = (xindex // 19404)
x3 = xindex % 19404
x1 = (xindex // 49) % 396
x4 = xindex
tmp0 = tl.load(in_ptr0 + (19404 + x3 + (77616*x2)), xmask)
tmp1 = tl.load(in_ptr1 + (396 + x1), xmask)
tmp2 = tl.load(in_ptr2 + (396 + x1), xmask)
tmp3 = tmp1 * tmp2
tmp4 = tmp0 * tmp3
tmp5 = tmp4.to(tl.float32)
tl.store(out_ptr0 + (x4 + tl.zeros([XBLOCK], tl.int32)), tmp5, xmask)
def get_args():
arg_0 = rand_strided((128, 1584, 7, 7), (77616, 49, 7, 1), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((1584,), (1,), device='cuda:0', dtype=torch.float32)
arg_2 = rand_strided((1584,), (1,), device='cuda:0', dtype=torch.float32)
arg_3 = rand_strided((128, 396, 7, 7), (19404, 49, 7, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1, arg_2, arg_3,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_convolution_backward_22.run(*args, 2483712, grid=grid(2483712), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_convolution_backward_22.benchmark_all_configs(*args, 2483712, grid=grid(2483712))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/5j/c5jiqoftvlcj3ob2vawgml2t4mfu74vouj3eu6ozelrenm5a7frc.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_425
triton_poi_fused__to_copy_23 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[16384], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_23(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 9900
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((396, 1, 5, 5), (25, 25, 5, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((396, 1, 5, 5), (25, 25, 5, 1), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_23.run(*args, 9900, grid=grid(9900), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_23.benchmark_all_configs(*args, 9900, grid=grid(9900))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/wd/cwd5qp57lsltp4fq4manegrstgskzimmaukeha4dh4fnqhtwaydf.py
# Original ATen: aten.convolution_backward
# aten.convolution_backward => convolution_backward_8
triton_poi_fused_convolution_backward_24 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp32', 2: '*fp32', 3: '*fp16', 4: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4), equal_to_1=())]})
@triton.jit
def triton_poi_fused_convolution_backward_24(in_ptr0, in_ptr1, in_ptr2, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 2483712
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x2 = (xindex // 19404)
x3 = xindex % 19404
x1 = (xindex // 49) % 396
x4 = xindex
tmp0 = tl.load(in_ptr0 + (x3 + (77616*x2)), xmask)
tmp1 = tl.load(in_ptr1 + (x1), xmask)
tmp2 = tl.load(in_ptr2 + (x1), xmask)
tmp3 = tmp1 * tmp2
tmp4 = tmp0 * tmp3
tmp5 = tmp4.to(tl.float32)
tl.store(out_ptr0 + (x4 + tl.zeros([XBLOCK], tl.int32)), tmp5, xmask)
def get_args():
arg_0 = rand_strided((128, 1584, 7, 7), (77616, 49, 7, 1), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((1584,), (1,), device='cuda:0', dtype=torch.float32)
arg_2 = rand_strided((1584,), (1,), device='cuda:0', dtype=torch.float32)
arg_3 = rand_strided((128, 396, 7, 7), (19404, 49, 7, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1, arg_2, arg_3,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_convolution_backward_24.run(*args, 2483712, grid=grid(2483712), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_convolution_backward_24.benchmark_all_configs(*args, 2483712, grid=grid(2483712))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/wc/cwc7i4csdkpcwn5beyvhsemwhtbq7gxbezkowlgoa2xhw4y7krz2.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_426
triton_poi_fused__to_copy_25 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4096], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_25(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 3564
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((396, 1, 3, 3), (9, 9, 3, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((396, 1, 3, 3), (9, 9, 3, 1), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_25.run(*args, 3564, grid=grid(3564), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_25.benchmark_all_configs(*args, 3564, grid=grid(3564))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/cp/ccpi5jja3x5efmayn24o6k4nqs6v2zc5jff75of6woeln6emtny2.py
# Original ATen: aten.cat
# aten.cat => cat_42
triton_poi_fused_cat_26 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_cat_26(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 2483712
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x2 = xindex
x0 = xindex % 19404
x1 = (xindex // 19404)
tmp0 = tl.load(in_ptr0 + (x2), xmask).to(tl.float32)
tl.store(out_ptr0 + (x0 + (77616*x1) + tl.zeros([XBLOCK], tl.int32)), tmp0, xmask)
def get_args():
arg_0 = rand_strided((128, 396, 7, 7), (19404, 49, 7, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 396, 7, 7), (77616, 49, 7, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_cat_26.run(*args, 2483712, grid=grid(2483712), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_cat_26.benchmark_all_configs(*args, 2483712, grid=grid(2483712))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/rg/crgc5ihkw5zltfd3tfchnahu2yx2wga4mambpixjsb7mmeynkmrh.py
# Original ATen: aten.cat
# aten.cat => cat_42
triton_poi_fused_cat_27 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_cat_27(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 2483712
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x2 = xindex
x0 = xindex % 19404
x1 = (xindex // 19404)
tmp0 = tl.load(in_ptr0 + (x2), xmask).to(tl.float32)
tl.store(out_ptr0 + (x0 + (77616*x1) + tl.zeros([XBLOCK], tl.int32)), tmp0, xmask)
def get_args():
arg_0 = rand_strided((128, 396, 7, 7), (19404, 49, 7, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 396, 7, 7), (77616, 49, 7, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_cat_27.run(*args, 2483712, grid=grid(2483712), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_cat_27.benchmark_all_configs(*args, 2483712, grid=grid(2483712))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/hq/chqi5nwugaohgw3qon7t3fytiqnvkhkemtghder4rphksh2dt7uc.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.mul, aten.native_batch_norm_backward
# aten._native_batch_norm_legit_functional => convert_element_type_375
# aten.mul => mul_509
# aten.native_batch_norm_backward => convert_element_type_427, mul_510, mul_518, sub_74, sum_11, sum_12
triton_red_fused__native_batch_norm_legit_functional_mul_native_batch_norm_backward_28 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@reduction(
size_hints=[2048, 8192],
reduction_hint=ReductionHint.INNER,
filename=__file__,
meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp32', 4: '*fp32', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: 'i32', 9: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_mul_native_batch_norm_backward_28(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, out_ptr0, out_ptr1, out_ptr2, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
xnumel = 1584
rnumel = 6272
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
xmask = xindex < xnumel
rbase = tl.arange(0, RBLOCK)[None, :]
x0 = xindex
_tmp4 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
tmp7 = tl.load(in_ptr3 + (x0), xmask)
_tmp10 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
for roffset in range(0, rnumel, RBLOCK):
rindex = roffset + rbase
rmask = rindex < rnumel
r1 = rindex % 49
r2 = (rindex // 49)
tmp0 = tl.load(in_ptr0 + (r1 + (49*x0) + (77616*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp1 = tl.load(in_ptr1 + (r1 + (49*x0) + (77616*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp5 = tl.load(in_ptr2 + (r1 + (49*x0) + (77616*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp2 = tmp0 * tmp1
tmp3 = tmp2.to(tl.float32)
_tmp4 = tl.where(rmask & xmask, _tmp4 + tmp3, _tmp4)
tmp6 = tmp5.to(tl.float32)
tmp8 = tmp6 - tmp7
tmp9 = tmp3 * tmp8
_tmp10 = tl.where(rmask & xmask, _tmp10 + tmp9, _tmp10)
tmp4 = tl.sum(_tmp4, 1)[:, None]
tl.store(out_ptr0 + x0, tmp4, xmask)
tmp10 = tl.sum(_tmp10, 1)[:, None]
tl.store(out_ptr1 + x0, tmp10, xmask)
tmp11 = tl.load(in_ptr4 + (x0), xmask)
tmp12 = tmp10 * tmp11
tl.store(out_ptr2 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp12, xmask)
def get_args():
arg_0 = rand_strided((128, 1584, 7, 7), (77616, 49, 7, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 1584, 7, 7), (77616, 49, 7, 1), device='cuda:0', dtype=torch.float16)
arg_2 = rand_strided((128, 1584, 7, 7), (77616, 49, 7, 1), device='cuda:0', dtype=torch.float16)
arg_3 = rand_strided((1, 1584, 1, 1), (1584, 1, 1, 1), device='cuda:0', dtype=torch.float32)
arg_4 = rand_strided((1584,), (1,), device='cuda:0', dtype=torch.float32)
arg_5 = rand_strided((1584,), (1,), device='cuda:0', dtype=torch.float32)
arg_6 = rand_strided((1584,), (1,), device='cuda:0', dtype=torch.float32)
arg_7 = rand_strided((1584,), (1,), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_red_fused__native_batch_norm_legit_functional_mul_native_batch_norm_backward_28.run(*args, 1584, 6272, grid=grid(1584), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_red_fused__native_batch_norm_legit_functional_mul_native_batch_norm_backward_28.benchmark_all_configs(*args, 1584, 6272, grid=grid(1584))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/4i/c4iqo4zwyswlhwfnswymdycvvcfufifa735bkh3k5fjuqlgdvt3q.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.convolution_backward, aten.mul, aten.native_batch_norm_backward
# aten._native_batch_norm_legit_functional => convert_element_type_375
# aten.convolution_backward => convolution_backward_9
# aten.mul => mul_509
# aten.native_batch_norm_backward => convert_element_type_427, convert_element_type_429, mul_516, mul_517, sub_74, sub_76, sub_77
triton_poi_fused__native_batch_norm_legit_functional_convolution_backward_mul_native_batch_norm_backward_29 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[16777216], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp32', 4: '*fp32', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8), equal_to_1=())]})
@triton.jit
def triton_poi_fused__native_batch_norm_legit_functional_convolution_backward_mul_native_batch_norm_backward_29(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, in_ptr6, xnumel, XBLOCK : tl.constexpr):
xnumel = 9934848
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x3 = xindex
x1 = (xindex // 49) % 1584
tmp0 = tl.load(in_out_ptr0 + (x3), None).to(tl.float32)
tmp1 = tl.load(in_ptr0 + (x3), None).to(tl.float32)
tmp4 = tl.load(in_ptr1 + (x3), None).to(tl.float32)
tmp6 = tl.load(in_ptr2 + (x1), None)
tmp8 = tl.load(in_ptr3 + (x1), None)
tmp11 = tl.load(in_ptr4 + (x1), None)
tmp16 = tl.load(in_ptr5 + (x1), None)
tmp19 = tl.load(in_ptr6 + (x1), None)
tmp2 = tmp0 * tmp1
tmp3 = tmp2.to(tl.float32)
tmp5 = tmp4.to(tl.float32)
tmp7 = tmp5 - tmp6
tmp9 = 0.00015943877551020407
tmp10 = tmp8 * tmp9
tmp12 = tmp11 * tmp11
tmp13 = tmp10 * tmp12
tmp14 = tmp7 * tmp13
tmp15 = tmp3 - tmp14
tmp17 = tmp16 * tmp9
tmp18 = tmp15 - tmp17
tmp20 = tmp11 * tmp19
tmp21 = tmp18 * tmp20
tmp22 = tmp21.to(tl.float32)
tl.store(in_out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp22, None)
def get_args():
arg_0 = rand_strided((128, 1584, 7, 7), (77616, 49, 7, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 1584, 7, 7), (77616, 49, 7, 1), device='cuda:0', dtype=torch.float16)
arg_2 = rand_strided((128, 1584, 7, 7), (77616, 49, 7, 1), device='cuda:0', dtype=torch.float16)
arg_3 = rand_strided((1, 1584, 1, 1), (1584, 1, 1, 1), device='cuda:0', dtype=torch.float32)
arg_4 = rand_strided((1584,), (1,), device='cuda:0', dtype=torch.float32)
arg_5 = rand_strided((1584,), (1,), device='cuda:0', dtype=torch.float32)
arg_6 = rand_strided((1584,), (1,), device='cuda:0', dtype=torch.float32)
arg_7 = rand_strided((1584,), (1,), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__native_batch_norm_legit_functional_convolution_backward_mul_native_batch_norm_backward_29.run(*args, 9934848, grid=grid(9934848), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__native_batch_norm_legit_functional_convolution_backward_mul_native_batch_norm_backward_29.benchmark_all_configs(*args, 9934848, grid=grid(9934848))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/np/cnpskkb6q6odcd2dqw7z7r4e7p4cgfh5g7s5kyt2zmqwzsyn7azj.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_430
triton_poi_fused__to_copy_30 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[524288], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_30(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 418176
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((1584, 264, 1, 1), (264, 1, 1, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((1584, 264, 1, 1), (264, 1, 1, 1), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_30.run(*args, 418176, grid=grid(418176), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_30.benchmark_all_configs(*args, 418176, grid=grid(418176))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/ut/cuta2ldfadubtnwekrao3mdwks6gtwerw2rxvha7uaqnloaeqvfv.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.add, aten.native_batch_norm_backward
# aten._native_batch_norm_legit_functional => convert_element_type_372
# aten.add => add_308
# aten.native_batch_norm_backward => convert_element_type_431, mul_519, mul_527, sub_78, sum_13, sum_14
triton_red_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_31 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@reduction(
size_hints=[512, 8192],
reduction_hint=ReductionHint.INNER,
filename=__file__,
meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp32', 4: '*fp32', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: 'i32', 9: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 9), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_31(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, out_ptr0, out_ptr1, out_ptr2, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
xnumel = 264
rnumel = 6272
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
xmask = xindex < xnumel
rbase = tl.arange(0, RBLOCK)[None, :]
x0 = xindex
_tmp4 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
tmp7 = tl.load(in_ptr3 + (x0), xmask)
_tmp10 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
for roffset in range(0, rnumel, RBLOCK):
rindex = roffset + rbase
rmask = rindex < rnumel
r1 = rindex % 49
r2 = (rindex // 49)
tmp0 = tl.load(in_ptr0 + (r1 + (49*x0) + (12936*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp1 = tl.load(in_ptr1 + (r1 + (49*x0) + (12936*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp5 = tl.load(in_ptr2 + (r1 + (49*x0) + (12936*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp2 = tmp0 + tmp1
tmp3 = tmp2.to(tl.float32)
_tmp4 = tl.where(rmask & xmask, _tmp4 + tmp3, _tmp4)
tmp6 = tmp5.to(tl.float32)
tmp8 = tmp6 - tmp7
tmp9 = tmp3 * tmp8
_tmp10 = tl.where(rmask & xmask, _tmp10 + tmp9, _tmp10)
tmp4 = tl.sum(_tmp4, 1)[:, None]
tl.store(out_ptr0 + x0, tmp4, xmask)
tmp10 = tl.sum(_tmp10, 1)[:, None]
tl.store(out_ptr1 + x0, tmp10, xmask)
tmp11 = tl.load(in_ptr4 + (x0), xmask)
tmp12 = tmp10 * tmp11
tl.store(out_ptr2 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp12, xmask)
def get_args():
arg_0 = rand_strided((128, 264, 7, 7), (12936, 49, 7, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 264, 7, 7), (12936, 49, 7, 1), device='cuda:0', dtype=torch.float16)
arg_2 = rand_strided((128, 264, 7, 7), (12936, 49, 7, 1), device='cuda:0', dtype=torch.float16)
arg_3 = rand_strided((1, 264, 1, 1), (264, 1, 1, 1), device='cuda:0', dtype=torch.float32)
arg_4 = rand_strided((264,), (1,), device='cuda:0', dtype=torch.float32)
arg_5 = rand_strided((264,), (1,), device='cuda:0', dtype=torch.float32)
arg_6 = rand_strided((264,), (1,), device='cuda:0', dtype=torch.float32)
arg_7 = rand_strided((264,), (1,), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_red_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_31.run(*args, 264, 6272, grid=grid(264), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_red_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_31.benchmark_all_configs(*args, 264, 6272, grid=grid(264))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/a3/ca33o5ukggucnj2hrtjksvzkzfzfe3v4qnwzz5ycytjwaauqcvop.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.add, aten.native_batch_norm_backward
# aten._native_batch_norm_legit_functional => convert_element_type_372
# aten.add => add_308
# aten.native_batch_norm_backward => convert_element_type_431, convert_element_type_433, mul_525, mul_526, sub_78, sub_80, sub_81
triton_poi_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_32 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[2097152], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp32', 4: '*fp32', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: '*fp16', 9: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9), equal_to_1=())]})
@triton.jit
def triton_poi_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_32(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, in_ptr6, in_ptr7, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 1655808
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x3 = xindex
x1 = (xindex // 49) % 264
tmp0 = tl.load(in_ptr0 + (x3), xmask).to(tl.float32)
tmp1 = tl.load(in_ptr1 + (x3), xmask).to(tl.float32)
tmp4 = tl.load(in_ptr2 + (x3), xmask).to(tl.float32)
tmp6 = tl.load(in_ptr3 + (x1), xmask)
tmp8 = tl.load(in_ptr4 + (x1), xmask)
tmp11 = tl.load(in_ptr5 + (x1), xmask)
tmp16 = tl.load(in_ptr6 + (x1), xmask)
tmp19 = tl.load(in_ptr7 + (x1), xmask)
tmp2 = tmp0 + tmp1
tmp3 = tmp2.to(tl.float32)
tmp5 = tmp4.to(tl.float32)
tmp7 = tmp5 - tmp6
tmp9 = 0.00015943877551020407
tmp10 = tmp8 * tmp9
tmp12 = tmp11 * tmp11
tmp13 = tmp10 * tmp12
tmp14 = tmp7 * tmp13
tmp15 = tmp3 - tmp14
tmp17 = tmp16 * tmp9
tmp18 = tmp15 - tmp17
tmp20 = tmp11 * tmp19
tmp21 = tmp18 * tmp20
tmp22 = tmp21.to(tl.float32)
tl.store(out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp22, xmask)
def get_args():
arg_0 = rand_strided((128, 264, 7, 7), (12936, 49, 7, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 264, 7, 7), (12936, 49, 7, 1), device='cuda:0', dtype=torch.float16)
arg_2 = rand_strided((128, 264, 7, 7), (12936, 49, 7, 1), device='cuda:0', dtype=torch.float16)
arg_3 = rand_strided((1, 264, 1, 1), (264, 1, 1, 1), device='cuda:0', dtype=torch.float32)
arg_4 = rand_strided((264,), (1,), device='cuda:0', dtype=torch.float32)
arg_5 = rand_strided((264,), (1,), device='cuda:0', dtype=torch.float32)
arg_6 = rand_strided((264,), (1,), device='cuda:0', dtype=torch.float32)
arg_7 = rand_strided((264,), (1,), device='cuda:0', dtype=torch.float32)
arg_8 = rand_strided((128, 264, 7, 7), (12936, 49, 7, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7, arg_8,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_32.run(*args, 1655808, grid=grid(1655808), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_32.benchmark_all_configs(*args, 1655808, grid=grid(1655808))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/hd/chdicakkg6zydfy4ll5rit2mnugrbccllk6kssfzzb7i55y4shxu.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.add, aten.native_batch_norm_backward
# aten._native_batch_norm_legit_functional => convert_element_type_349
# aten.add => add_308, add_313
# aten.native_batch_norm_backward => convert_element_type_454, mul_559, mul_567, sub_94, sum_22, sum_23
triton_red_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_33 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@reduction(
size_hints=[512, 8192],
reduction_hint=ReductionHint.INNER,
filename=__file__,
meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp16', 4: '*fp32', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: '*fp32', 9: 'i32', 10: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8, 10), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_33(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, out_ptr0, out_ptr1, out_ptr2, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
xnumel = 264
rnumel = 6272
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
xmask = xindex < xnumel
rbase = tl.arange(0, RBLOCK)[None, :]
x0 = xindex
_tmp6 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
tmp9 = tl.load(in_ptr4 + (x0), xmask)
_tmp12 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
for roffset in range(0, rnumel, RBLOCK):
rindex = roffset + rbase
rmask = rindex < rnumel
r1 = rindex % 49
r2 = (rindex // 49)
tmp0 = tl.load(in_ptr0 + (r1 + (49*x0) + (12936*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp1 = tl.load(in_ptr1 + (r1 + (49*x0) + (12936*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp3 = tl.load(in_ptr2 + (r1 + (49*x0) + (12936*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp7 = tl.load(in_ptr3 + (r1 + (49*x0) + (12936*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp2 = tmp0 + tmp1
tmp4 = tmp2 + tmp3
tmp5 = tmp4.to(tl.float32)
_tmp6 = tl.where(rmask & xmask, _tmp6 + tmp5, _tmp6)
tmp8 = tmp7.to(tl.float32)
tmp10 = tmp8 - tmp9
tmp11 = tmp5 * tmp10
_tmp12 = tl.where(rmask & xmask, _tmp12 + tmp11, _tmp12)
tmp6 = tl.sum(_tmp6, 1)[:, None]
tl.store(out_ptr0 + x0, tmp6, xmask)
tmp12 = tl.sum(_tmp12, 1)[:, None]
tl.store(out_ptr1 + x0, tmp12, xmask)
tmp13 = tl.load(in_ptr5 + (x0), xmask)
tmp14 = tmp12 * tmp13
tl.store(out_ptr2 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp14, xmask)
def get_args():
arg_0 = rand_strided((128, 264, 7, 7), (12936, 49, 7, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 264, 7, 7), (12936, 49, 7, 1), device='cuda:0', dtype=torch.float16)
arg_2 = rand_strided((128, 264, 7, 7), (12936, 49, 7, 1), device='cuda:0', dtype=torch.float16)
arg_3 = rand_strided((128, 264, 7, 7), (12936, 49, 7, 1), device='cuda:0', dtype=torch.float16)
arg_4 = rand_strided((1, 264, 1, 1), (264, 1, 1, 1), device='cuda:0', dtype=torch.float32)
arg_5 = rand_strided((264,), (1,), device='cuda:0', dtype=torch.float32)
arg_6 = rand_strided((264,), (1,), device='cuda:0', dtype=torch.float32)
arg_7 = rand_strided((264,), (1,), device='cuda:0', dtype=torch.float32)
arg_8 = rand_strided((264,), (1,), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7, arg_8,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_red_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_33.run(*args, 264, 6272, grid=grid(264), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_red_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_33.benchmark_all_configs(*args, 264, 6272, grid=grid(264))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/bi/cbij3cjdqcet3nxl6uy6e6kdaxljedzlmbuwgif6yfmwghh7x42z.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.add, aten.native_batch_norm_backward
# aten._native_batch_norm_legit_functional => convert_element_type_349
# aten.add => add_308, add_313
# aten.native_batch_norm_backward => convert_element_type_454, mul_565, mul_566, sub_94, sub_96, sub_97
triton_poi_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_34 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[2097152], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp16', 4: '*fp32', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: '*fp32', 9: '*fp32', 10: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10), equal_to_1=())]})
@triton.jit
def triton_poi_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_34(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, in_ptr6, in_ptr7, in_ptr8, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 1655808
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x3 = xindex
x1 = (xindex // 49) % 264
tmp0 = tl.load(in_ptr0 + (x3), xmask).to(tl.float32)
tmp1 = tl.load(in_ptr1 + (x3), xmask).to(tl.float32)
tmp3 = tl.load(in_ptr2 + (x3), xmask).to(tl.float32)
tmp6 = tl.load(in_ptr3 + (x3), xmask).to(tl.float32)
tmp8 = tl.load(in_ptr4 + (x1), xmask)
tmp10 = tl.load(in_ptr5 + (x1), xmask)
tmp13 = tl.load(in_ptr6 + (x1), xmask)
tmp18 = tl.load(in_ptr7 + (x1), xmask)
tmp21 = tl.load(in_ptr8 + (x1), xmask)
tmp2 = tmp0 + tmp1
tmp4 = tmp2 + tmp3
tmp5 = tmp4.to(tl.float32)
tmp7 = tmp6.to(tl.float32)
tmp9 = tmp7 - tmp8
tmp11 = 0.00015943877551020407
tmp12 = tmp10 * tmp11
tmp14 = tmp13 * tmp13
tmp15 = tmp12 * tmp14
tmp16 = tmp9 * tmp15
tmp17 = tmp5 - tmp16
tmp19 = tmp18 * tmp11
tmp20 = tmp17 - tmp19
tmp22 = tmp13 * tmp21
tmp23 = tmp20 * tmp22
tl.store(out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp23, xmask)
def get_args():
arg_0 = rand_strided((128, 264, 7, 7), (12936, 49, 7, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 264, 7, 7), (12936, 49, 7, 1), device='cuda:0', dtype=torch.float16)
arg_2 = rand_strided((128, 264, 7, 7), (12936, 49, 7, 1), device='cuda:0', dtype=torch.float16)
arg_3 = rand_strided((128, 264, 7, 7), (12936, 49, 7, 1), device='cuda:0', dtype=torch.float16)
arg_4 = rand_strided((1, 264, 1, 1), (264, 1, 1, 1), device='cuda:0', dtype=torch.float32)
arg_5 = rand_strided((264,), (1,), device='cuda:0', dtype=torch.float32)
arg_6 = rand_strided((264,), (1,), device='cuda:0', dtype=torch.float32)
arg_7 = rand_strided((264,), (1,), device='cuda:0', dtype=torch.float32)
arg_8 = rand_strided((264,), (1,), device='cuda:0', dtype=torch.float32)
arg_9 = rand_strided((128, 264, 7, 7), (12936, 49, 7, 1), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7, arg_8, arg_9,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_34.run(*args, 1655808, grid=grid(1655808), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_34.benchmark_all_configs(*args, 1655808, grid=grid(1655808))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/bp/cbpqapqwkj3u7lbjggnc6sqbtdedbchks5ecsczowcbbvpcuekir.py
# Original ATen: aten.convolution_backward
# aten.convolution_backward => convolution_backward_19
triton_poi_fused_convolution_backward_35 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[1048576], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_convolution_backward_35(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 827904
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex % 6468
x1 = (xindex // 6468)
x2 = xindex
tmp0 = tl.load(in_ptr0 + (6468 + x0 + (12936*x1)), xmask)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((128, 264, 7, 7), (12936, 49, 7, 1), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((128, 132, 7, 7), (6468, 49, 7, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_convolution_backward_35.run(*args, 827904, grid=grid(827904), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_convolution_backward_35.benchmark_all_configs(*args, 827904, grid=grid(827904))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/vz/cvzmxblv4iaxi55wdmzkqdehpwn2a2c5kowaaewxyzttkqwmv7jg.py
# Original ATen: aten.convolution_backward
# aten.convolution_backward => convolution_backward_20
triton_poi_fused_convolution_backward_36 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[1048576], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_convolution_backward_36(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 827904
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex % 6468
x1 = (xindex // 6468)
x2 = xindex
tmp0 = tl.load(in_ptr0 + (x0 + (12936*x1)), xmask)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((128, 264, 7, 7), (12936, 49, 7, 1), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((128, 132, 7, 7), (6468, 49, 7, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_convolution_backward_36.run(*args, 827904, grid=grid(827904), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_convolution_backward_36.benchmark_all_configs(*args, 827904, grid=grid(827904))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/sb/csb7dsvsxbdzlofqw5ldfragxm7igefprtjmu6wizsp5shcyxiyp.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.add, aten.native_batch_norm_backward
# aten._native_batch_norm_legit_functional => convert_element_type_326
# aten.add => add_308, add_313, add_318
# aten.native_batch_norm_backward => convert_element_type_477, mul_599, mul_607, sub_110, sum_31, sum_32
triton_red_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_37 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@reduction(
size_hints=[512, 8192],
reduction_hint=ReductionHint.INNER,
filename=__file__,
meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp16', 4: '*fp16', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: '*fp32', 9: '*fp32', 10: 'i32', 11: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 11), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_37(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, in_ptr6, out_ptr0, out_ptr1, out_ptr2, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
xnumel = 264
rnumel = 6272
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
xmask = xindex < xnumel
rbase = tl.arange(0, RBLOCK)[None, :]
x0 = xindex
_tmp8 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
tmp11 = tl.load(in_ptr5 + (x0), xmask)
_tmp14 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
for roffset in range(0, rnumel, RBLOCK):
rindex = roffset + rbase
rmask = rindex < rnumel
r1 = rindex % 49
r2 = (rindex // 49)
tmp0 = tl.load(in_ptr0 + (r1 + (49*x0) + (12936*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp1 = tl.load(in_ptr1 + (r1 + (49*x0) + (12936*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp3 = tl.load(in_ptr2 + (r1 + (49*x0) + (12936*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp5 = tl.load(in_ptr3 + (r1 + (49*x0) + (12936*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp9 = tl.load(in_ptr4 + (r1 + (49*x0) + (12936*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp2 = tmp0 + tmp1
tmp4 = tmp2 + tmp3
tmp6 = tmp4 + tmp5
tmp7 = tmp6.to(tl.float32)
_tmp8 = tl.where(rmask & xmask, _tmp8 + tmp7, _tmp8)
tmp10 = tmp9.to(tl.float32)
tmp12 = tmp10 - tmp11
tmp13 = tmp7 * tmp12
_tmp14 = tl.where(rmask & xmask, _tmp14 + tmp13, _tmp14)
tmp8 = tl.sum(_tmp8, 1)[:, None]
tl.store(out_ptr0 + x0, tmp8, xmask)
tmp14 = tl.sum(_tmp14, 1)[:, None]
tl.store(out_ptr1 + x0, tmp14, xmask)
tmp15 = tl.load(in_ptr6 + (x0), xmask)
tmp16 = tmp14 * tmp15
tl.store(out_ptr2 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp16, xmask)
def get_args():
arg_0 = rand_strided((128, 264, 7, 7), (12936, 49, 7, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 264, 7, 7), (12936, 49, 7, 1), device='cuda:0', dtype=torch.float16)
arg_2 = rand_strided((128, 264, 7, 7), (12936, 49, 7, 1), device='cuda:0', dtype=torch.float16)
arg_3 = rand_strided((128, 264, 7, 7), (12936, 49, 7, 1), device='cuda:0', dtype=torch.float16)
arg_4 = rand_strided((128, 264, 7, 7), (12936, 49, 7, 1), device='cuda:0', dtype=torch.float16)
arg_5 = rand_strided((1, 264, 1, 1), (264, 1, 1, 1), device='cuda:0', dtype=torch.float32)
arg_6 = rand_strided((264,), (1,), device='cuda:0', dtype=torch.float32)
arg_7 = rand_strided((264,), (1,), device='cuda:0', dtype=torch.float32)
arg_8 = rand_strided((264,), (1,), device='cuda:0', dtype=torch.float32)
arg_9 = rand_strided((264,), (1,), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7, arg_8, arg_9,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_red_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_37.run(*args, 264, 6272, grid=grid(264), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_red_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_37.benchmark_all_configs(*args, 264, 6272, grid=grid(264))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/gx/cgx3bvwbhn4322e66zkvf7xmr6ch3vg75ylfjn7yzts64xagqr2l.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.add, aten.convolution_backward, aten.native_batch_norm_backward
# aten._native_batch_norm_legit_functional => convert_element_type_326
# aten.add => add_308, add_313, add_318
# aten.convolution_backward => convolution_backward_28
# aten.native_batch_norm_backward => convert_element_type_477, convert_element_type_479, mul_605, mul_606, sub_110, sub_112, sub_113
triton_poi_fused__native_batch_norm_legit_functional_add_convolution_backward_native_batch_norm_backward_38 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[2097152], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp16', 4: '*fp16', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: '*fp32', 9: '*fp32', 10: '*fp16', 11: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11), equal_to_1=())]})
@triton.jit
def triton_poi_fused__native_batch_norm_legit_functional_add_convolution_backward_native_batch_norm_backward_38(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, in_ptr6, in_ptr7, in_ptr8, in_ptr9, out_ptr1, xnumel, XBLOCK : tl.constexpr):
xnumel = 1655808
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x3 = xindex
x1 = (xindex // 49) % 264
tmp0 = tl.load(in_ptr0 + (x3), xmask).to(tl.float32)
tmp1 = tl.load(in_ptr1 + (x3), xmask).to(tl.float32)
tmp3 = tl.load(in_ptr2 + (x3), xmask).to(tl.float32)
tmp5 = tl.load(in_ptr3 + (x3), xmask).to(tl.float32)
tmp8 = tl.load(in_ptr4 + (x3), xmask).to(tl.float32)
tmp10 = tl.load(in_ptr5 + (x1), xmask)
tmp12 = tl.load(in_ptr6 + (x1), xmask)
tmp15 = tl.load(in_ptr7 + (x1), xmask)
tmp20 = tl.load(in_ptr8 + (x1), xmask)
tmp23 = tl.load(in_ptr9 + (x1), xmask)
tmp2 = tmp0 + tmp1
tmp4 = tmp2 + tmp3
tmp6 = tmp4 + tmp5
tmp7 = tmp6.to(tl.float32)
tmp9 = tmp8.to(tl.float32)
tmp11 = tmp9 - tmp10
tmp13 = 0.00015943877551020407
tmp14 = tmp12 * tmp13
tmp16 = tmp15 * tmp15
tmp17 = tmp14 * tmp16
tmp18 = tmp11 * tmp17
tmp19 = tmp7 - tmp18
tmp21 = tmp20 * tmp13
tmp22 = tmp19 - tmp21
tmp24 = tmp15 * tmp23
tmp25 = tmp22 * tmp24
tmp26 = tmp25.to(tl.float32)
tl.store(out_ptr1 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp26, xmask)
def get_args():
arg_0 = rand_strided((128, 264, 7, 7), (12936, 49, 7, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 264, 7, 7), (12936, 49, 7, 1), device='cuda:0', dtype=torch.float16)
arg_2 = rand_strided((128, 264, 7, 7), (12936, 49, 7, 1), device='cuda:0', dtype=torch.float16)
arg_3 = rand_strided((128, 264, 7, 7), (12936, 49, 7, 1), device='cuda:0', dtype=torch.float16)
arg_4 = rand_strided((128, 264, 7, 7), (12936, 49, 7, 1), device='cuda:0', dtype=torch.float16)
arg_5 = rand_strided((1, 264, 1, 1), (264, 1, 1, 1), device='cuda:0', dtype=torch.float32)
arg_6 = rand_strided((264,), (1,), device='cuda:0', dtype=torch.float32)
arg_7 = rand_strided((264,), (1,), device='cuda:0', dtype=torch.float32)
arg_8 = rand_strided((264,), (1,), device='cuda:0', dtype=torch.float32)
arg_9 = rand_strided((264,), (1,), device='cuda:0', dtype=torch.float32)
arg_10 = rand_strided((128, 264, 7, 7), (12936, 49, 7, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7, arg_8, arg_9, arg_10,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__native_batch_norm_legit_functional_add_convolution_backward_native_batch_norm_backward_38.run(*args, 1655808, grid=grid(1655808), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__native_batch_norm_legit_functional_add_convolution_backward_native_batch_norm_backward_38.benchmark_all_configs(*args, 1655808, grid=grid(1655808))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/hv/chvxalrqvfx3gumbgl7gg6d4lhqqg37lbadisxr7scsc3tjuvlp6.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_480
triton_poi_fused__to_copy_39 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[262144], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_39(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 253440
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((264, 960, 1, 1), (960, 1, 1, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((264, 960, 1, 1), (960, 1, 1, 1), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_39.run(*args, 253440, grid=grid(253440), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_39.benchmark_all_configs(*args, 253440, grid=grid(253440))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/4c/c4cxprfywuviqaunnoosyvctdmc2wg7xuk2qc4eawthvjfi2nh6c.py
# Original ATen: aten.mul, aten.sigmoid, aten.sigmoid_backward, aten.silu, aten.sum
# aten.mul => mul_608
# aten.sigmoid => sigmoid_51
# aten.sigmoid_backward => convert_element_type_481, convert_element_type_482, convert_element_type_483, mul_610, mul_611, sub_114
# aten.silu => convert_element_type_317, convert_element_type_318, mul_378, sigmoid_49
# aten.sum => sum_33
triton_per_fused_mul_sigmoid_sigmoid_backward_silu_sum_40 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import persistent_reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@persistent_reduction(
size_hints=[131072, 64],
reduction_hint=ReductionHint.INNER,
filename=__file__,
meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp16', 4: 'i32', 5: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4), equal_to_1=())]}
)
@triton.jit
def triton_per_fused_mul_sigmoid_sigmoid_backward_silu_sum_40(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, xnumel, rnumel, XBLOCK : tl.constexpr):
xnumel = 122880
rnumel = 49
RBLOCK: tl.constexpr = 64
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
xmask = xindex < xnumel
rindex = tl.arange(0, RBLOCK)[None, :]
rmask = rindex < rnumel
r1 = rindex
x0 = xindex
tmp0 = tl.load(in_ptr0 + (r1 + (49*x0)), rmask, other=0).to(tl.float32)
tmp1 = tl.load(in_ptr1 + (r1 + (49*x0)), rmask, other=0).to(tl.float32)
tmp11 = tl.load(in_ptr2 + (x0), None).to(tl.float32)
tmp2 = tmp1.to(tl.float32)
tmp3 = tl.sigmoid(tmp2)
tmp4 = tmp2 * tmp3
tmp5 = tmp4.to(tl.float32)
tmp6 = tmp0 * tmp5
tmp8 = tl.where(rmask, tmp6, 0)
tmp9 = tl.sum(tmp8, 1)[:, None]
tmp10 = tmp9.to(tl.float32)
tmp12 = tl.sigmoid(tmp11)
tmp13 = tmp12.to(tl.float32)
tmp14 = 1.0
tmp15 = tmp14 - tmp13
tmp16 = tmp13 * tmp15
tmp17 = tmp10 * tmp16
tmp18 = tmp17.to(tl.float32)
tl.store(in_out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp18, None)
def get_args():
arg_0 = rand_strided((128, 960, 1, 1), (960, 1, 1, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 960, 7, 7), (47040, 49, 7, 1), device='cuda:0', dtype=torch.float16)
arg_2 = rand_strided((128, 960, 7, 7), (47040, 49, 7, 1), device='cuda:0', dtype=torch.float16)
arg_3 = rand_strided((128, 960, 1, 1), (960, 1, 1, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1, arg_2, arg_3,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_per_fused_mul_sigmoid_sigmoid_backward_silu_sum_40.run(*args, 122880, 49, grid=grid(122880), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_per_fused_mul_sigmoid_sigmoid_backward_silu_sum_40.benchmark_all_configs(*args, 122880, 49, grid=grid(122880))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/c3/cc3ozjdbl56ribso2ymcx7ozxswqoswegte5cm6ndwts2vdg2nx7.py
# Original ATen: aten._to_copy, aten.convolution_backward
# aten._to_copy => convert_element_type_485
# aten.convolution_backward => sum_34
triton_per_fused__to_copy_convolution_backward_41 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import persistent_reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@persistent_reduction(
size_hints=[1024, 128],
reduction_hint=ReductionHint.INNER,
filename=__file__,
meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32', 3: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3), equal_to_1=())]}
)
@triton.jit
def triton_per_fused__to_copy_convolution_backward_41(in_ptr0, out_ptr1, xnumel, rnumel, XBLOCK : tl.constexpr):
xnumel = 960
rnumel = 128
RBLOCK: tl.constexpr = 128
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
xmask = xindex < xnumel
rindex = tl.arange(0, RBLOCK)[None, :]
rmask = rindex < rnumel
r1 = rindex
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0 + (960*r1)), rmask & xmask, other=0).to(tl.float32)
tmp2 = tl.where(rmask & xmask, tmp0, 0)
tmp3 = tl.sum(tmp2, 1)[:, None]
tmp4 = tmp3.to(tl.float32)
tl.store(out_ptr1 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp4, xmask)
def get_args():
arg_0 = rand_strided((128, 960, 1, 1), (960, 1, 1, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((960,), (1,), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_per_fused__to_copy_convolution_backward_41.run(*args, 960, 128, grid=grid(960), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_per_fused__to_copy_convolution_backward_41.benchmark_all_configs(*args, 960, 128, grid=grid(960))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/gr/cgrr3v6svqr3e2yhrucjdveyox77wsfeqkauijcny3rj2xckclad.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_484
triton_poi_fused__to_copy_42 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[131072], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_42(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 76800
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((960, 80, 1, 1), (80, 1, 1, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((960, 80, 1, 1), (80, 1, 1, 1), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_42.run(*args, 76800, grid=grid(76800), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_42.benchmark_all_configs(*args, 76800, grid=grid(76800))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/ec/cecvpjtjrr7ig7pxbkmaghimjhrgacci4o2zw2nv422yxcdjktnf.py
# Original ATen: aten.add, aten.clone, aten.fill, aten.mul, aten.sigmoid, aten.sub
# aten.add => add_319
# aten.clone => clone_38
# aten.fill => full_like_9
# aten.mul => mul_612, mul_613, mul_614
# aten.sigmoid => sigmoid_73
# aten.sub => sub_115
triton_poi_fused_add_clone_fill_mul_sigmoid_sub_43 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[16384], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_add_clone_fill_mul_sigmoid_sub_43(in_out_ptr0, in_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 10240
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_out_ptr0 + (x0), None).to(tl.float32)
tmp1 = tl.load(in_ptr0 + (x0), None).to(tl.float32)
tmp2 = tl.sigmoid(tmp1)
tmp3 = 1.0
tmp4 = tmp3 - tmp2
tmp5 = tmp1 * tmp4
tmp6 = tmp5 + tmp3
tmp7 = tmp2 * tmp6
tmp8 = tmp0 * tmp7
tl.store(in_out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp8, None)
def get_args():
arg_0 = rand_strided((128, 80, 1, 1), (80, 1, 1, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 80, 1, 1), (80, 1, 1, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_add_clone_fill_mul_sigmoid_sub_43.run(*args, 10240, grid=grid(10240), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_add_clone_fill_mul_sigmoid_sub_43.benchmark_all_configs(*args, 10240, grid=grid(10240))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/tf/ctfzhkol6r3oykhxrse2zzxlztwo7vdgfbyprm7vgxoac6npm7jy.py
# Original ATen: aten._to_copy, aten.convolution_backward
# aten._to_copy => convert_element_type_487
# aten.convolution_backward => sum_35
triton_per_fused__to_copy_convolution_backward_44 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import persistent_reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@persistent_reduction(
size_hints=[128, 128],
reduction_hint=ReductionHint.INNER,
filename=__file__,
meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32', 3: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3), equal_to_1=())]}
)
@triton.jit
def triton_per_fused__to_copy_convolution_backward_44(in_ptr0, out_ptr1, xnumel, rnumel, XBLOCK : tl.constexpr):
xnumel = 80
rnumel = 128
RBLOCK: tl.constexpr = 128
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
xmask = xindex < xnumel
rindex = tl.arange(0, RBLOCK)[None, :]
rmask = rindex < rnumel
r1 = rindex
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0 + (80*r1)), rmask & xmask, other=0).to(tl.float32)
tmp2 = tl.where(rmask & xmask, tmp0, 0)
tmp3 = tl.sum(tmp2, 1)[:, None]
tmp4 = tmp3.to(tl.float32)
tl.store(out_ptr1 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp4, xmask)
def get_args():
arg_0 = rand_strided((128, 80, 1, 1), (80, 1, 1, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((80,), (1,), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_per_fused__to_copy_convolution_backward_44.run(*args, 80, 128, grid=grid(80), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_per_fused__to_copy_convolution_backward_44.benchmark_all_configs(*args, 80, 128, grid=grid(80))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/pa/cpatc2nbbyd25gudjvjrjc54kh5oivkwx63acex4sswslv6ylzvi.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_486
triton_poi_fused__to_copy_45 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[131072], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_45(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 76800
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((80, 960, 1, 1), (960, 1, 1, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((80, 960, 1, 1), (960, 1, 1, 1), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_45.run(*args, 76800, grid=grid(76800), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_45.benchmark_all_configs(*args, 76800, grid=grid(76800))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/t2/ct2iy46nsnqymwwljjboc6oz6xdfqwrjdsjacocdhzt74zuufoj7.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.add, aten.clone, aten.div, aten.fill, aten.mul, aten.native_batch_norm_backward, aten.sigmoid, aten.sub
# aten._native_batch_norm_legit_functional => convert_element_type_315
# aten.add => add_320, add_321
# aten.clone => clone_37
# aten.div => div_4
# aten.fill => full_like_10
# aten.mul => mul_609, mul_615, mul_616, mul_617
# aten.native_batch_norm_backward => convert_element_type_488, mul_618, mul_626, sub_117, sum_36, sum_37
# aten.sigmoid => sigmoid_51, sigmoid_74
# aten.sub => sub_116
triton_red_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_46 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@reduction(
size_hints=[1024, 8192],
reduction_hint=ReductionHint.INNER,
filename=__file__,
meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp16', 4: '*fp16', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: '*fp32', 9: '*fp32', 10: 'i32', 11: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_46(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, in_ptr6, out_ptr0, out_ptr1, out_ptr2, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
xnumel = 960
rnumel = 6272
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
xmask = xindex < xnumel
rbase = tl.arange(0, RBLOCK)[None, :]
x0 = xindex
_tmp17 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
tmp20 = tl.load(in_ptr5 + (x0), xmask)
_tmp23 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
for roffset in range(0, rnumel, RBLOCK):
rindex = roffset + rbase
rmask = rindex < rnumel
r1 = rindex % 49
r2 = (rindex // 49)
tmp0 = tl.load(in_ptr0 + (r1 + (49*x0) + (47040*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp1 = tl.load(in_ptr1 + (x0 + (960*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp4 = tl.load(in_ptr2 + (x0 + (960*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp8 = tl.load(in_ptr3 + (r1 + (49*x0) + (47040*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp18 = tl.load(in_ptr4 + (r1 + (49*x0) + (47040*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp2 = tl.sigmoid(tmp1)
tmp3 = tmp0 * tmp2
tmp5 = 49.0
tmp6 = tmp4 / tmp5
tmp7 = tmp3 + tmp6
tmp9 = tl.sigmoid(tmp8)
tmp10 = 1.0
tmp11 = tmp10 - tmp9
tmp12 = tmp8 * tmp11
tmp13 = tmp12 + tmp10
tmp14 = tmp9 * tmp13
tmp15 = tmp7 * tmp14
tmp16 = tmp15.to(tl.float32)
_tmp17 = tl.where(rmask & xmask, _tmp17 + tmp16, _tmp17)
tmp19 = tmp18.to(tl.float32)
tmp21 = tmp19 - tmp20
tmp22 = tmp16 * tmp21
_tmp23 = tl.where(rmask & xmask, _tmp23 + tmp22, _tmp23)
tmp17 = tl.sum(_tmp17, 1)[:, None]
tl.store(out_ptr0 + x0, tmp17, xmask)
tmp23 = tl.sum(_tmp23, 1)[:, None]
tl.store(out_ptr1 + x0, tmp23, xmask)
tmp24 = tl.load(in_ptr6 + (x0), xmask)
tmp25 = tmp23 * tmp24
tl.store(out_ptr2 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp25, xmask)
def get_args():
arg_0 = rand_strided((128, 960, 7, 7), (47040, 49, 7, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 960, 1, 1), (960, 1, 1, 1), device='cuda:0', dtype=torch.float16)
arg_2 = rand_strided((128, 960, 1, 1), (960, 1, 1, 1), device='cuda:0', dtype=torch.float16)
arg_3 = rand_strided((128, 960, 7, 7), (47040, 49, 7, 1), device='cuda:0', dtype=torch.float16)
arg_4 = rand_strided((128, 960, 7, 7), (47040, 49, 7, 1), device='cuda:0', dtype=torch.float16)
arg_5 = rand_strided((1, 960, 1, 1), (960, 1, 1, 1), device='cuda:0', dtype=torch.float32)
arg_6 = rand_strided((960,), (1,), device='cuda:0', dtype=torch.float32)
arg_7 = rand_strided((960,), (1,), device='cuda:0', dtype=torch.float32)
arg_8 = rand_strided((960,), (1,), device='cuda:0', dtype=torch.float32)
arg_9 = rand_strided((960,), (1,), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7, arg_8, arg_9,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_red_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_46.run(*args, 960, 6272, grid=grid(960), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_red_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_46.benchmark_all_configs(*args, 960, 6272, grid=grid(960))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/hn/chn3tshreitokdet2wgwgbsbuqqplci2k63sdatoy3lq6bjeylvs.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.add, aten.clone, aten.div, aten.fill, aten.mul, aten.native_batch_norm_backward, aten.sigmoid, aten.sub
# aten._native_batch_norm_legit_functional => convert_element_type_315
# aten.add => add_320, add_321
# aten.clone => clone_37
# aten.div => div_4
# aten.fill => full_like_10
# aten.mul => mul_609, mul_615, mul_616, mul_617
# aten.native_batch_norm_backward => convert_element_type_488, mul_624, sub_117, sub_119, sub_120
# aten.sigmoid => sigmoid_51, sigmoid_74
# aten.sub => sub_116
triton_poi_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_47 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[8388608], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp16', 4: '*fp16', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: '*fp32', 9: '*fp32', 10: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10), equal_to_1=())]})
@triton.jit
def triton_poi_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_47(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, in_ptr6, in_ptr7, in_ptr8, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 6021120
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x3 = xindex
x4 = (xindex // 49)
x1 = (xindex // 49) % 960
tmp0 = tl.load(in_ptr0 + (x3), None).to(tl.float32)
tmp1 = tl.load(in_ptr1 + (x4), None).to(tl.float32)
tmp4 = tl.load(in_ptr2 + (x4), None).to(tl.float32)
tmp8 = tl.load(in_ptr3 + (x3), None).to(tl.float32)
tmp17 = tl.load(in_ptr4 + (x3), None).to(tl.float32)
tmp19 = tl.load(in_ptr5 + (x1), None)
tmp21 = tl.load(in_ptr6 + (x1), None)
tmp24 = tl.load(in_ptr7 + (x1), None)
tmp29 = tl.load(in_ptr8 + (x1), None)
tmp2 = tl.sigmoid(tmp1)
tmp3 = tmp0 * tmp2
tmp5 = 49.0
tmp6 = tmp4 / tmp5
tmp7 = tmp3 + tmp6
tmp9 = tl.sigmoid(tmp8)
tmp10 = 1.0
tmp11 = tmp10 - tmp9
tmp12 = tmp8 * tmp11
tmp13 = tmp12 + tmp10
tmp14 = tmp9 * tmp13
tmp15 = tmp7 * tmp14
tmp16 = tmp15.to(tl.float32)
tmp18 = tmp17.to(tl.float32)
tmp20 = tmp18 - tmp19
tmp22 = 0.00015943877551020407
tmp23 = tmp21 * tmp22
tmp25 = tmp24 * tmp24
tmp26 = tmp23 * tmp25
tmp27 = tmp20 * tmp26
tmp28 = tmp16 - tmp27
tmp30 = tmp29 * tmp22
tmp31 = tmp28 - tmp30
tl.store(out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp31, None)
def get_args():
arg_0 = rand_strided((128, 960, 7, 7), (47040, 49, 7, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 960, 1, 1), (960, 1, 1, 1), device='cuda:0', dtype=torch.float16)
arg_2 = rand_strided((128, 960, 1, 1), (960, 1, 1, 1), device='cuda:0', dtype=torch.float16)
arg_3 = rand_strided((128, 960, 7, 7), (47040, 49, 7, 1), device='cuda:0', dtype=torch.float16)
arg_4 = rand_strided((128, 960, 7, 7), (47040, 49, 7, 1), device='cuda:0', dtype=torch.float16)
arg_5 = rand_strided((1, 960, 1, 1), (960, 1, 1, 1), device='cuda:0', dtype=torch.float32)
arg_6 = rand_strided((960,), (1,), device='cuda:0', dtype=torch.float32)
arg_7 = rand_strided((960,), (1,), device='cuda:0', dtype=torch.float32)
arg_8 = rand_strided((960,), (1,), device='cuda:0', dtype=torch.float32)
arg_9 = rand_strided((128, 960, 7, 7), (47040, 49, 7, 1), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7, arg_8, arg_9,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_47.run(*args, 6021120, grid=grid(6021120), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_47.benchmark_all_configs(*args, 6021120, grid=grid(6021120))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/cm/ccm5w6opbpyzojqp4qntjyz3d5dzh7ybk3ty3hgcp6xmwewzvdux.py
# Original ATen: aten.convolution_backward
# aten.convolution_backward => convolution_backward_31
triton_poi_fused_convolution_backward_48 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[2097152], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp32', 2: '*fp32', 3: '*fp16', 4: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4), equal_to_1=())]})
@triton.jit
def triton_poi_fused_convolution_backward_48(in_ptr0, in_ptr1, in_ptr2, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 1505280
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x2 = (xindex // 11760)
x3 = xindex % 11760
x1 = (xindex // 49) % 240
x4 = xindex
tmp0 = tl.load(in_ptr0 + (35280 + x3 + (47040*x2)), None)
tmp1 = tl.load(in_ptr1 + (720 + x1), None)
tmp2 = tl.load(in_ptr2 + (720 + x1), None)
tmp3 = tmp1 * tmp2
tmp4 = tmp0 * tmp3
tmp5 = tmp4.to(tl.float32)
tl.store(out_ptr0 + (x4 + tl.zeros([XBLOCK], tl.int32)), tmp5, None)
def get_args():
arg_0 = rand_strided((128, 960, 7, 7), (47040, 49, 7, 1), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((960,), (1,), device='cuda:0', dtype=torch.float32)
arg_2 = rand_strided((960,), (1,), device='cuda:0', dtype=torch.float32)
arg_3 = rand_strided((128, 240, 7, 7), (11760, 49, 7, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1, arg_2, arg_3,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_convolution_backward_48.run(*args, 1505280, grid=grid(1505280), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_convolution_backward_48.benchmark_all_configs(*args, 1505280, grid=grid(1505280))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/bg/cbgklbbew275guj34lbxmgcsod4kcchr5225smzpdyhnn7zswlfw.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_491
triton_poi_fused__to_copy_49 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[32768], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_49(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 19440
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((240, 1, 9, 9), (81, 81, 9, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((240, 1, 9, 9), (81, 81, 9, 1), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_49.run(*args, 19440, grid=grid(19440), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_49.benchmark_all_configs(*args, 19440, grid=grid(19440))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/aa/caatae3nbvudrfnsq366oeojppmx457zjpvs4yumsehtdmsg23py.py
# Original ATen: aten.convolution_backward
# aten.convolution_backward => convolution_backward_32
triton_poi_fused_convolution_backward_50 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[2097152], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp32', 2: '*fp32', 3: '*fp16', 4: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4), equal_to_1=())]})
@triton.jit
def triton_poi_fused_convolution_backward_50(in_ptr0, in_ptr1, in_ptr2, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 1505280
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x2 = (xindex // 11760)
x3 = xindex % 11760
x1 = (xindex // 49) % 240
x4 = xindex
tmp0 = tl.load(in_ptr0 + (23520 + x3 + (47040*x2)), None)
tmp1 = tl.load(in_ptr1 + (480 + x1), None)
tmp2 = tl.load(in_ptr2 + (480 + x1), None)
tmp3 = tmp1 * tmp2
tmp4 = tmp0 * tmp3
tmp5 = tmp4.to(tl.float32)
tl.store(out_ptr0 + (x4 + tl.zeros([XBLOCK], tl.int32)), tmp5, None)
def get_args():
arg_0 = rand_strided((128, 960, 7, 7), (47040, 49, 7, 1), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((960,), (1,), device='cuda:0', dtype=torch.float32)
arg_2 = rand_strided((960,), (1,), device='cuda:0', dtype=torch.float32)
arg_3 = rand_strided((128, 240, 7, 7), (11760, 49, 7, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1, arg_2, arg_3,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_convolution_backward_50.run(*args, 1505280, grid=grid(1505280), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_convolution_backward_50.benchmark_all_configs(*args, 1505280, grid=grid(1505280))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/iy/ciyzljcwrx3unj6zlla5nxhjqwjz6zppzmuz7wma6tcow2pjfjva.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_492
# NOTE(review): auto-generated Inductor kernel. The triple-quoted string below is a
# complete standalone Triton module compiled lazily by async_compile.triton(); its
# bytes are the runtime payload, so it must not be hand-edited — regenerate instead.
# What it does (per the kernel body / get_args()): elementwise fp16 -> fp32 cast of
# 11760 elements, shaped (240, 1, 7, 7) — presumably a depthwise-conv weight; the
# embedded call()/benchmark_all_configs()/__main__ harness lets the string run as a
# standalone profiling script.
triton_poi_fused__to_copy_51 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[16384], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_51(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 11760
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((240, 1, 7, 7), (49, 49, 7, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((240, 1, 7, 7), (49, 49, 7, 1), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_51.run(*args, 11760, grid=grid(11760), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_51.benchmark_all_configs(*args, 11760, grid=grid(11760))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/6i/c6izu5jaho2aqwpjlvtonq5ar6inxincvvpnnrzzj3npyab2r3ga.py
# Original ATen: aten.convolution_backward
# aten.convolution_backward => convolution_backward_33
# NOTE(review): auto-generated Inductor kernel; the string below is the compiled
# payload and must stay byte-identical — do not hand-edit.
# What it does: reads the channel slice [240, 480) of a (128, 960, 7, 7) fp32 grad
# (per-batch offset 11760 = 240*49), scales each element by
# in_ptr1[240+c] * in_ptr2[240+c], and casts to fp16 into a (128, 240, 7, 7)
# output — presumably peeling one 240-channel branch of a channel-concat for its
# grouped conv backward (siblings _50 and _54 read offsets 480 and 0); confirm
# against the surrounding graph.
triton_poi_fused_convolution_backward_52 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[2097152], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp32', 2: '*fp32', 3: '*fp16', 4: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4), equal_to_1=())]})
@triton.jit
def triton_poi_fused_convolution_backward_52(in_ptr0, in_ptr1, in_ptr2, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 1505280
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x2 = (xindex // 11760)
x3 = xindex % 11760
x1 = (xindex // 49) % 240
x4 = xindex
tmp0 = tl.load(in_ptr0 + (11760 + x3 + (47040*x2)), None)
tmp1 = tl.load(in_ptr1 + (240 + x1), None)
tmp2 = tl.load(in_ptr2 + (240 + x1), None)
tmp3 = tmp1 * tmp2
tmp4 = tmp0 * tmp3
tmp5 = tmp4.to(tl.float32)
tl.store(out_ptr0 + (x4 + tl.zeros([XBLOCK], tl.int32)), tmp5, None)
def get_args():
arg_0 = rand_strided((128, 960, 7, 7), (47040, 49, 7, 1), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((960,), (1,), device='cuda:0', dtype=torch.float32)
arg_2 = rand_strided((960,), (1,), device='cuda:0', dtype=torch.float32)
arg_3 = rand_strided((128, 240, 7, 7), (11760, 49, 7, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1, arg_2, arg_3,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_convolution_backward_52.run(*args, 1505280, grid=grid(1505280), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_convolution_backward_52.benchmark_all_configs(*args, 1505280, grid=grid(1505280))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/cq/ccq24kfa5wdpfsiaanwmmkcev4g2dsxwzynkl2fv553qth6ixrfz.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_493
# NOTE(review): auto-generated Inductor kernel; the string below is the compiled
# payload and must stay byte-identical — do not hand-edit.
# What it does: elementwise fp16 -> fp32 cast of 6000 elements, shaped
# (240, 1, 5, 5) per get_args() — presumably a depthwise-conv weight.
triton_poi_fused__to_copy_53 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[8192], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_53(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 6000
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((240, 1, 5, 5), (25, 25, 5, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((240, 1, 5, 5), (25, 25, 5, 1), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_53.run(*args, 6000, grid=grid(6000), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_53.benchmark_all_configs(*args, 6000, grid=grid(6000))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/iu/ciu2jjei5ptmwjfdzudopltuvgs5jt6yeve7jcdvodi4ozwv2j6u.py
# Original ATen: aten.convolution_backward
# aten.convolution_backward => convolution_backward_34
# NOTE(review): auto-generated Inductor kernel; the string below is the compiled
# payload and must stay byte-identical — do not hand-edit.
# What it does: reads the channel slice [0, 240) of a (128, 960, 7, 7) fp32 grad,
# scales each element by in_ptr1[c] * in_ptr2[c], and casts to fp16 into a
# (128, 240, 7, 7) output — the offset-0 sibling of kernels _50 / _52 above
# (offsets 480 and 240 respectively), presumably one branch of a channel split.
triton_poi_fused_convolution_backward_54 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[2097152], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp32', 2: '*fp32', 3: '*fp16', 4: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4), equal_to_1=())]})
@triton.jit
def triton_poi_fused_convolution_backward_54(in_ptr0, in_ptr1, in_ptr2, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 1505280
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x2 = (xindex // 11760)
x3 = xindex % 11760
x1 = (xindex // 49) % 240
x4 = xindex
tmp0 = tl.load(in_ptr0 + (x3 + (47040*x2)), None)
tmp1 = tl.load(in_ptr1 + (x1), None)
tmp2 = tl.load(in_ptr2 + (x1), None)
tmp3 = tmp1 * tmp2
tmp4 = tmp0 * tmp3
tmp5 = tmp4.to(tl.float32)
tl.store(out_ptr0 + (x4 + tl.zeros([XBLOCK], tl.int32)), tmp5, None)
def get_args():
arg_0 = rand_strided((128, 960, 7, 7), (47040, 49, 7, 1), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((960,), (1,), device='cuda:0', dtype=torch.float32)
arg_2 = rand_strided((960,), (1,), device='cuda:0', dtype=torch.float32)
arg_3 = rand_strided((128, 240, 7, 7), (11760, 49, 7, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1, arg_2, arg_3,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_convolution_backward_54.run(*args, 1505280, grid=grid(1505280), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_convolution_backward_54.benchmark_all_configs(*args, 1505280, grid=grid(1505280))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/vw/cvw5bsibqyft5fe6qyposdjgeixpccxa5ewo6nsyrtdsomfikzwt.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_494
# NOTE(review): auto-generated Inductor kernel; the string below is the compiled
# payload and must stay byte-identical — do not hand-edit.
# What it does: elementwise fp16 -> fp32 cast of 2160 elements, shaped
# (240, 1, 3, 3) per get_args() — presumably a depthwise-conv weight.
triton_poi_fused__to_copy_55 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4096], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_55(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 2160
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((240, 1, 3, 3), (9, 9, 3, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((240, 1, 3, 3), (9, 9, 3, 1), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_55.run(*args, 2160, grid=grid(2160), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_55.benchmark_all_configs(*args, 2160, grid=grid(2160))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/zd/czdohgrrggyfbylj276pqwjy7zlv5b2cu55bzfn6nk26nmdmxmon.py
# Original ATen: aten.cat
# aten.cat => cat_47
# NOTE(review): auto-generated Inductor kernel; the string below is the compiled
# payload and must stay byte-identical — do not hand-edit.
# What it does: copies a contiguous (128, 240, 14, 14) fp16 tensor into an output
# whose per-batch stride is 188160 (= 960*196), i.e. writes elements 0..47039 of
# each batch's 960-channel block — one input branch of a channel concat; the other
# branches presumably fill offsets 47040+ via sibling kernels.
triton_poi_fused_cat_56 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[8388608], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_cat_56(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 6021120
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x2 = xindex
x0 = xindex % 47040
x1 = (xindex // 47040)
tmp0 = tl.load(in_ptr0 + (x2), None).to(tl.float32)
tl.store(out_ptr0 + (x0 + (188160*x1) + tl.zeros([XBLOCK], tl.int32)), tmp0, None)
def get_args():
arg_0 = rand_strided((128, 240, 14, 14), (47040, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 240, 14, 14), (188160, 196, 14, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_cat_56.run(*args, 6021120, grid=grid(6021120), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_cat_56.benchmark_all_configs(*args, 6021120, grid=grid(6021120))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/6g/c6guclv6h7xk7kazbyjy6st57pkopg5lzsmit37ifpr7d46xtfho.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.mul, aten.native_batch_norm_backward
# aten._native_batch_norm_legit_functional => convert_element_type_307
# aten.mul => mul_629
# aten.native_batch_norm_backward => convert_element_type_495, mul_630, mul_638, sub_122, sum_38, sum_39
# NOTE(review): auto-generated Inductor kernel; the string below is the compiled
# payload and must stay byte-identical — do not hand-edit.
# What it does: for each of 960 channels, reduces over 25088 = 128*196 elements of
# (128, 960, 14, 14) fp16 inputs, producing
#   out_ptr0[c] = sum(in0 * in1)                       (grad contribution)
#   out_ptr1[c] = sum(in0 * in1 * (in2 - in_ptr3[c]))  (centered contribution)
#   out_ptr2[c] = out_ptr1[c] * in_ptr4[c]
# in_ptr3 is (1, 960, 1, 1) — presumably the saved batch mean — and in_ptr4 a
# 960-vector, presumably the saved invstd; confirm against the graph.
triton_red_fused__native_batch_norm_legit_functional_mul_native_batch_norm_backward_57 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@reduction(
size_hints=[1024, 32768],
reduction_hint=ReductionHint.INNER,
filename=__file__,
meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp32', 4: '*fp32', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: 'i32', 9: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_mul_native_batch_norm_backward_57(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, out_ptr0, out_ptr1, out_ptr2, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
xnumel = 960
rnumel = 25088
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
xmask = xindex < xnumel
rbase = tl.arange(0, RBLOCK)[None, :]
x0 = xindex
_tmp4 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
tmp7 = tl.load(in_ptr3 + (x0), xmask)
_tmp10 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
for roffset in range(0, rnumel, RBLOCK):
rindex = roffset + rbase
rmask = rindex < rnumel
r1 = rindex % 196
r2 = (rindex // 196)
tmp0 = tl.load(in_ptr0 + (r1 + (196*x0) + (188160*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp1 = tl.load(in_ptr1 + (r1 + (196*x0) + (188160*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp5 = tl.load(in_ptr2 + (r1 + (196*x0) + (188160*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp2 = tmp0 * tmp1
tmp3 = tmp2.to(tl.float32)
_tmp4 = tl.where(rmask & xmask, _tmp4 + tmp3, _tmp4)
tmp6 = tmp5.to(tl.float32)
tmp8 = tmp6 - tmp7
tmp9 = tmp3 * tmp8
_tmp10 = tl.where(rmask & xmask, _tmp10 + tmp9, _tmp10)
tmp4 = tl.sum(_tmp4, 1)[:, None]
tl.store(out_ptr0 + x0, tmp4, xmask)
tmp10 = tl.sum(_tmp10, 1)[:, None]
tl.store(out_ptr1 + x0, tmp10, xmask)
tmp11 = tl.load(in_ptr4 + (x0), xmask)
tmp12 = tmp10 * tmp11
tl.store(out_ptr2 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp12, xmask)
def get_args():
arg_0 = rand_strided((128, 960, 14, 14), (188160, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 960, 14, 14), (188160, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_2 = rand_strided((128, 960, 14, 14), (188160, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_3 = rand_strided((1, 960, 1, 1), (960, 1, 1, 1), device='cuda:0', dtype=torch.float32)
arg_4 = rand_strided((960,), (1,), device='cuda:0', dtype=torch.float32)
arg_5 = rand_strided((960,), (1,), device='cuda:0', dtype=torch.float32)
arg_6 = rand_strided((960,), (1,), device='cuda:0', dtype=torch.float32)
arg_7 = rand_strided((960,), (1,), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_red_fused__native_batch_norm_legit_functional_mul_native_batch_norm_backward_57.run(*args, 960, 25088, grid=grid(960), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_red_fused__native_batch_norm_legit_functional_mul_native_batch_norm_backward_57.benchmark_all_configs(*args, 960, 25088, grid=grid(960))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/kq/ckqzmpbl3qlmroyipqv5z46lwldyab7axrjybp23plftx2joawdv.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.convolution_backward, aten.mul, aten.native_batch_norm_backward
# aten._native_batch_norm_legit_functional => convert_element_type_307
# aten.convolution_backward => convolution_backward_35
# aten.mul => mul_629
# aten.native_batch_norm_backward => convert_element_type_495, convert_element_type_497, mul_636, mul_637, sub_122, sub_124, sub_125
# NOTE(review): auto-generated Inductor kernel; the string below is the compiled
# payload and must stay byte-identical — do not hand-edit.
# What it does: pointwise batch-norm backward input-gradient over a
# (128, 960, 14, 14) fp16 tensor, IN PLACE on in_out_ptr0 (note
# mutated_arg_names=['in_out_ptr0']). Per channel x1 it computes
#   grad_in = ((g*mul) - sum_term*(x-mean)*invstd^2/N - sum_grad/N) * invstd*weight
# where the literal 3.985969387755102e-05 is 1/25088 (= 1/(128*14*14), the
# per-channel element count). in_ptr2..in_ptr6 are the per-channel stats produced
# by the _57 reduction above — presumably mean/sum/invstd/grad-sum/weight; verify
# against the caller's argument order.
triton_poi_fused__native_batch_norm_legit_functional_convolution_backward_mul_native_batch_norm_backward_58 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[33554432], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp32', 4: '*fp32', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8), equal_to_1=())]})
@triton.jit
def triton_poi_fused__native_batch_norm_legit_functional_convolution_backward_mul_native_batch_norm_backward_58(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, in_ptr6, xnumel, XBLOCK : tl.constexpr):
xnumel = 24084480
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x3 = xindex
x1 = (xindex // 196) % 960
tmp0 = tl.load(in_out_ptr0 + (x3), None).to(tl.float32)
tmp1 = tl.load(in_ptr0 + (x3), None).to(tl.float32)
tmp4 = tl.load(in_ptr1 + (x3), None).to(tl.float32)
tmp6 = tl.load(in_ptr2 + (x1), None)
tmp8 = tl.load(in_ptr3 + (x1), None)
tmp11 = tl.load(in_ptr4 + (x1), None)
tmp16 = tl.load(in_ptr5 + (x1), None)
tmp19 = tl.load(in_ptr6 + (x1), None)
tmp2 = tmp0 * tmp1
tmp3 = tmp2.to(tl.float32)
tmp5 = tmp4.to(tl.float32)
tmp7 = tmp5 - tmp6
tmp9 = 3.985969387755102e-05
tmp10 = tmp8 * tmp9
tmp12 = tmp11 * tmp11
tmp13 = tmp10 * tmp12
tmp14 = tmp7 * tmp13
tmp15 = tmp3 - tmp14
tmp17 = tmp16 * tmp9
tmp18 = tmp15 - tmp17
tmp20 = tmp11 * tmp19
tmp21 = tmp18 * tmp20
tmp22 = tmp21.to(tl.float32)
tl.store(in_out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp22, None)
def get_args():
arg_0 = rand_strided((128, 960, 14, 14), (188160, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 960, 14, 14), (188160, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_2 = rand_strided((128, 960, 14, 14), (188160, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_3 = rand_strided((1, 960, 1, 1), (960, 1, 1, 1), device='cuda:0', dtype=torch.float32)
arg_4 = rand_strided((960,), (1,), device='cuda:0', dtype=torch.float32)
arg_5 = rand_strided((960,), (1,), device='cuda:0', dtype=torch.float32)
arg_6 = rand_strided((960,), (1,), device='cuda:0', dtype=torch.float32)
arg_7 = rand_strided((960,), (1,), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__native_batch_norm_legit_functional_convolution_backward_mul_native_batch_norm_backward_58.run(*args, 24084480, grid=grid(24084480), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__native_batch_norm_legit_functional_convolution_backward_mul_native_batch_norm_backward_58.benchmark_all_configs(*args, 24084480, grid=grid(24084480))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/v5/cv5qx4oyij3hkwbqtmi4skgsidu4qp4eyvxkday5hyy3p7hf74af.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_498
# NOTE(review): auto-generated Inductor kernel; the string below is the compiled
# payload and must stay byte-identical — do not hand-edit.
# What it does: elementwise fp16 -> fp32 cast of 153600 elements, shaped
# (960, 160, 1, 1) per get_args() — presumably a 1x1 pointwise-conv weight.
triton_poi_fused__to_copy_59 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[262144], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_59(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 153600
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), None).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, None)
def get_args():
arg_0 = rand_strided((960, 160, 1, 1), (160, 1, 1, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((960, 160, 1, 1), (160, 1, 1, 1), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_59.run(*args, 153600, grid=grid(153600), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_59.benchmark_all_configs(*args, 153600, grid=grid(153600))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/vo/cvowglnuesyuof2jgfmeotsjew6qfwzd2dbspy34jb56dgnewazc.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.native_batch_norm_backward
# aten._native_batch_norm_legit_functional => convert_element_type_304
# aten.native_batch_norm_backward => convert_element_type_499, mul_639, sub_126, sum_40, sum_41
# NOTE(review): auto-generated Inductor kernel; the string below is the compiled
# payload and must stay byte-identical — do not hand-edit.
# What it does: first stage of a split batch-norm-backward reduction over a
# (128, 160, 14, 14) fp16 pair. The 128-sample batch is split into 4 chunks of 32
# (1003520 = 32*31360); xindex encodes (channel x0 in [0,160), chunk x1 in [0,4)),
# each reducing 6272 = 32*196 elements into (160, 4) partials:
#   out_ptr0[c, s] = sum(grad)
#   out_ptr1[c, s] = sum(grad * (x - in_ptr2[c]))   # in_ptr2 presumably saved mean
# Kernels _61 / _62 below fold the 4 partials per channel into final sums.
triton_red_fused__native_batch_norm_legit_functional_native_batch_norm_backward_60 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@reduction(
size_hints=[1024, 8192],
reduction_hint=ReductionHint.INNER,
filename=__file__,
meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: 'i32', 6: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_native_batch_norm_backward_60(in_ptr0, in_ptr1, in_ptr2, out_ptr0, out_ptr1, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
xnumel = 640
rnumel = 6272
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
xmask = xindex < xnumel
rbase = tl.arange(0, RBLOCK)[None, :]
x0 = xindex % 160
x1 = (xindex // 160)
_tmp2 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
x3 = xindex
tmp5 = tl.load(in_ptr2 + (x0), xmask)
_tmp8 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
for roffset in range(0, rnumel, RBLOCK):
rindex = roffset + rbase
rmask = rindex < rnumel
r2 = rindex
tmp0 = tl.load(in_ptr0 + ((196*x0) + (31360*(r2 // 196)) + (1003520*x1) + (r2 % 196)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp3 = tl.load(in_ptr1 + ((196*x0) + (31360*(r2 // 196)) + (1003520*x1) + (r2 % 196)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
_tmp2 = tl.where(rmask & xmask, _tmp2 + tmp1, _tmp2)
tmp4 = tmp3.to(tl.float32)
tmp6 = tmp4 - tmp5
tmp7 = tmp1 * tmp6
_tmp8 = tl.where(rmask & xmask, _tmp8 + tmp7, _tmp8)
tmp2 = tl.sum(_tmp2, 1)[:, None]
tl.store(out_ptr0 + x3, tmp2, xmask)
tmp8 = tl.sum(_tmp8, 1)[:, None]
tl.store(out_ptr1 + x3, tmp8, xmask)
def get_args():
arg_0 = rand_strided((128, 160, 14, 14), (31360, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 160, 14, 14), (31360, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_2 = rand_strided((1, 160, 1, 1), (160, 1, 1, 1), device='cuda:0', dtype=torch.float32)
arg_3 = rand_strided((160, 4), (1, 160), device='cuda:0', dtype=torch.float32)
arg_4 = rand_strided((160, 4), (1, 160), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1, arg_2, arg_3, arg_4,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_red_fused__native_batch_norm_legit_functional_native_batch_norm_backward_60.run(*args, 640, 6272, grid=grid(640), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_red_fused__native_batch_norm_legit_functional_native_batch_norm_backward_60.benchmark_all_configs(*args, 640, 6272, grid=grid(640))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/v7/cv7kgtxh5cmxf5wfqyzhdvjjtikzk2pxpebuocdoaffoo64kkmaj.py
# Original ATen: aten.native_batch_norm_backward
# aten.native_batch_norm_backward => convert_element_type_499, sum_40
# NOTE(review): auto-generated Inductor kernel; the string below is the compiled
# payload and must stay byte-identical — do not hand-edit.
# What it does: second stage of the split reduction started by kernel _60 above —
# for each of 160 channels, sums the 4 fp32 partials of a (160, 4) buffer
# (stride (1, 160)) into a single (160,) total, in one persistent-reduction pass.
triton_per_fused_native_batch_norm_backward_61 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import persistent_reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@persistent_reduction(
size_hints=[256, 4],
reduction_hint=ReductionHint.INNER,
filename=__file__,
meta={'signature': {0: '*fp32', 1: '*fp32', 2: 'i32', 3: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]}
)
@triton.jit
def triton_per_fused_native_batch_norm_backward_61(in_ptr0, out_ptr0, xnumel, rnumel, XBLOCK : tl.constexpr):
xnumel = 160
rnumel = 4
RBLOCK: tl.constexpr = 4
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
xmask = xindex < xnumel
rindex = tl.arange(0, RBLOCK)[None, :]
rmask = rindex < rnumel
r1 = rindex
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0 + (160*r1)), rmask & xmask, other=0)
tmp2 = tl.where(rmask & xmask, tmp0, 0)
tmp3 = tl.sum(tmp2, 1)[:, None]
tl.store(out_ptr0 + x0, tmp3, xmask)
def get_args():
arg_0 = rand_strided((160, 4), (1, 160), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((160,), (1,), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_per_fused_native_batch_norm_backward_61.run(*args, 160, 4, grid=grid(160), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_per_fused_native_batch_norm_backward_61.benchmark_all_configs(*args, 160, 4, grid=grid(160))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/ie/cie5izdtlxlumenajauzzkl6hk7z6q5jgsvyedwv7n3ohhf6bq2s.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.native_batch_norm_backward
# aten._native_batch_norm_legit_functional => convert_element_type_304
# aten.native_batch_norm_backward => convert_element_type_499, mul_639, mul_647, sub_126, sum_41
triton_per_fused__native_batch_norm_legit_functional_native_batch_norm_backward_62 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import persistent_reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@persistent_reduction(
size_hints=[256, 4],
reduction_hint=ReductionHint.INNER,
filename=__file__,
meta={'signature': {0: '*fp32', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: 'i32', 5: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4), equal_to_1=())]}
)
@triton.jit
def triton_per_fused__native_batch_norm_legit_functional_native_batch_norm_backward_62(in_ptr0, in_ptr1, out_ptr0, out_ptr1, xnumel, rnumel, XBLOCK : tl.constexpr):
xnumel = 160
rnumel = 4
RBLOCK: tl.constexpr = 4
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
xmask = xindex < xnumel
rindex = tl.arange(0, RBLOCK)[None, :]
rmask = rindex < rnumel
r1 = rindex
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0 + (160*r1)), rmask & xmask, other=0)
tmp4 = tl.load(in_ptr1 + (x0), xmask)
tmp2 = tl.where(rmask & xmask, tmp0, 0)
tmp3 = tl.sum(tmp2, 1)[:, None]
tmp5 = tmp3 * tmp4
tl.store(out_ptr1 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp5, xmask)
tl.store(out_ptr0 + x0, tmp3, xmask)
def get_args():
arg_0 = rand_strided((160, 4), (1, 160), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((160,), (1,), device='cuda:0', dtype=torch.float32)
arg_2 = rand_strided((160,), (1,), device='cuda:0', dtype=torch.float32)
arg_3 = rand_strided((160,), (1,), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1, arg_2, arg_3,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_per_fused__native_batch_norm_legit_functional_native_batch_norm_backward_62.run(*args, 160, 4, grid=grid(160), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_per_fused__native_batch_norm_legit_functional_native_batch_norm_backward_62.benchmark_all_configs(*args, 160, 4, grid=grid(160))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/lu/clucluq3crlm7gshoftvwdbylf2hruep5zy2kdudpczi5ai6bwrm.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.native_batch_norm_backward
# aten._native_batch_norm_legit_functional => convert_element_type_304
# aten.native_batch_norm_backward => convert_element_type_499, convert_element_type_501, mul_645, mul_646, sub_126, sub_128, sub_129
triton_poi_fused__native_batch_norm_legit_functional_native_batch_norm_backward_63 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp32', 6: '*fp32', 7: '*fp16', 8: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8), equal_to_1=())]})
@triton.jit
def triton_poi_fused__native_batch_norm_legit_functional_native_batch_norm_backward_63(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, in_ptr6, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 4014080
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x3 = xindex
x1 = (xindex // 196) % 160
tmp0 = tl.load(in_ptr0 + (x3), None).to(tl.float32)
tmp2 = tl.load(in_ptr1 + (x3), None).to(tl.float32)
tmp4 = tl.load(in_ptr2 + (x1), None)
tmp6 = tl.load(in_ptr3 + (x1), None)
tmp9 = tl.load(in_ptr4 + (x1), None)
tmp14 = tl.load(in_ptr5 + (x1), None)
tmp17 = tl.load(in_ptr6 + (x1), None)
tmp1 = tmp0.to(tl.float32)
tmp3 = tmp2.to(tl.float32)
tmp5 = tmp3 - tmp4
tmp7 = 3.985969387755102e-05
tmp8 = tmp6 * tmp7
tmp10 = tmp9 * tmp9
tmp11 = tmp8 * tmp10
tmp12 = tmp5 * tmp11
tmp13 = tmp1 - tmp12
tmp15 = tmp14 * tmp7
tmp16 = tmp13 - tmp15
tmp18 = tmp9 * tmp17
tmp19 = tmp16 * tmp18
tmp20 = tmp19.to(tl.float32)
tl.store(out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp20, None)
def get_args():
arg_0 = rand_strided((128, 160, 14, 14), (31360, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 160, 14, 14), (31360, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_2 = rand_strided((1, 160, 1, 1), (160, 1, 1, 1), device='cuda:0', dtype=torch.float32)
arg_3 = rand_strided((160,), (1,), device='cuda:0', dtype=torch.float32)
arg_4 = rand_strided((160,), (1,), device='cuda:0', dtype=torch.float32)
arg_5 = rand_strided((160,), (1,), device='cuda:0', dtype=torch.float32)
arg_6 = rand_strided((160,), (1,), device='cuda:0', dtype=torch.float32)
arg_7 = rand_strided((128, 160, 14, 14), (31360, 196, 14, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__native_batch_norm_legit_functional_native_batch_norm_backward_63.run(*args, 4014080, grid=grid(4014080), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__native_batch_norm_legit_functional_native_batch_norm_backward_63.benchmark_all_configs(*args, 4014080, grid=grid(4014080))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/u6/cu6t7jgofejrpr6jiujpu5wwc5mlffc5k4hwyiqtb7tiyjxa6ec4.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_502
triton_poi_fused__to_copy_64 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[32768], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_64(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 19200
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((80, 240, 1, 1), (240, 1, 1, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((80, 240, 1, 1), (240, 1, 1, 1), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_64.run(*args, 19200, grid=grid(19200), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_64.benchmark_all_configs(*args, 19200, grid=grid(19200))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/rj/crjqpfvxgdlaoka6zisl4dxwjeaxw453cmtsf23px34uebd6u4oo.py
# Original ATen: aten.cat
# aten.cat => cat_48
triton_poi_fused_cat_65 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[8388608], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_cat_65(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 6021120
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x2 = xindex
x0 = xindex % 47040
x1 = (xindex // 47040)
tmp0 = tl.load(in_ptr0 + (x2), None).to(tl.float32)
tl.store(out_ptr0 + (x0 + (94080*x1) + tl.zeros([XBLOCK], tl.int32)), tmp0, None)
def get_args():
arg_0 = rand_strided((128, 240, 14, 14), (47040, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 240, 14, 14), (94080, 196, 14, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_cat_65.run(*args, 6021120, grid=grid(6021120), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_cat_65.benchmark_all_configs(*args, 6021120, grid=grid(6021120))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/rc/crc555gyfmwt6m3oebfawzf77swacndeysz6litdvgmtz7aw3cnb.py
# Original ATen: aten.mul, aten.sigmoid, aten.sigmoid_backward, aten.silu, aten.sum
# aten.mul => mul_648
# aten.sigmoid => sigmoid_47
# aten.sigmoid_backward => convert_element_type_504, convert_element_type_505, convert_element_type_506, mul_650, mul_651, sub_130
# aten.silu => convert_element_type_294, convert_element_type_295, mul_353, sigmoid_45
# aten.sum => sum_42
triton_per_fused_mul_sigmoid_sigmoid_backward_silu_sum_66 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import persistent_reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@persistent_reduction(
size_hints=[65536, 256],
reduction_hint=ReductionHint.INNER,
filename=__file__,
meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp16', 4: 'i32', 5: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4), equal_to_1=())]}
)
@triton.jit
def triton_per_fused_mul_sigmoid_sigmoid_backward_silu_sum_66(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, xnumel, rnumel, XBLOCK : tl.constexpr):
xnumel = 61440
rnumel = 196
RBLOCK: tl.constexpr = 256
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
xmask = xindex < xnumel
rindex = tl.arange(0, RBLOCK)[None, :]
rmask = rindex < rnumel
r1 = rindex
x0 = xindex
tmp0 = tl.load(in_ptr0 + (r1 + (196*x0)), rmask, other=0).to(tl.float32)
tmp1 = tl.load(in_ptr1 + (r1 + (196*x0)), rmask, other=0).to(tl.float32)
tmp11 = tl.load(in_ptr2 + (x0), None).to(tl.float32)
tmp2 = tmp1.to(tl.float32)
tmp3 = tl.sigmoid(tmp2)
tmp4 = tmp2 * tmp3
tmp5 = tmp4.to(tl.float32)
tmp6 = tmp0 * tmp5
tmp8 = tl.where(rmask, tmp6, 0)
tmp9 = tl.sum(tmp8, 1)[:, None]
tmp10 = tmp9.to(tl.float32)
tmp12 = tl.sigmoid(tmp11)
tmp13 = tmp12.to(tl.float32)
tmp14 = 1.0
tmp15 = tmp14 - tmp13
tmp16 = tmp13 * tmp15
tmp17 = tmp10 * tmp16
tmp18 = tmp17.to(tl.float32)
tl.store(in_out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp18, None)
def get_args():
arg_0 = rand_strided((128, 480, 1, 1), (480, 1, 1, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 480, 14, 14), (94080, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_2 = rand_strided((128, 480, 14, 14), (94080, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_3 = rand_strided((128, 480, 1, 1), (480, 1, 1, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1, arg_2, arg_3,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_per_fused_mul_sigmoid_sigmoid_backward_silu_sum_66.run(*args, 61440, 196, grid=grid(61440), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_per_fused_mul_sigmoid_sigmoid_backward_silu_sum_66.benchmark_all_configs(*args, 61440, 196, grid=grid(61440))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/n4/cn4fg2jpjbb2zgpwsyrzkkvrz4ql4zzxfplbwfmfeem7estzylry.py
# Original ATen: aten._to_copy, aten.convolution_backward
# aten._to_copy => convert_element_type_508
# aten.convolution_backward => sum_43
triton_per_fused__to_copy_convolution_backward_67 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import persistent_reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@persistent_reduction(
size_hints=[512, 128],
reduction_hint=ReductionHint.INNER,
filename=__file__,
meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32', 3: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3), equal_to_1=())]}
)
@triton.jit
def triton_per_fused__to_copy_convolution_backward_67(in_ptr0, out_ptr1, xnumel, rnumel, XBLOCK : tl.constexpr):
xnumel = 480
rnumel = 128
RBLOCK: tl.constexpr = 128
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
xmask = xindex < xnumel
rindex = tl.arange(0, RBLOCK)[None, :]
rmask = rindex < rnumel
r1 = rindex
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0 + (480*r1)), rmask & xmask, other=0).to(tl.float32)
tmp2 = tl.where(rmask & xmask, tmp0, 0)
tmp3 = tl.sum(tmp2, 1)[:, None]
tmp4 = tmp3.to(tl.float32)
tl.store(out_ptr1 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp4, xmask)
def get_args():
arg_0 = rand_strided((128, 480, 1, 1), (480, 1, 1, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((480,), (1,), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_per_fused__to_copy_convolution_backward_67.run(*args, 480, 128, grid=grid(480), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_per_fused__to_copy_convolution_backward_67.benchmark_all_configs(*args, 480, 128, grid=grid(480))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/gx/cgx4he25cg6tmu62uuz3hbvmsgstxlvguwwpdavmmk5v7cb53khs.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_507
triton_poi_fused__to_copy_68 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[65536], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_68(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 38400
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((480, 80, 1, 1), (80, 1, 1, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((480, 80, 1, 1), (80, 1, 1, 1), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_68.run(*args, 38400, grid=grid(38400), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_68.benchmark_all_configs(*args, 38400, grid=grid(38400))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/57/c57nn4lxfd7xfnwbsod5xu57xcqkrswf7vdbyz76bditl462irc4.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_509
triton_poi_fused__to_copy_69 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[65536], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_69(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 38400
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((80, 480, 1, 1), (480, 1, 1, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((80, 480, 1, 1), (480, 1, 1, 1), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_69.run(*args, 38400, grid=grid(38400), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_69.benchmark_all_configs(*args, 38400, grid=grid(38400))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/5q/c5qqp6x25kopkdsx6gj3r2vuos77mpwiiyja2ob3sfsgwvxevysq.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.add, aten.clone, aten.div, aten.fill, aten.mul, aten.native_batch_norm_backward, aten.sigmoid, aten.sub
# aten._native_batch_norm_legit_functional => convert_element_type_292
# aten.add => add_324, add_325
# aten.clone => clone_34
# aten.div => div_5
# aten.fill => full_like_13
# aten.mul => mul_649, mul_655, mul_656, mul_657
# aten.native_batch_norm_backward => convert_element_type_511, mul_658, mul_666, sub_133, sum_45, sum_46
# aten.sigmoid => sigmoid_47, sigmoid_77
# aten.sub => sub_132
triton_red_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_70 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@reduction(
size_hints=[512, 32768],
reduction_hint=ReductionHint.INNER,
filename=__file__,
meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp16', 4: '*fp16', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: '*fp32', 9: '*fp32', 10: 'i32', 11: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_70(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, in_ptr6, out_ptr0, out_ptr1, out_ptr2, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
xnumel = 480
rnumel = 25088
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
xmask = xindex < xnumel
rbase = tl.arange(0, RBLOCK)[None, :]
x0 = xindex
_tmp17 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
tmp20 = tl.load(in_ptr5 + (x0), xmask)
_tmp23 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
for roffset in range(0, rnumel, RBLOCK):
rindex = roffset + rbase
rmask = rindex < rnumel
r1 = rindex % 196
r2 = (rindex // 196)
tmp0 = tl.load(in_ptr0 + (r1 + (196*x0) + (94080*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp1 = tl.load(in_ptr1 + (x0 + (480*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp4 = tl.load(in_ptr2 + (x0 + (480*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp8 = tl.load(in_ptr3 + (r1 + (196*x0) + (94080*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp18 = tl.load(in_ptr4 + (r1 + (196*x0) + (94080*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp2 = tl.sigmoid(tmp1)
tmp3 = tmp0 * tmp2
tmp5 = 196.0
tmp6 = tmp4 / tmp5
tmp7 = tmp3 + tmp6
tmp9 = tl.sigmoid(tmp8)
tmp10 = 1.0
tmp11 = tmp10 - tmp9
tmp12 = tmp8 * tmp11
tmp13 = tmp12 + tmp10
tmp14 = tmp9 * tmp13
tmp15 = tmp7 * tmp14
tmp16 = tmp15.to(tl.float32)
_tmp17 = tl.where(rmask & xmask, _tmp17 + tmp16, _tmp17)
tmp19 = tmp18.to(tl.float32)
tmp21 = tmp19 - tmp20
tmp22 = tmp16 * tmp21
_tmp23 = tl.where(rmask & xmask, _tmp23 + tmp22, _tmp23)
tmp17 = tl.sum(_tmp17, 1)[:, None]
tl.store(out_ptr0 + x0, tmp17, xmask)
tmp23 = tl.sum(_tmp23, 1)[:, None]
tl.store(out_ptr1 + x0, tmp23, xmask)
tmp24 = tl.load(in_ptr6 + (x0), xmask)
tmp25 = tmp23 * tmp24
tl.store(out_ptr2 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp25, xmask)
def get_args():
arg_0 = rand_strided((128, 480, 14, 14), (94080, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 480, 1, 1), (480, 1, 1, 1), device='cuda:0', dtype=torch.float16)
arg_2 = rand_strided((128, 480, 1, 1), (480, 1, 1, 1), device='cuda:0', dtype=torch.float16)
arg_3 = rand_strided((128, 480, 14, 14), (94080, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_4 = rand_strided((128, 480, 14, 14), (94080, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_5 = rand_strided((1, 480, 1, 1), (480, 1, 1, 1), device='cuda:0', dtype=torch.float32)
arg_6 = rand_strided((480,), (1,), device='cuda:0', dtype=torch.float32)
arg_7 = rand_strided((480,), (1,), device='cuda:0', dtype=torch.float32)
arg_8 = rand_strided((480,), (1,), device='cuda:0', dtype=torch.float32)
arg_9 = rand_strided((480,), (1,), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7, arg_8, arg_9,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_red_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_70.run(*args, 480, 25088, grid=grid(480), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_red_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_70.benchmark_all_configs(*args, 480, 25088, grid=grid(480))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/qc/cqcavzge4fgj7thkzvjlxnonwb5xlzaboopggaaauvq4wfdtklsv.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.add, aten.clone, aten.div, aten.fill, aten.mul, aten.native_batch_norm_backward, aten.sigmoid, aten.sub
# aten._native_batch_norm_legit_functional => convert_element_type_292
# aten.add => add_324, add_325
# aten.clone => clone_34
# aten.div => div_5
# aten.fill => full_like_13
# aten.mul => mul_649, mul_655, mul_656, mul_657
# aten.native_batch_norm_backward => convert_element_type_511, mul_664, sub_133, sub_135, sub_136
# aten.sigmoid => sigmoid_47, sigmoid_77
# aten.sub => sub_132
triton_poi_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_71 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[16777216], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp16', 4: '*fp16', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: '*fp32', 9: '*fp32', 10: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10), equal_to_1=())]})
@triton.jit
def triton_poi_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_71(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, in_ptr6, in_ptr7, in_ptr8, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 12042240
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x3 = xindex
x4 = (xindex // 196)
x1 = (xindex // 196) % 480
tmp0 = tl.load(in_ptr0 + (x3), None).to(tl.float32)
tmp1 = tl.load(in_ptr1 + (x4), None).to(tl.float32)
tmp4 = tl.load(in_ptr2 + (x4), None).to(tl.float32)
tmp8 = tl.load(in_ptr3 + (x3), None).to(tl.float32)
tmp17 = tl.load(in_ptr4 + (x3), None).to(tl.float32)
tmp19 = tl.load(in_ptr5 + (x1), None)
tmp21 = tl.load(in_ptr6 + (x1), None)
tmp24 = tl.load(in_ptr7 + (x1), None)
tmp29 = tl.load(in_ptr8 + (x1), None)
tmp2 = tl.sigmoid(tmp1)
tmp3 = tmp0 * tmp2
tmp5 = 196.0
tmp6 = tmp4 / tmp5
tmp7 = tmp3 + tmp6
tmp9 = tl.sigmoid(tmp8)
tmp10 = 1.0
tmp11 = tmp10 - tmp9
tmp12 = tmp8 * tmp11
tmp13 = tmp12 + tmp10
tmp14 = tmp9 * tmp13
tmp15 = tmp7 * tmp14
tmp16 = tmp15.to(tl.float32)
tmp18 = tmp17.to(tl.float32)
tmp20 = tmp18 - tmp19
tmp22 = 3.985969387755102e-05
tmp23 = tmp21 * tmp22
tmp25 = tmp24 * tmp24
tmp26 = tmp23 * tmp25
tmp27 = tmp20 * tmp26
tmp28 = tmp16 - tmp27
tmp30 = tmp29 * tmp22
tmp31 = tmp28 - tmp30
tl.store(out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp31, None)
def get_args():
arg_0 = rand_strided((128, 480, 14, 14), (94080, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 480, 1, 1), (480, 1, 1, 1), device='cuda:0', dtype=torch.float16)
arg_2 = rand_strided((128, 480, 1, 1), (480, 1, 1, 1), device='cuda:0', dtype=torch.float16)
arg_3 = rand_strided((128, 480, 14, 14), (94080, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_4 = rand_strided((128, 480, 14, 14), (94080, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_5 = rand_strided((1, 480, 1, 1), (480, 1, 1, 1), device='cuda:0', dtype=torch.float32)
arg_6 = rand_strided((480,), (1,), device='cuda:0', dtype=torch.float32)
arg_7 = rand_strided((480,), (1,), device='cuda:0', dtype=torch.float32)
arg_8 = rand_strided((480,), (1,), device='cuda:0', dtype=torch.float32)
arg_9 = rand_strided((128, 480, 14, 14), (94080, 196, 14, 1), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7, arg_8, arg_9,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_71.run(*args, 12042240, grid=grid(12042240), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_71.benchmark_all_configs(*args, 12042240, grid=grid(12042240))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/ya/cya3odioikqnsayl73izdekuxqhs5rqfkr2pft6i3rty3tyluanc.py
# Original ATen: aten.convolution_backward
# aten.convolution_backward => convolution_backward_40
triton_poi_fused_convolution_backward_72 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp32', 2: '*fp32', 3: '*fp16', 4: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4), equal_to_1=())]})
@triton.jit
def triton_poi_fused_convolution_backward_72(in_ptr0, in_ptr1, in_ptr2, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 3010560
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x2 = (xindex // 23520)
x3 = xindex % 23520
x1 = (xindex // 196) % 120
x4 = xindex
tmp0 = tl.load(in_ptr0 + (70560 + x3 + (94080*x2)), None)
tmp1 = tl.load(in_ptr1 + (360 + x1), None)
tmp2 = tl.load(in_ptr2 + (360 + x1), None)
tmp3 = tmp1 * tmp2
tmp4 = tmp0 * tmp3
tmp5 = tmp4.to(tl.float32)
tl.store(out_ptr0 + (x4 + tl.zeros([XBLOCK], tl.int32)), tmp5, None)
def get_args():
arg_0 = rand_strided((128, 480, 14, 14), (94080, 196, 14, 1), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((480,), (1,), device='cuda:0', dtype=torch.float32)
arg_2 = rand_strided((480,), (1,), device='cuda:0', dtype=torch.float32)
arg_3 = rand_strided((128, 120, 14, 14), (23520, 196, 14, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1, arg_2, arg_3,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_convolution_backward_72.run(*args, 3010560, grid=grid(3010560), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_convolution_backward_72.benchmark_all_configs(*args, 3010560, grid=grid(3010560))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/fe/cfetbcwbedjsjshgd4bjuqpljqnapb5dcgp46ajfwwvj2dk7ne2i.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_514
triton_poi_fused__to_copy_73 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[16384], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_73(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 9720
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((120, 1, 9, 9), (81, 81, 9, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((120, 1, 9, 9), (81, 81, 9, 1), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_73.run(*args, 9720, grid=grid(9720), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_73.benchmark_all_configs(*args, 9720, grid=grid(9720))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/iq/ciqofepsagke43sfmdrbzwfpkfw4zvr3els5r3zwk7gigvnpcypw.py
# Original ATen: aten.convolution_backward
# aten.convolution_backward => convolution_backward_41
triton_poi_fused_convolution_backward_74 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp32', 2: '*fp32', 3: '*fp16', 4: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4), equal_to_1=())]})
@triton.jit
def triton_poi_fused_convolution_backward_74(in_ptr0, in_ptr1, in_ptr2, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 3010560
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x2 = (xindex // 23520)
x3 = xindex % 23520
x1 = (xindex // 196) % 120
x4 = xindex
tmp0 = tl.load(in_ptr0 + (47040 + x3 + (94080*x2)), None)
tmp1 = tl.load(in_ptr1 + (240 + x1), None)
tmp2 = tl.load(in_ptr2 + (240 + x1), None)
tmp3 = tmp1 * tmp2
tmp4 = tmp0 * tmp3
tmp5 = tmp4.to(tl.float32)
tl.store(out_ptr0 + (x4 + tl.zeros([XBLOCK], tl.int32)), tmp5, None)
def get_args():
arg_0 = rand_strided((128, 480, 14, 14), (94080, 196, 14, 1), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((480,), (1,), device='cuda:0', dtype=torch.float32)
arg_2 = rand_strided((480,), (1,), device='cuda:0', dtype=torch.float32)
arg_3 = rand_strided((128, 120, 14, 14), (23520, 196, 14, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1, arg_2, arg_3,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_convolution_backward_74.run(*args, 3010560, grid=grid(3010560), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_convolution_backward_74.benchmark_all_configs(*args, 3010560, grid=grid(3010560))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/dy/cdybcagplfulpsthbc6wmswmxelkrixxqqrrt72daichdjv2sefi.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_515
triton_poi_fused__to_copy_75 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[8192], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_75(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 5880
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((120, 1, 7, 7), (49, 49, 7, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((120, 1, 7, 7), (49, 49, 7, 1), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_75.run(*args, 5880, grid=grid(5880), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_75.benchmark_all_configs(*args, 5880, grid=grid(5880))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/uy/cuyidmj6qefewok45jcr4rfmsmcj6hczovevzzaxuc52uf6hh3nc.py
# Original ATen: aten.convolution_backward
# aten.convolution_backward => convolution_backward_42
triton_poi_fused_convolution_backward_76 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp32', 2: '*fp32', 3: '*fp16', 4: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4), equal_to_1=())]})
@triton.jit
def triton_poi_fused_convolution_backward_76(in_ptr0, in_ptr1, in_ptr2, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 3010560
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x2 = (xindex // 23520)
x3 = xindex % 23520
x1 = (xindex // 196) % 120
x4 = xindex
tmp0 = tl.load(in_ptr0 + (23520 + x3 + (94080*x2)), None)
tmp1 = tl.load(in_ptr1 + (120 + x1), None)
tmp2 = tl.load(in_ptr2 + (120 + x1), None)
tmp3 = tmp1 * tmp2
tmp4 = tmp0 * tmp3
tmp5 = tmp4.to(tl.float32)
tl.store(out_ptr0 + (x4 + tl.zeros([XBLOCK], tl.int32)), tmp5, None)
def get_args():
arg_0 = rand_strided((128, 480, 14, 14), (94080, 196, 14, 1), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((480,), (1,), device='cuda:0', dtype=torch.float32)
arg_2 = rand_strided((480,), (1,), device='cuda:0', dtype=torch.float32)
arg_3 = rand_strided((128, 120, 14, 14), (23520, 196, 14, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1, arg_2, arg_3,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_convolution_backward_76.run(*args, 3010560, grid=grid(3010560), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_convolution_backward_76.benchmark_all_configs(*args, 3010560, grid=grid(3010560))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/ux/cuxkdni67km5gfbdihbo5i44iivskoy6hdt7gubbh5qgb5mkwsob.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_516
triton_poi_fused__to_copy_77 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4096], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_77(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 3000
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((120, 1, 5, 5), (25, 25, 5, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((120, 1, 5, 5), (25, 25, 5, 1), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_77.run(*args, 3000, grid=grid(3000), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_77.benchmark_all_configs(*args, 3000, grid=grid(3000))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/4r/c4rld6qtfki5aedwz7g3eqwmnt7w4mmzwymiotpz4adjvg43hqwk.py
# Original ATen: aten.convolution_backward
# aten.convolution_backward => convolution_backward_43
triton_poi_fused_convolution_backward_78 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp32', 2: '*fp32', 3: '*fp16', 4: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4), equal_to_1=())]})
@triton.jit
def triton_poi_fused_convolution_backward_78(in_ptr0, in_ptr1, in_ptr2, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 3010560
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x2 = (xindex // 23520)
x3 = xindex % 23520
x1 = (xindex // 196) % 120
x4 = xindex
tmp0 = tl.load(in_ptr0 + (x3 + (94080*x2)), None)
tmp1 = tl.load(in_ptr1 + (x1), None)
tmp2 = tl.load(in_ptr2 + (x1), None)
tmp3 = tmp1 * tmp2
tmp4 = tmp0 * tmp3
tmp5 = tmp4.to(tl.float32)
tl.store(out_ptr0 + (x4 + tl.zeros([XBLOCK], tl.int32)), tmp5, None)
def get_args():
arg_0 = rand_strided((128, 480, 14, 14), (94080, 196, 14, 1), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((480,), (1,), device='cuda:0', dtype=torch.float32)
arg_2 = rand_strided((480,), (1,), device='cuda:0', dtype=torch.float32)
arg_3 = rand_strided((128, 120, 14, 14), (23520, 196, 14, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1, arg_2, arg_3,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_convolution_backward_78.run(*args, 3010560, grid=grid(3010560), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_convolution_backward_78.benchmark_all_configs(*args, 3010560, grid=grid(3010560))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/ri/crixsdzqwpepdejipqi3ixt4vlyrmrskq5hgkzn7jumpuurao5h5.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_517
triton_poi_fused__to_copy_79 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[2048], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_79(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 1080
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((120, 1, 3, 3), (9, 9, 3, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((120, 1, 3, 3), (9, 9, 3, 1), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_79.run(*args, 1080, grid=grid(1080), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_79.benchmark_all_configs(*args, 1080, grid=grid(1080))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/ty/ctycfmevxfpn542kh5agvjqilg3o5r77udr2h3wyqu23mqhlqdxf.py
# Original ATen: aten.cat
# aten.cat => cat_49
triton_poi_fused_cat_80 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_cat_80(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 3010560
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x2 = xindex
x0 = xindex % 23520
x1 = (xindex // 23520)
tmp0 = tl.load(in_ptr0 + (x2), None).to(tl.float32)
tl.store(out_ptr0 + (x0 + (94080*x1) + tl.zeros([XBLOCK], tl.int32)), tmp0, None)
def get_args():
arg_0 = rand_strided((128, 120, 14, 14), (23520, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 120, 14, 14), (94080, 196, 14, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_cat_80.run(*args, 3010560, grid=grid(3010560), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_cat_80.benchmark_all_configs(*args, 3010560, grid=grid(3010560))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/7i/c7iiiqxahzgxonlzcskqwzfumjrx6cupqeinuz5pkzk66wqnnnyc.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.mul, aten.native_batch_norm_backward
# aten._native_batch_norm_legit_functional => convert_element_type_284
# aten.mul => mul_669
# aten.native_batch_norm_backward => convert_element_type_518, mul_670, mul_678, sub_138, sum_47, sum_48
triton_red_fused__native_batch_norm_legit_functional_mul_native_batch_norm_backward_81 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@reduction(
size_hints=[512, 32768],
reduction_hint=ReductionHint.INNER,
filename=__file__,
meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp32', 4: '*fp32', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: 'i32', 9: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_mul_native_batch_norm_backward_81(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, out_ptr0, out_ptr1, out_ptr2, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
xnumel = 480
rnumel = 25088
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
xmask = xindex < xnumel
rbase = tl.arange(0, RBLOCK)[None, :]
x0 = xindex
_tmp4 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
tmp7 = tl.load(in_ptr3 + (x0), xmask)
_tmp10 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
for roffset in range(0, rnumel, RBLOCK):
rindex = roffset + rbase
rmask = rindex < rnumel
r1 = rindex % 196
r2 = (rindex // 196)
tmp0 = tl.load(in_ptr0 + (r1 + (196*x0) + (94080*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp1 = tl.load(in_ptr1 + (r1 + (196*x0) + (94080*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp5 = tl.load(in_ptr2 + (r1 + (196*x0) + (94080*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp2 = tmp0 * tmp1
tmp3 = tmp2.to(tl.float32)
_tmp4 = tl.where(rmask & xmask, _tmp4 + tmp3, _tmp4)
tmp6 = tmp5.to(tl.float32)
tmp8 = tmp6 - tmp7
tmp9 = tmp3 * tmp8
_tmp10 = tl.where(rmask & xmask, _tmp10 + tmp9, _tmp10)
tmp4 = tl.sum(_tmp4, 1)[:, None]
tl.store(out_ptr0 + x0, tmp4, xmask)
tmp10 = tl.sum(_tmp10, 1)[:, None]
tl.store(out_ptr1 + x0, tmp10, xmask)
tmp11 = tl.load(in_ptr4 + (x0), xmask)
tmp12 = tmp10 * tmp11
tl.store(out_ptr2 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp12, xmask)
def get_args():
arg_0 = rand_strided((128, 480, 14, 14), (94080, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 480, 14, 14), (94080, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_2 = rand_strided((128, 480, 14, 14), (94080, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_3 = rand_strided((1, 480, 1, 1), (480, 1, 1, 1), device='cuda:0', dtype=torch.float32)
arg_4 = rand_strided((480,), (1,), device='cuda:0', dtype=torch.float32)
arg_5 = rand_strided((480,), (1,), device='cuda:0', dtype=torch.float32)
arg_6 = rand_strided((480,), (1,), device='cuda:0', dtype=torch.float32)
arg_7 = rand_strided((480,), (1,), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_red_fused__native_batch_norm_legit_functional_mul_native_batch_norm_backward_81.run(*args, 480, 25088, grid=grid(480), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_red_fused__native_batch_norm_legit_functional_mul_native_batch_norm_backward_81.benchmark_all_configs(*args, 480, 25088, grid=grid(480))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/v4/cv4k7goz3fg6grlbcjwhdwg6ck3vmlxtj3l76l5phqjwc5cao5b3.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.mul, aten.native_batch_norm_backward
# aten._native_batch_norm_legit_functional => convert_element_type_284
# aten.mul => mul_669
# aten.native_batch_norm_backward => convert_element_type_518, convert_element_type_520, mul_676, mul_677, sub_138, sub_140, sub_141
triton_poi_fused__native_batch_norm_legit_functional_mul_native_batch_norm_backward_82 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[16777216], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp32', 4: '*fp32', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8), equal_to_1=())]})
@triton.jit
def triton_poi_fused__native_batch_norm_legit_functional_mul_native_batch_norm_backward_82(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, in_ptr6, xnumel, XBLOCK : tl.constexpr):
xnumel = 12042240
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x3 = xindex
x1 = (xindex // 196) % 480
tmp0 = tl.load(in_out_ptr0 + (x3), None).to(tl.float32)
tmp1 = tl.load(in_ptr0 + (x3), None).to(tl.float32)
tmp4 = tl.load(in_ptr1 + (x3), None).to(tl.float32)
tmp6 = tl.load(in_ptr2 + (x1), None)
tmp8 = tl.load(in_ptr3 + (x1), None)
tmp11 = tl.load(in_ptr4 + (x1), None)
tmp16 = tl.load(in_ptr5 + (x1), None)
tmp19 = tl.load(in_ptr6 + (x1), None)
tmp2 = tmp0 * tmp1
tmp3 = tmp2.to(tl.float32)
tmp5 = tmp4.to(tl.float32)
tmp7 = tmp5 - tmp6
tmp9 = 3.985969387755102e-05
tmp10 = tmp8 * tmp9
tmp12 = tmp11 * tmp11
tmp13 = tmp10 * tmp12
tmp14 = tmp7 * tmp13
tmp15 = tmp3 - tmp14
tmp17 = tmp16 * tmp9
tmp18 = tmp15 - tmp17
tmp20 = tmp11 * tmp19
tmp21 = tmp18 * tmp20
tmp22 = tmp21.to(tl.float32)
tl.store(in_out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp22, None)
def get_args():
arg_0 = rand_strided((128, 480, 14, 14), (94080, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 480, 14, 14), (94080, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_2 = rand_strided((128, 480, 14, 14), (94080, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_3 = rand_strided((1, 480, 1, 1), (480, 1, 1, 1), device='cuda:0', dtype=torch.float32)
arg_4 = rand_strided((480,), (1,), device='cuda:0', dtype=torch.float32)
arg_5 = rand_strided((480,), (1,), device='cuda:0', dtype=torch.float32)
arg_6 = rand_strided((480,), (1,), device='cuda:0', dtype=torch.float32)
arg_7 = rand_strided((480,), (1,), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__native_batch_norm_legit_functional_mul_native_batch_norm_backward_82.run(*args, 12042240, grid=grid(12042240), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__native_batch_norm_legit_functional_mul_native_batch_norm_backward_82.benchmark_all_configs(*args, 12042240, grid=grid(12042240))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/om/comunyk6xj4yo26dm6rnf6zkadn3bzlwf2dwxosduxrcmjkwwa6u.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_521
triton_poi_fused__to_copy_83 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[32768], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_83(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 19200
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((240, 80, 1, 1), (80, 1, 1, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((240, 80, 1, 1), (80, 1, 1, 1), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_83.run(*args, 19200, grid=grid(19200), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_83.benchmark_all_configs(*args, 19200, grid=grid(19200))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/2y/c2yommjpqgw2nsvcapep2mob6welg6j7akfq7uc4xaaiuzwdalt7.py
# Original ATen: aten.cat
# aten.cat => cat_50
#
# Forward copy for one input of a channel-dim concat: reads a contiguous fp16
# tensor of 2007040 elements (per get_args(): (128, 80, 14, 14), 15680 elements
# per image) and writes each image's 15680 elements at batch stride 31360
# (= 2 * 15680, i.e. 160 channels * 196 spatial).  The store offset has no
# channel displacement, so this fills the FIRST 80-channel half of the
# (128, 160, 14, 14) concat buffer.
# NOTE: the kernel body below is a string literal compiled by async_compile;
# it must not be edited here.
triton_poi_fused_cat_84 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[2097152], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_cat_84(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 2007040
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x2 = xindex
x0 = xindex % 15680
x1 = (xindex // 15680)
tmp0 = tl.load(in_ptr0 + (x2), None).to(tl.float32)
tl.store(out_ptr0 + (x0 + (31360*x1) + tl.zeros([XBLOCK], tl.int32)), tmp0, None)
def get_args():
arg_0 = rand_strided((128, 80, 14, 14), (15680, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 80, 14, 14), (31360, 196, 14, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_cat_84.run(*args, 2007040, grid=grid(2007040), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_cat_84.benchmark_all_configs(*args, 2007040, grid=grid(2007040))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/zy/czyhduwjwdj7bjl64zkqqaz3rymv5xt54uovrwuvmbs3ylr4n3xb.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.add, aten.native_batch_norm_backward
# aten._native_batch_norm_legit_functional => convert_element_type_280
# aten.add => add_327
# aten.native_batch_norm_backward => convert_element_type_523, mul_679, sub_142, sum_49, sum_50
#
# Batch-norm backward reduction stage over a (128, 160, 14, 14) fp16 activation.
# xnumel = 640 = 160 channels (x0) * 4 batch groups (x1); each group reduces
# rnumel = 6272 = 32 images * 196 spatial positions (1003520 = 32 * 31360).
# Per (channel, group) it accumulates in fp32:
#   out_ptr0[x3] = sum(dy)                 with dy = in_ptr0 + in_ptr1 (fused add_327)
#   out_ptr1[x3] = sum(dy * (x - m[c]))    with x from in_ptr2, m per-channel from in_ptr3
# The (160, 4) outputs are partial sums -- presumably combined by a follow-up
# reduction kernel (not visible here).
# NOTE: the kernel body below is a string literal compiled by async_compile;
# it must not be edited here.
triton_red_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_85 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@reduction(
size_hints=[1024, 8192],
reduction_hint=ReductionHint.INNER,
filename=__file__,
meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp32', 4: '*fp32', 5: '*fp32', 6: 'i32', 7: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_85(in_ptr0, in_ptr1, in_ptr2, in_ptr3, out_ptr0, out_ptr1, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
xnumel = 640
rnumel = 6272
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
xmask = xindex < xnumel
rbase = tl.arange(0, RBLOCK)[None, :]
x0 = xindex % 160
x1 = (xindex // 160)
_tmp4 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
x3 = xindex
tmp7 = tl.load(in_ptr3 + (x0), xmask)
_tmp10 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
for roffset in range(0, rnumel, RBLOCK):
rindex = roffset + rbase
rmask = rindex < rnumel
r2 = rindex
tmp0 = tl.load(in_ptr0 + ((196*x0) + (31360*(r2 // 196)) + (1003520*x1) + (r2 % 196)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp1 = tl.load(in_ptr1 + ((196*x0) + (31360*(r2 // 196)) + (1003520*x1) + (r2 % 196)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp5 = tl.load(in_ptr2 + ((196*x0) + (31360*(r2 // 196)) + (1003520*x1) + (r2 % 196)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp2 = tmp0 + tmp1
tmp3 = tmp2.to(tl.float32)
_tmp4 = tl.where(rmask & xmask, _tmp4 + tmp3, _tmp4)
tmp6 = tmp5.to(tl.float32)
tmp8 = tmp6 - tmp7
tmp9 = tmp3 * tmp8
_tmp10 = tl.where(rmask & xmask, _tmp10 + tmp9, _tmp10)
tmp4 = tl.sum(_tmp4, 1)[:, None]
tl.store(out_ptr0 + x3, tmp4, xmask)
tmp10 = tl.sum(_tmp10, 1)[:, None]
tl.store(out_ptr1 + x3, tmp10, xmask)
def get_args():
arg_0 = rand_strided((128, 160, 14, 14), (31360, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 160, 14, 14), (31360, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_2 = rand_strided((128, 160, 14, 14), (31360, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_3 = rand_strided((1, 160, 1, 1), (160, 1, 1, 1), device='cuda:0', dtype=torch.float32)
arg_4 = rand_strided((160, 4), (1, 160), device='cuda:0', dtype=torch.float32)
arg_5 = rand_strided((160, 4), (1, 160), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_red_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_85.run(*args, 640, 6272, grid=grid(640), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_red_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_85.benchmark_all_configs(*args, 640, 6272, grid=grid(640))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/5n/c5ngqp5cp6ieu5sc7n3ylgpltjkzcel7tc4dwx3ctbhj3ayamet2.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.add, aten.native_batch_norm_backward
# aten._native_batch_norm_legit_functional => convert_element_type_280
# aten.add => add_327
# aten.native_batch_norm_backward => convert_element_type_523, convert_element_type_525, mul_685, mul_686, sub_142, sub_144, sub_145
#
# Batch-norm backward pointwise stage for a (128, 160, 14, 14) tensor
# (xnumel = 4014080 = 128 * 160 * 196).  dy = in_ptr0 + in_ptr1 (fused add);
# the per-channel scalars are in_ptr3..in_ptr7 (indexed by channel x1).  The
# arithmetic matches the standard native_batch_norm_backward grad_input
# formula:  out = (dy - (x - mean) * sum_dy_xmu * invstd^2 / N - sum_dy / N)
#                 * invstd * weight,  cast back to fp16.
# The constant 3.985969387755102e-05 == 1/25088 == 1/(128*14*14), i.e. 1/N
# for the per-channel element count.  Roles of in_ptr4/in_ptr6 as the reduced
# sums and in_ptr5/in_ptr7 as invstd/weight are inferred from this formula --
# confirm against the producing kernels if precision matters.
# NOTE: the kernel body below is a string literal compiled by async_compile;
# it must not be edited here.
triton_poi_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_86 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp32', 4: '*fp32', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: '*fp16', 9: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9), equal_to_1=())]})
@triton.jit
def triton_poi_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_86(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, in_ptr6, in_ptr7, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 4014080
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x3 = xindex
x1 = (xindex // 196) % 160
tmp0 = tl.load(in_ptr0 + (x3), None).to(tl.float32)
tmp1 = tl.load(in_ptr1 + (x3), None).to(tl.float32)
tmp4 = tl.load(in_ptr2 + (x3), None).to(tl.float32)
tmp6 = tl.load(in_ptr3 + (x1), None)
tmp8 = tl.load(in_ptr4 + (x1), None)
tmp11 = tl.load(in_ptr5 + (x1), None)
tmp16 = tl.load(in_ptr6 + (x1), None)
tmp19 = tl.load(in_ptr7 + (x1), None)
tmp2 = tmp0 + tmp1
tmp3 = tmp2.to(tl.float32)
tmp5 = tmp4.to(tl.float32)
tmp7 = tmp5 - tmp6
tmp9 = 3.985969387755102e-05
tmp10 = tmp8 * tmp9
tmp12 = tmp11 * tmp11
tmp13 = tmp10 * tmp12
tmp14 = tmp7 * tmp13
tmp15 = tmp3 - tmp14
tmp17 = tmp16 * tmp9
tmp18 = tmp15 - tmp17
tmp20 = tmp11 * tmp19
tmp21 = tmp18 * tmp20
tmp22 = tmp21.to(tl.float32)
tl.store(out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp22, None)
def get_args():
arg_0 = rand_strided((128, 160, 14, 14), (31360, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 160, 14, 14), (31360, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_2 = rand_strided((128, 160, 14, 14), (31360, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_3 = rand_strided((1, 160, 1, 1), (160, 1, 1, 1), device='cuda:0', dtype=torch.float32)
arg_4 = rand_strided((160,), (1,), device='cuda:0', dtype=torch.float32)
arg_5 = rand_strided((160,), (1,), device='cuda:0', dtype=torch.float32)
arg_6 = rand_strided((160,), (1,), device='cuda:0', dtype=torch.float32)
arg_7 = rand_strided((160,), (1,), device='cuda:0', dtype=torch.float32)
arg_8 = rand_strided((128, 160, 14, 14), (31360, 196, 14, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7, arg_8,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_86.run(*args, 4014080, grid=grid(4014080), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_86.benchmark_all_configs(*args, 4014080, grid=grid(4014080))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/mi/cmi2g34rcnn4o7hkiq27yemv23mvhmf7ilu2wl6n5igdib4lbn3v.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.add, aten.native_batch_norm_backward
# aten._native_batch_norm_legit_functional => convert_element_type_256
# aten.add => add_327, add_332
# aten.native_batch_norm_backward => convert_element_type_547, mul_719, sub_158, sum_58, sum_59
#
# Same reduction pattern as kernel 85, but the incoming gradient is the fused
# sum of THREE fp16 tensors (dy = in_ptr0 + in_ptr1 + in_ptr2; two residual
# adds, add_327 + add_332).  Per (channel x0, batch-group x1) over
# 6272 = 32 * 196 elements it accumulates in fp32:
#   out_ptr0 = sum(dy),  out_ptr1 = sum(dy * (x - m[c]))
# with x from in_ptr3 and per-channel m from in_ptr4; results land in (160, 4)
# partial-sum buffers.
# NOTE: the kernel body below is a string literal compiled by async_compile;
# it must not be edited here.
triton_red_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_87 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@reduction(
size_hints=[1024, 8192],
reduction_hint=ReductionHint.INNER,
filename=__file__,
meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp16', 4: '*fp32', 5: '*fp32', 6: '*fp32', 7: 'i32', 8: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_87(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, out_ptr0, out_ptr1, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
xnumel = 640
rnumel = 6272
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
xmask = xindex < xnumel
rbase = tl.arange(0, RBLOCK)[None, :]
x0 = xindex % 160
x1 = (xindex // 160)
_tmp6 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
x3 = xindex
tmp9 = tl.load(in_ptr4 + (x0), xmask)
_tmp12 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
for roffset in range(0, rnumel, RBLOCK):
rindex = roffset + rbase
rmask = rindex < rnumel
r2 = rindex
tmp0 = tl.load(in_ptr0 + ((196*x0) + (31360*(r2 // 196)) + (1003520*x1) + (r2 % 196)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp1 = tl.load(in_ptr1 + ((196*x0) + (31360*(r2 // 196)) + (1003520*x1) + (r2 % 196)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp3 = tl.load(in_ptr2 + ((196*x0) + (31360*(r2 // 196)) + (1003520*x1) + (r2 % 196)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp7 = tl.load(in_ptr3 + ((196*x0) + (31360*(r2 // 196)) + (1003520*x1) + (r2 % 196)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp2 = tmp0 + tmp1
tmp4 = tmp2 + tmp3
tmp5 = tmp4.to(tl.float32)
_tmp6 = tl.where(rmask & xmask, _tmp6 + tmp5, _tmp6)
tmp8 = tmp7.to(tl.float32)
tmp10 = tmp8 - tmp9
tmp11 = tmp5 * tmp10
_tmp12 = tl.where(rmask & xmask, _tmp12 + tmp11, _tmp12)
tmp6 = tl.sum(_tmp6, 1)[:, None]
tl.store(out_ptr0 + x3, tmp6, xmask)
tmp12 = tl.sum(_tmp12, 1)[:, None]
tl.store(out_ptr1 + x3, tmp12, xmask)
def get_args():
arg_0 = rand_strided((128, 160, 14, 14), (31360, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 160, 14, 14), (31360, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_2 = rand_strided((128, 160, 14, 14), (31360, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_3 = rand_strided((128, 160, 14, 14), (31360, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_4 = rand_strided((1, 160, 1, 1), (160, 1, 1, 1), device='cuda:0', dtype=torch.float32)
arg_5 = rand_strided((160, 4), (1, 160), device='cuda:0', dtype=torch.float32)
arg_6 = rand_strided((160, 4), (1, 160), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_red_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_87.run(*args, 640, 6272, grid=grid(640), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_red_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_87.benchmark_all_configs(*args, 640, 6272, grid=grid(640))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/5m/c5msj6p2vdag4fs3dze5gd2kastkwiprxfqobb4ipbu3zwk6y4tb.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.add, aten.native_batch_norm_backward
# aten._native_batch_norm_legit_functional => convert_element_type_256
# aten.add => add_327, add_332
# aten.native_batch_norm_backward => convert_element_type_547, mul_725, mul_726, sub_158, sub_160, sub_161
#
# Batch-norm backward pointwise stage, three-way fused gradient add
# (dy = in_ptr0 + in_ptr1 + in_ptr2; add_327 + add_332), over
# (128, 160, 14, 14).  Same grad_input formula as kernel 86 with
# 3.985969387755102e-05 == 1/25088 == 1/(128*14*14), but the result is stored
# WITHOUT a final fp16 cast -- the output buffer is fp32 (see arg_9 and
# signature slot 9).  x comes from in_ptr3; per-channel scalars from
# in_ptr4..in_ptr8 (mean, reduced sums, invstd, weight -- roles inferred from
# the formula; confirm against the producing reduction kernel).
# NOTE: the kernel body below is a string literal compiled by async_compile;
# it must not be edited here.
triton_poi_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_88 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp16', 4: '*fp32', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: '*fp32', 9: '*fp32', 10: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10), equal_to_1=())]})
@triton.jit
def triton_poi_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_88(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, in_ptr6, in_ptr7, in_ptr8, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 4014080
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x3 = xindex
x1 = (xindex // 196) % 160
tmp0 = tl.load(in_ptr0 + (x3), None).to(tl.float32)
tmp1 = tl.load(in_ptr1 + (x3), None).to(tl.float32)
tmp3 = tl.load(in_ptr2 + (x3), None).to(tl.float32)
tmp6 = tl.load(in_ptr3 + (x3), None).to(tl.float32)
tmp8 = tl.load(in_ptr4 + (x1), None)
tmp10 = tl.load(in_ptr5 + (x1), None)
tmp13 = tl.load(in_ptr6 + (x1), None)
tmp18 = tl.load(in_ptr7 + (x1), None)
tmp21 = tl.load(in_ptr8 + (x1), None)
tmp2 = tmp0 + tmp1
tmp4 = tmp2 + tmp3
tmp5 = tmp4.to(tl.float32)
tmp7 = tmp6.to(tl.float32)
tmp9 = tmp7 - tmp8
tmp11 = 3.985969387755102e-05
tmp12 = tmp10 * tmp11
tmp14 = tmp13 * tmp13
tmp15 = tmp12 * tmp14
tmp16 = tmp9 * tmp15
tmp17 = tmp5 - tmp16
tmp19 = tmp18 * tmp11
tmp20 = tmp17 - tmp19
tmp22 = tmp13 * tmp21
tmp23 = tmp20 * tmp22
tl.store(out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp23, None)
def get_args():
arg_0 = rand_strided((128, 160, 14, 14), (31360, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 160, 14, 14), (31360, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_2 = rand_strided((128, 160, 14, 14), (31360, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_3 = rand_strided((128, 160, 14, 14), (31360, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_4 = rand_strided((1, 160, 1, 1), (160, 1, 1, 1), device='cuda:0', dtype=torch.float32)
arg_5 = rand_strided((160,), (1,), device='cuda:0', dtype=torch.float32)
arg_6 = rand_strided((160,), (1,), device='cuda:0', dtype=torch.float32)
arg_7 = rand_strided((160,), (1,), device='cuda:0', dtype=torch.float32)
arg_8 = rand_strided((160,), (1,), device='cuda:0', dtype=torch.float32)
arg_9 = rand_strided((128, 160, 14, 14), (31360, 196, 14, 1), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7, arg_8, arg_9,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_88.run(*args, 4014080, grid=grid(4014080), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_88.benchmark_all_configs(*args, 4014080, grid=grid(4014080))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/33/c33mk7jbf3hbussw7bzc7k6jlfrdr6q6k2jcx5wwhkb43usq3f3k.py
# Original ATen: aten.convolution_backward
# aten.convolution_backward => convolution_backward_56
#
# Gradient slice for a channel concat's backward: reads the SECOND 80-channel
# half of an fp32 (128, 160, 14, 14) gradient buffer (load offset
# 15680 == 80 * 196 within each 31360-element batch row) and downcasts to a
# contiguous fp16 (128, 80, 14, 14) tensor feeding convolution_backward_56.
# Counterpart of kernel 90, which copies the first half.
# NOTE: the kernel body below is a string literal compiled by async_compile;
# it must not be edited here.
triton_poi_fused_convolution_backward_89 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[2097152], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_convolution_backward_89(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 2007040
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex % 15680
x1 = (xindex // 15680)
x2 = xindex
tmp0 = tl.load(in_ptr0 + (15680 + x0 + (31360*x1)), None)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp1, None)
def get_args():
arg_0 = rand_strided((128, 160, 14, 14), (31360, 196, 14, 1), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((128, 80, 14, 14), (15680, 196, 14, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_convolution_backward_89.run(*args, 2007040, grid=grid(2007040), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_convolution_backward_89.benchmark_all_configs(*args, 2007040, grid=grid(2007040))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/qs/cqsgmcz5zjvb6ko7kxhkl5io7i4gmvcpy5zxwjwu7qjx5oya6hd3.py
# Original ATen: aten.convolution_backward
# aten.convolution_backward => convolution_backward_57
#
# Gradient slice for a channel concat's backward: reads the FIRST 80-channel
# half of an fp32 (128, 160, 14, 14) gradient buffer (load offset 0 within
# each 31360-element batch row; 15680 == 80 * 196 elements per image) and
# downcasts to a contiguous fp16 (128, 80, 14, 14) tensor feeding
# convolution_backward_57.  Counterpart of kernel 89, which copies the
# second half.
# NOTE: the kernel body below is a string literal compiled by async_compile;
# it must not be edited here.
triton_poi_fused_convolution_backward_90 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[2097152], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_convolution_backward_90(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 2007040
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex % 15680
x1 = (xindex // 15680)
x2 = xindex
tmp0 = tl.load(in_ptr0 + (x0 + (31360*x1)), None)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp1, None)
def get_args():
arg_0 = rand_strided((128, 160, 14, 14), (31360, 196, 14, 1), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((128, 80, 14, 14), (15680, 196, 14, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_convolution_backward_90.run(*args, 2007040, grid=grid(2007040), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_convolution_backward_90.benchmark_all_configs(*args, 2007040, grid=grid(2007040))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/is/cis35c7i52ytm3zlhduqhkylrq33chwilt3nh2nr2nfagpbn72d4.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.add, aten.native_batch_norm_backward
# aten._native_batch_norm_legit_functional => convert_element_type_232
# aten.add => add_327, add_332, add_337
# aten.native_batch_norm_backward => convert_element_type_571, mul_759, sub_174, sum_67, sum_68
#
# Same reduction pattern as kernels 85/87, but the incoming gradient is the
# fused sum of FOUR fp16 tensors (dy = in_ptr0 + in_ptr1 + in_ptr2 + in_ptr3;
# three residual adds, add_327/add_332/add_337).  Per (channel x0, batch-group
# x1) over 6272 = 32 * 196 elements it accumulates in fp32:
#   out_ptr0 = sum(dy),  out_ptr1 = sum(dy * (x - m[c]))
# with x from in_ptr4 and per-channel m from in_ptr5; results land in (160, 4)
# partial-sum buffers.
# NOTE: the kernel body below is a string literal compiled by async_compile;
# it must not be edited here.
triton_red_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_91 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@reduction(
size_hints=[1024, 8192],
reduction_hint=ReductionHint.INNER,
filename=__file__,
meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp16', 4: '*fp16', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: 'i32', 9: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_91(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, out_ptr0, out_ptr1, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
xnumel = 640
rnumel = 6272
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
xmask = xindex < xnumel
rbase = tl.arange(0, RBLOCK)[None, :]
x0 = xindex % 160
x1 = (xindex // 160)
_tmp8 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
x3 = xindex
tmp11 = tl.load(in_ptr5 + (x0), xmask)
_tmp14 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
for roffset in range(0, rnumel, RBLOCK):
rindex = roffset + rbase
rmask = rindex < rnumel
r2 = rindex
tmp0 = tl.load(in_ptr0 + ((196*x0) + (31360*(r2 // 196)) + (1003520*x1) + (r2 % 196)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp1 = tl.load(in_ptr1 + ((196*x0) + (31360*(r2 // 196)) + (1003520*x1) + (r2 % 196)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp3 = tl.load(in_ptr2 + ((196*x0) + (31360*(r2 // 196)) + (1003520*x1) + (r2 % 196)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp5 = tl.load(in_ptr3 + ((196*x0) + (31360*(r2 // 196)) + (1003520*x1) + (r2 % 196)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp9 = tl.load(in_ptr4 + ((196*x0) + (31360*(r2 // 196)) + (1003520*x1) + (r2 % 196)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp2 = tmp0 + tmp1
tmp4 = tmp2 + tmp3
tmp6 = tmp4 + tmp5
tmp7 = tmp6.to(tl.float32)
_tmp8 = tl.where(rmask & xmask, _tmp8 + tmp7, _tmp8)
tmp10 = tmp9.to(tl.float32)
tmp12 = tmp10 - tmp11
tmp13 = tmp7 * tmp12
_tmp14 = tl.where(rmask & xmask, _tmp14 + tmp13, _tmp14)
tmp8 = tl.sum(_tmp8, 1)[:, None]
tl.store(out_ptr0 + x3, tmp8, xmask)
tmp14 = tl.sum(_tmp14, 1)[:, None]
tl.store(out_ptr1 + x3, tmp14, xmask)
def get_args():
arg_0 = rand_strided((128, 160, 14, 14), (31360, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 160, 14, 14), (31360, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_2 = rand_strided((128, 160, 14, 14), (31360, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_3 = rand_strided((128, 160, 14, 14), (31360, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_4 = rand_strided((128, 160, 14, 14), (31360, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_5 = rand_strided((1, 160, 1, 1), (160, 1, 1, 1), device='cuda:0', dtype=torch.float32)
arg_6 = rand_strided((160, 4), (1, 160), device='cuda:0', dtype=torch.float32)
arg_7 = rand_strided((160, 4), (1, 160), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_red_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_91.run(*args, 640, 6272, grid=grid(640), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_red_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_91.benchmark_all_configs(*args, 640, 6272, grid=grid(640))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/lj/cljkpj67jhpiomhay7l2fuqfmjz2fox7ndvjbixox242vrhfypjf.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.add, aten.convolution_backward, aten.native_batch_norm_backward
# aten._native_batch_norm_legit_functional => convert_element_type_232
# aten.add => add_327, add_332, add_337
# aten.convolution_backward => convolution_backward_66
# aten.native_batch_norm_backward => convert_element_type_571, convert_element_type_573, mul_765, mul_766, sub_174, sub_176, sub_177
#
# Batch-norm backward pointwise stage, four-way fused gradient add
# (dy = in_ptr0 + in_ptr1 + in_ptr2 + in_ptr3; three residual adds), over
# (128, 160, 14, 14); the fp16 result feeds convolution_backward_66.  Same
# grad_input formula as kernels 86/88 with 3.985969387755102e-05 == 1/25088
# == 1/(128*14*14).  x comes from in_ptr4; per-channel scalars from
# in_ptr5..in_ptr9 (mean, reduced sums, invstd, weight -- roles inferred from
# the formula; confirm against the producing reduction kernel 91).
# NOTE: the kernel body below is a string literal compiled by async_compile;
# it must not be edited here.
triton_poi_fused__native_batch_norm_legit_functional_add_convolution_backward_native_batch_norm_backward_92 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp16', 4: '*fp16', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: '*fp32', 9: '*fp32', 10: '*fp16', 11: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11), equal_to_1=())]})
@triton.jit
def triton_poi_fused__native_batch_norm_legit_functional_add_convolution_backward_native_batch_norm_backward_92(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, in_ptr6, in_ptr7, in_ptr8, in_ptr9, out_ptr1, xnumel, XBLOCK : tl.constexpr):
xnumel = 4014080
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x3 = xindex
x1 = (xindex // 196) % 160
tmp0 = tl.load(in_ptr0 + (x3), None).to(tl.float32)
tmp1 = tl.load(in_ptr1 + (x3), None).to(tl.float32)
tmp3 = tl.load(in_ptr2 + (x3), None).to(tl.float32)
tmp5 = tl.load(in_ptr3 + (x3), None).to(tl.float32)
tmp8 = tl.load(in_ptr4 + (x3), None).to(tl.float32)
tmp10 = tl.load(in_ptr5 + (x1), None)
tmp12 = tl.load(in_ptr6 + (x1), None)
tmp15 = tl.load(in_ptr7 + (x1), None)
tmp20 = tl.load(in_ptr8 + (x1), None)
tmp23 = tl.load(in_ptr9 + (x1), None)
tmp2 = tmp0 + tmp1
tmp4 = tmp2 + tmp3
tmp6 = tmp4 + tmp5
tmp7 = tmp6.to(tl.float32)
tmp9 = tmp8.to(tl.float32)
tmp11 = tmp9 - tmp10
tmp13 = 3.985969387755102e-05
tmp14 = tmp12 * tmp13
tmp16 = tmp15 * tmp15
tmp17 = tmp14 * tmp16
tmp18 = tmp11 * tmp17
tmp19 = tmp7 - tmp18
tmp21 = tmp20 * tmp13
tmp22 = tmp19 - tmp21
tmp24 = tmp15 * tmp23
tmp25 = tmp22 * tmp24
tmp26 = tmp25.to(tl.float32)
tl.store(out_ptr1 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp26, None)
def get_args():
arg_0 = rand_strided((128, 160, 14, 14), (31360, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 160, 14, 14), (31360, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_2 = rand_strided((128, 160, 14, 14), (31360, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_3 = rand_strided((128, 160, 14, 14), (31360, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_4 = rand_strided((128, 160, 14, 14), (31360, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_5 = rand_strided((1, 160, 1, 1), (160, 1, 1, 1), device='cuda:0', dtype=torch.float32)
arg_6 = rand_strided((160,), (1,), device='cuda:0', dtype=torch.float32)
arg_7 = rand_strided((160,), (1,), device='cuda:0', dtype=torch.float32)
arg_8 = rand_strided((160,), (1,), device='cuda:0', dtype=torch.float32)
arg_9 = rand_strided((160,), (1,), device='cuda:0', dtype=torch.float32)
arg_10 = rand_strided((128, 160, 14, 14), (31360, 196, 14, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7, arg_8, arg_9, arg_10,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__native_batch_norm_legit_functional_add_convolution_backward_native_batch_norm_backward_92.run(*args, 4014080, grid=grid(4014080), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__native_batch_norm_legit_functional_add_convolution_backward_native_batch_norm_backward_92.benchmark_all_configs(*args, 4014080, grid=grid(4014080))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/af/cafjjp3a3kwtimi2i4kv3zpwxjdfuxdqkq7szn2vypz2ywexppw7.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_574
# Pointwise fp16 -> fp32 upcast over 99,840 elements of a (160, 624, 1, 1)
# tensor (shapes per get_args inside the string).  The triple-quoted string is
# a self-contained Triton module -- kernel plus get_args/call/benchmark
# helpers -- compiled lazily by AsyncCompile.
# NOTE(review): machine-generated; keep the string byte-identical -- the
# Inductor code cache presumably keys on the source text, verify before edits.
triton_poi_fused__to_copy_93 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[131072], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_93(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 99840
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((160, 624, 1, 1), (624, 1, 1, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((160, 624, 1, 1), (624, 1, 1, 1), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_93.run(*args, 99840, grid=grid(99840), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_93.benchmark_all_configs(*args, 99840, grid=grid(99840))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/ic/cicgaxe2oqufzttvfjscmbopri2uxa3mejzwrav4jbahpmbb4rby.py
# Original ATen: aten.mul, aten.sigmoid, aten.sigmoid_backward, aten.silu, aten.sum
# aten.mul => mul_768
# aten.sigmoid => sigmoid_35
# aten.sigmoid_backward => convert_element_type_575, convert_element_type_576, convert_element_type_577, mul_770, mul_771, sub_178
# aten.silu => convert_element_type_223, convert_element_type_224, mul_278, sigmoid_33
# aten.sum => sum_69
# Persistent reduction: for each of the 79,872 (= 128 batch x 624 channel)
# rows it reduces the 196 (= 14x14) spatial positions of grad * silu(x),
# then multiplies by the sigmoid-backward term sig(g)*(1-sig(g)) of the
# per-row gate value, storing the result in place (in_out_ptr0).
# NOTE(review): machine-generated; keep the string byte-identical -- the
# Inductor code cache presumably keys on the source text, verify before edits.
triton_per_fused_mul_sigmoid_sigmoid_backward_silu_sum_94 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import persistent_reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@persistent_reduction(
size_hints=[131072, 256],
reduction_hint=ReductionHint.INNER,
filename=__file__,
meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp16', 4: 'i32', 5: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4), equal_to_1=())]}
)
@triton.jit
def triton_per_fused_mul_sigmoid_sigmoid_backward_silu_sum_94(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, xnumel, rnumel, XBLOCK : tl.constexpr):
xnumel = 79872
rnumel = 196
RBLOCK: tl.constexpr = 256
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
xmask = xindex < xnumel
rindex = tl.arange(0, RBLOCK)[None, :]
rmask = rindex < rnumel
r1 = rindex
x0 = xindex
tmp0 = tl.load(in_ptr0 + (r1 + (196*x0)), rmask, other=0).to(tl.float32)
tmp1 = tl.load(in_ptr1 + (r1 + (196*x0)), rmask, other=0).to(tl.float32)
tmp11 = tl.load(in_ptr2 + (x0), None).to(tl.float32)
tmp2 = tmp1.to(tl.float32)
tmp3 = tl.sigmoid(tmp2)
tmp4 = tmp2 * tmp3
tmp5 = tmp4.to(tl.float32)
tmp6 = tmp0 * tmp5
tmp8 = tl.where(rmask, tmp6, 0)
tmp9 = tl.sum(tmp8, 1)[:, None]
tmp10 = tmp9.to(tl.float32)
tmp12 = tl.sigmoid(tmp11)
tmp13 = tmp12.to(tl.float32)
tmp14 = 1.0
tmp15 = tmp14 - tmp13
tmp16 = tmp13 * tmp15
tmp17 = tmp10 * tmp16
tmp18 = tmp17.to(tl.float32)
tl.store(in_out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp18, None)
def get_args():
arg_0 = rand_strided((128, 624, 1, 1), (624, 1, 1, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_2 = rand_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_3 = rand_strided((128, 624, 1, 1), (624, 1, 1, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1, arg_2, arg_3,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_per_fused_mul_sigmoid_sigmoid_backward_silu_sum_94.run(*args, 79872, 196, grid=grid(79872), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_per_fused_mul_sigmoid_sigmoid_backward_silu_sum_94.benchmark_all_configs(*args, 79872, 196, grid=grid(79872))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/sw/cswrgzqc55nc76ttfypz7ruvqwdkjksusjytknavxbcop6g2otp2.py
# Original ATen: aten._to_copy, aten.convolution_backward
# aten._to_copy => convert_element_type_579
# aten.convolution_backward => sum_70
# Persistent reduction: sums a (128, 624, 1, 1) fp16 tensor over the batch
# dimension (128) to produce a fp32 (624,) result -- the per-channel bias
# gradient for a convolution backward, with the fp16 -> fp32 cast fused in.
# NOTE(review): machine-generated; keep the string byte-identical -- the
# Inductor code cache presumably keys on the source text, verify before edits.
triton_per_fused__to_copy_convolution_backward_95 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import persistent_reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@persistent_reduction(
size_hints=[1024, 128],
reduction_hint=ReductionHint.INNER,
filename=__file__,
meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32', 3: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3), equal_to_1=())]}
)
@triton.jit
def triton_per_fused__to_copy_convolution_backward_95(in_ptr0, out_ptr1, xnumel, rnumel, XBLOCK : tl.constexpr):
xnumel = 624
rnumel = 128
RBLOCK: tl.constexpr = 128
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
xmask = xindex < xnumel
rindex = tl.arange(0, RBLOCK)[None, :]
rmask = rindex < rnumel
r1 = rindex
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0 + (624*r1)), rmask & xmask, other=0).to(tl.float32)
tmp2 = tl.where(rmask & xmask, tmp0, 0)
tmp3 = tl.sum(tmp2, 1)[:, None]
tmp4 = tmp3.to(tl.float32)
tl.store(out_ptr1 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp4, xmask)
def get_args():
arg_0 = rand_strided((128, 624, 1, 1), (624, 1, 1, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((624,), (1,), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_per_fused__to_copy_convolution_backward_95.run(*args, 624, 128, grid=grid(624), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_per_fused__to_copy_convolution_backward_95.benchmark_all_configs(*args, 624, 128, grid=grid(624))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/rk/crkgwzo3s6rzbbnalsvhwyanngv4n2zgzifhbhpcnc5ysqffl6ac.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_578
# Pointwise fp16 -> fp32 upcast over 32,448 elements of a (624, 52, 1, 1)
# tensor (shapes per get_args inside the string).
# NOTE(review): machine-generated; keep the string byte-identical -- the
# Inductor code cache presumably keys on the source text, verify before edits.
triton_poi_fused__to_copy_96 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[32768], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_96(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 32448
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((624, 52, 1, 1), (52, 1, 1, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((624, 52, 1, 1), (52, 1, 1, 1), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_96.run(*args, 32448, grid=grid(32448), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_96.benchmark_all_configs(*args, 32448, grid=grid(32448))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/wf/cwfpzmbbmgs237i4rn23twa6vqls763iyjrp76njdwwsj3et2uaf.py
# Original ATen: aten.add, aten.clone, aten.fill, aten.mul, aten.sigmoid, aten.sub
# aten.add => add_338
# aten.clone => clone_26
# aten.fill => full_like_21
# aten.mul => mul_772, mul_773, mul_774
# aten.sigmoid => sigmoid_85
# aten.sub => sub_179
# Pointwise SiLU backward over 6,656 (= 128 x 52) elements, computed in place:
# grad_out *= sig(x) * (1 + x * (1 - sig(x))), where x is loaded from in_ptr0
# and grad_out lives in in_out_ptr0 (note 'mutated_arg_names' in meta).
# NOTE(review): machine-generated; keep the string byte-identical -- the
# Inductor code cache presumably keys on the source text, verify before edits.
triton_poi_fused_add_clone_fill_mul_sigmoid_sub_97 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[8192], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_add_clone_fill_mul_sigmoid_sub_97(in_out_ptr0, in_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 6656
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_out_ptr0 + (x0), xmask).to(tl.float32)
tmp1 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
tmp2 = tl.sigmoid(tmp1)
tmp3 = 1.0
tmp4 = tmp3 - tmp2
tmp5 = tmp1 * tmp4
tmp6 = tmp5 + tmp3
tmp7 = tmp2 * tmp6
tmp8 = tmp0 * tmp7
tl.store(in_out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp8, xmask)
def get_args():
arg_0 = rand_strided((128, 52, 1, 1), (52, 1, 1, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 52, 1, 1), (52, 1, 1, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_add_clone_fill_mul_sigmoid_sub_97.run(*args, 6656, grid=grid(6656), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_add_clone_fill_mul_sigmoid_sub_97.benchmark_all_configs(*args, 6656, grid=grid(6656))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/xt/cxt55nhjkmnbxqvfatybarolu6uxihwoo3cct34gda356pn7scly.py
# Original ATen: aten._to_copy, aten.convolution_backward
# aten._to_copy => convert_element_type_581
# aten.convolution_backward => sum_71
# Persistent reduction: sums a (128, 52, 1, 1) fp16 tensor over the batch
# dimension (128) to a fp32 (52,) result -- per-channel bias gradient for a
# convolution backward with the fp16 -> fp32 cast fused in.
# NOTE(review): machine-generated; keep the string byte-identical -- the
# Inductor code cache presumably keys on the source text, verify before edits.
triton_per_fused__to_copy_convolution_backward_98 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import persistent_reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@persistent_reduction(
size_hints=[64, 128],
reduction_hint=ReductionHint.INNER,
filename=__file__,
meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32', 3: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 3), equal_to_1=())]}
)
@triton.jit
def triton_per_fused__to_copy_convolution_backward_98(in_ptr0, out_ptr1, xnumel, rnumel, XBLOCK : tl.constexpr):
xnumel = 52
rnumel = 128
RBLOCK: tl.constexpr = 128
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
xmask = xindex < xnumel
rindex = tl.arange(0, RBLOCK)[None, :]
rmask = rindex < rnumel
r1 = rindex
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0 + (52*r1)), rmask & xmask, other=0).to(tl.float32)
tmp2 = tl.where(rmask & xmask, tmp0, 0)
tmp3 = tl.sum(tmp2, 1)[:, None]
tmp4 = tmp3.to(tl.float32)
tl.store(out_ptr1 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp4, xmask)
def get_args():
arg_0 = rand_strided((128, 52, 1, 1), (52, 1, 1, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((52,), (1,), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_per_fused__to_copy_convolution_backward_98.run(*args, 52, 128, grid=grid(52), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_per_fused__to_copy_convolution_backward_98.benchmark_all_configs(*args, 52, 128, grid=grid(52))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/tw/ctwfvbwslvyki3iv4byjhnpgwt4ujawez7njntlfraulnuse3ddu.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_580
# Pointwise fp16 -> fp32 upcast over 32,448 elements of a (52, 624, 1, 1)
# tensor (shapes per get_args inside the string).
# NOTE(review): machine-generated; keep the string byte-identical -- the
# Inductor code cache presumably keys on the source text, verify before edits.
triton_poi_fused__to_copy_99 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[32768], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_99(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 32448
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((52, 624, 1, 1), (624, 1, 1, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((52, 624, 1, 1), (624, 1, 1, 1), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_99.run(*args, 32448, grid=grid(32448), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_99.benchmark_all_configs(*args, 32448, grid=grid(32448))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/6w/c6wzolzrfmxblwjxnpxkgqezmro2wot3nkz62hajpunjmpkn6enk.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.add, aten.clone, aten.div, aten.fill, aten.mul, aten.native_batch_norm_backward, aten.sigmoid, aten.sub
# aten._native_batch_norm_legit_functional => convert_element_type_221
# aten.add => add_339, add_340
# aten.clone => clone_25
# aten.div => div_8
# aten.fill => full_like_22
# aten.mul => mul_769, mul_775, mul_776, mul_777
# aten.native_batch_norm_backward => convert_element_type_582, mul_778, mul_786, sub_181, sum_72, sum_73
# aten.sigmoid => sigmoid_35, sigmoid_86
# aten.sub => sub_180
# Looped reduction over rnumel = 25,088 (= 128 batch x 196 spatial) per
# channel (xnumel = 624).  For each channel it reconstructs the upstream
# gradient (gate-scaled grad + broadcast avg-pool grad / 196, through SiLU
# backward), then emits the two batch-norm-backward reductions: sum of grad
# (out_ptr0) and sum of grad * (x - mean) (out_ptr1), plus the latter scaled
# by in_ptr6 (out_ptr2); all three are fp32 vectors of length 624.
# NOTE(review): machine-generated; keep the string byte-identical -- the
# Inductor code cache presumably keys on the source text, verify before edits.
triton_red_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_100 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@reduction(
size_hints=[1024, 32768],
reduction_hint=ReductionHint.INNER,
filename=__file__,
meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp16', 4: '*fp16', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: '*fp32', 9: '*fp32', 10: 'i32', 11: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_100(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, in_ptr6, out_ptr0, out_ptr1, out_ptr2, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
xnumel = 624
rnumel = 25088
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
xmask = xindex < xnumel
rbase = tl.arange(0, RBLOCK)[None, :]
x0 = xindex
_tmp17 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
tmp20 = tl.load(in_ptr5 + (x0), xmask)
_tmp23 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
for roffset in range(0, rnumel, RBLOCK):
rindex = roffset + rbase
rmask = rindex < rnumel
r1 = rindex % 196
r2 = (rindex // 196)
tmp0 = tl.load(in_ptr0 + (r1 + (196*x0) + (122304*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp1 = tl.load(in_ptr1 + (x0 + (624*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp4 = tl.load(in_ptr2 + (x0 + (624*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp8 = tl.load(in_ptr3 + (r1 + (196*x0) + (122304*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp18 = tl.load(in_ptr4 + (r1 + (196*x0) + (122304*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp2 = tl.sigmoid(tmp1)
tmp3 = tmp0 * tmp2
tmp5 = 196.0
tmp6 = tmp4 / tmp5
tmp7 = tmp3 + tmp6
tmp9 = tl.sigmoid(tmp8)
tmp10 = 1.0
tmp11 = tmp10 - tmp9
tmp12 = tmp8 * tmp11
tmp13 = tmp12 + tmp10
tmp14 = tmp9 * tmp13
tmp15 = tmp7 * tmp14
tmp16 = tmp15.to(tl.float32)
_tmp17 = tl.where(rmask & xmask, _tmp17 + tmp16, _tmp17)
tmp19 = tmp18.to(tl.float32)
tmp21 = tmp19 - tmp20
tmp22 = tmp16 * tmp21
_tmp23 = tl.where(rmask & xmask, _tmp23 + tmp22, _tmp23)
tmp17 = tl.sum(_tmp17, 1)[:, None]
tl.store(out_ptr0 + x0, tmp17, xmask)
tmp23 = tl.sum(_tmp23, 1)[:, None]
tl.store(out_ptr1 + x0, tmp23, xmask)
tmp24 = tl.load(in_ptr6 + (x0), xmask)
tmp25 = tmp23 * tmp24
tl.store(out_ptr2 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp25, xmask)
def get_args():
arg_0 = rand_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 624, 1, 1), (624, 1, 1, 1), device='cuda:0', dtype=torch.float16)
arg_2 = rand_strided((128, 624, 1, 1), (624, 1, 1, 1), device='cuda:0', dtype=torch.float16)
arg_3 = rand_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_4 = rand_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_5 = rand_strided((1, 624, 1, 1), (624, 1, 1, 1), device='cuda:0', dtype=torch.float32)
arg_6 = rand_strided((624,), (1,), device='cuda:0', dtype=torch.float32)
arg_7 = rand_strided((624,), (1,), device='cuda:0', dtype=torch.float32)
arg_8 = rand_strided((624,), (1,), device='cuda:0', dtype=torch.float32)
arg_9 = rand_strided((624,), (1,), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7, arg_8, arg_9,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_red_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_100.run(*args, 624, 25088, grid=grid(624), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_red_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_100.benchmark_all_configs(*args, 624, 25088, grid=grid(624))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/d3/cd3muetrknv5a272355llgd7oce4qcl4nznkklo4quwqquswhe2k.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.add, aten.clone, aten.convolution_backward, aten.div, aten.fill, aten.mul, aten.native_batch_norm_backward, aten.sigmoid, aten.sub
# aten._native_batch_norm_legit_functional => convert_element_type_221
# aten.add => add_339, add_340
# aten.clone => clone_25
# aten.convolution_backward => convolution_backward_69
# aten.div => div_8
# aten.fill => full_like_22
# aten.mul => mul_769, mul_775, mul_776, mul_777
# aten.native_batch_norm_backward => convert_element_type_582, convert_element_type_584, mul_784, mul_785, sub_181, sub_183, sub_184
# aten.sigmoid => sigmoid_35, sigmoid_86
# aten.sub => sub_180
# Pointwise companion to the _100 reduction above: over 15,654,912
# (= 128 x 624 x 14 x 14) elements it rebuilds the same SiLU/gate gradient,
# then applies the batch-norm-backward input-gradient formula using the
# per-channel statistics (in_ptr5..in_ptr9); 3.985969387755102e-05 is
# 1/25088, the reciprocal of the per-channel reduction count.  Output fp16.
# NOTE(review): machine-generated; keep the string byte-identical -- the
# Inductor code cache presumably keys on the source text, verify before edits.
triton_poi_fused__native_batch_norm_legit_functional_add_clone_convolution_backward_div_fill_mul_native_batch_norm_backward_sigmoid_sub_101 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[16777216], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp16', 4: '*fp16', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: '*fp32', 9: '*fp32', 10: '*fp16', 11: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11), equal_to_1=())]})
@triton.jit
def triton_poi_fused__native_batch_norm_legit_functional_add_clone_convolution_backward_div_fill_mul_native_batch_norm_backward_sigmoid_sub_101(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, in_ptr6, in_ptr7, in_ptr8, in_ptr9, out_ptr1, xnumel, XBLOCK : tl.constexpr):
xnumel = 15654912
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x3 = xindex
x4 = (xindex // 196)
x1 = (xindex // 196) % 624
tmp0 = tl.load(in_ptr0 + (x3), None).to(tl.float32)
tmp1 = tl.load(in_ptr1 + (x4), None).to(tl.float32)
tmp4 = tl.load(in_ptr2 + (x4), None).to(tl.float32)
tmp8 = tl.load(in_ptr3 + (x3), None).to(tl.float32)
tmp17 = tl.load(in_ptr4 + (x3), None).to(tl.float32)
tmp19 = tl.load(in_ptr5 + (x1), None)
tmp21 = tl.load(in_ptr6 + (x1), None)
tmp24 = tl.load(in_ptr7 + (x1), None)
tmp29 = tl.load(in_ptr8 + (x1), None)
tmp32 = tl.load(in_ptr9 + (x1), None)
tmp2 = tl.sigmoid(tmp1)
tmp3 = tmp0 * tmp2
tmp5 = 196.0
tmp6 = tmp4 / tmp5
tmp7 = tmp3 + tmp6
tmp9 = tl.sigmoid(tmp8)
tmp10 = 1.0
tmp11 = tmp10 - tmp9
tmp12 = tmp8 * tmp11
tmp13 = tmp12 + tmp10
tmp14 = tmp9 * tmp13
tmp15 = tmp7 * tmp14
tmp16 = tmp15.to(tl.float32)
tmp18 = tmp17.to(tl.float32)
tmp20 = tmp18 - tmp19
tmp22 = 3.985969387755102e-05
tmp23 = tmp21 * tmp22
tmp25 = tmp24 * tmp24
tmp26 = tmp23 * tmp25
tmp27 = tmp20 * tmp26
tmp28 = tmp16 - tmp27
tmp30 = tmp29 * tmp22
tmp31 = tmp28 - tmp30
tmp33 = tmp24 * tmp32
tmp34 = tmp31 * tmp33
tmp35 = tmp34.to(tl.float32)
tl.store(out_ptr1 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp35, None)
def get_args():
arg_0 = rand_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 624, 1, 1), (624, 1, 1, 1), device='cuda:0', dtype=torch.float16)
arg_2 = rand_strided((128, 624, 1, 1), (624, 1, 1, 1), device='cuda:0', dtype=torch.float16)
arg_3 = rand_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_4 = rand_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_5 = rand_strided((1, 624, 1, 1), (624, 1, 1, 1), device='cuda:0', dtype=torch.float32)
arg_6 = rand_strided((624,), (1,), device='cuda:0', dtype=torch.float32)
arg_7 = rand_strided((624,), (1,), device='cuda:0', dtype=torch.float32)
arg_8 = rand_strided((624,), (1,), device='cuda:0', dtype=torch.float32)
arg_9 = rand_strided((624,), (1,), device='cuda:0', dtype=torch.float32)
arg_10 = rand_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7, arg_8, arg_9, arg_10,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__native_batch_norm_legit_functional_add_clone_convolution_backward_div_fill_mul_native_batch_norm_backward_sigmoid_sub_101.run(*args, 15654912, grid=grid(15654912), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__native_batch_norm_legit_functional_add_clone_convolution_backward_div_fill_mul_native_batch_norm_backward_sigmoid_sub_101.benchmark_all_configs(*args, 15654912, grid=grid(15654912))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/23/c23cf4zcm2zsqnix7cwevl5uqnq6e7k4nrleu5hxvvucflqj7ztk.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_585
# Pointwise fp16 -> fp32 upcast over 5,616 elements of a (624, 1, 3, 3)
# tensor (shapes per get_args inside the string); the (C, 1, 3, 3) shape
# suggests a depthwise 3x3 weight -- confirm against the caller.
# NOTE(review): machine-generated; keep the string byte-identical -- the
# Inductor code cache presumably keys on the source text, verify before edits.
triton_poi_fused__to_copy_102 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[8192], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_102(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 5616
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((624, 1, 3, 3), (9, 9, 3, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((624, 1, 3, 3), (9, 9, 3, 1), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_102.run(*args, 5616, grid=grid(5616), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_102.benchmark_all_configs(*args, 5616, grid=grid(5616))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/xe/cxe5wowu7imk523mjj4gviz3cjwfkjjg7yenewa23cv4xlyi265g.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.mul, aten.native_batch_norm_backward
# aten._native_batch_norm_legit_functional => convert_element_type_216
# aten.mul => mul_789
# aten.native_batch_norm_backward => convert_element_type_586, mul_790, mul_798, sub_186, sum_74, sum_75
triton_red_fused__native_batch_norm_legit_functional_mul_native_batch_norm_backward_103 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@reduction(
size_hints=[1024, 32768],
reduction_hint=ReductionHint.INNER,
filename=__file__,
meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp32', 4: '*fp32', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: 'i32', 9: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_mul_native_batch_norm_backward_103(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, out_ptr0, out_ptr1, out_ptr2, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
xnumel = 624
rnumel = 25088
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
xmask = xindex < xnumel
rbase = tl.arange(0, RBLOCK)[None, :]
x0 = xindex
_tmp4 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
tmp7 = tl.load(in_ptr3 + (x0), xmask)
_tmp10 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
for roffset in range(0, rnumel, RBLOCK):
rindex = roffset + rbase
rmask = rindex < rnumel
r1 = rindex % 196
r2 = (rindex // 196)
tmp0 = tl.load(in_ptr0 + (r1 + (196*x0) + (122304*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp1 = tl.load(in_ptr1 + (r1 + (196*x0) + (122304*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp5 = tl.load(in_ptr2 + (r1 + (196*x0) + (122304*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp2 = tmp0 * tmp1
tmp3 = tmp2.to(tl.float32)
_tmp4 = tl.where(rmask & xmask, _tmp4 + tmp3, _tmp4)
tmp6 = tmp5.to(tl.float32)
tmp8 = tmp6 - tmp7
tmp9 = tmp3 * tmp8
_tmp10 = tl.where(rmask & xmask, _tmp10 + tmp9, _tmp10)
tmp4 = tl.sum(_tmp4, 1)[:, None]
tl.store(out_ptr0 + x0, tmp4, xmask)
tmp10 = tl.sum(_tmp10, 1)[:, None]
tl.store(out_ptr1 + x0, tmp10, xmask)
tmp11 = tl.load(in_ptr4 + (x0), xmask)
tmp12 = tmp10 * tmp11
tl.store(out_ptr2 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp12, xmask)
def get_args():
arg_0 = rand_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_2 = rand_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_3 = rand_strided((1, 624, 1, 1), (624, 1, 1, 1), device='cuda:0', dtype=torch.float32)
arg_4 = rand_strided((624,), (1,), device='cuda:0', dtype=torch.float32)
arg_5 = rand_strided((624,), (1,), device='cuda:0', dtype=torch.float32)
arg_6 = rand_strided((624,), (1,), device='cuda:0', dtype=torch.float32)
arg_7 = rand_strided((624,), (1,), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_red_fused__native_batch_norm_legit_functional_mul_native_batch_norm_backward_103.run(*args, 624, 25088, grid=grid(624), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_red_fused__native_batch_norm_legit_functional_mul_native_batch_norm_backward_103.benchmark_all_configs(*args, 624, 25088, grid=grid(624))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/n6/cn6vgxz3swb2xjokxek7wjtmjhdz6hdzplejuatnteeyvplenl3a.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.convolution_backward, aten.mul, aten.native_batch_norm_backward
# aten._native_batch_norm_legit_functional => convert_element_type_216
# aten.convolution_backward => convolution_backward_70
# aten.mul => mul_789
# aten.native_batch_norm_backward => convert_element_type_586, convert_element_type_588, mul_796, mul_797, sub_186, sub_188, sub_189
triton_poi_fused__native_batch_norm_legit_functional_convolution_backward_mul_native_batch_norm_backward_104 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[16777216], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp32', 4: '*fp32', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8), equal_to_1=())]})
@triton.jit
def triton_poi_fused__native_batch_norm_legit_functional_convolution_backward_mul_native_batch_norm_backward_104(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, in_ptr6, xnumel, XBLOCK : tl.constexpr):
xnumel = 15654912
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x3 = xindex
x1 = (xindex // 196) % 624
tmp0 = tl.load(in_out_ptr0 + (x3), None).to(tl.float32)
tmp1 = tl.load(in_ptr0 + (x3), None).to(tl.float32)
tmp4 = tl.load(in_ptr1 + (x3), None).to(tl.float32)
tmp6 = tl.load(in_ptr2 + (x1), None)
tmp8 = tl.load(in_ptr3 + (x1), None)
tmp11 = tl.load(in_ptr4 + (x1), None)
tmp16 = tl.load(in_ptr5 + (x1), None)
tmp19 = tl.load(in_ptr6 + (x1), None)
tmp2 = tmp0 * tmp1
tmp3 = tmp2.to(tl.float32)
tmp5 = tmp4.to(tl.float32)
tmp7 = tmp5 - tmp6
tmp9 = 3.985969387755102e-05
tmp10 = tmp8 * tmp9
tmp12 = tmp11 * tmp11
tmp13 = tmp10 * tmp12
tmp14 = tmp7 * tmp13
tmp15 = tmp3 - tmp14
tmp17 = tmp16 * tmp9
tmp18 = tmp15 - tmp17
tmp20 = tmp11 * tmp19
tmp21 = tmp18 * tmp20
tmp22 = tmp21.to(tl.float32)
tl.store(in_out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp22, None)
def get_args():
arg_0 = rand_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_2 = rand_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_3 = rand_strided((1, 624, 1, 1), (624, 1, 1, 1), device='cuda:0', dtype=torch.float32)
arg_4 = rand_strided((624,), (1,), device='cuda:0', dtype=torch.float32)
arg_5 = rand_strided((624,), (1,), device='cuda:0', dtype=torch.float32)
arg_6 = rand_strided((624,), (1,), device='cuda:0', dtype=torch.float32)
arg_7 = rand_strided((624,), (1,), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__native_batch_norm_legit_functional_convolution_backward_mul_native_batch_norm_backward_104.run(*args, 15654912, grid=grid(15654912), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__native_batch_norm_legit_functional_convolution_backward_mul_native_batch_norm_backward_104.benchmark_all_configs(*args, 15654912, grid=grid(15654912))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/iv/civ2oft43774ffyg3v5kh23rmteh5qvbuafa2jrfla6exeuvk3sq.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_589
triton_poi_fused__to_copy_105 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[65536], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_105(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 64896
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((624, 104, 1, 1), (104, 1, 1, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((624, 104, 1, 1), (104, 1, 1, 1), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_105.run(*args, 64896, grid=grid(64896), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_105.benchmark_all_configs(*args, 64896, grid=grid(64896))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/j7/cj7ui4cmg7zcaen4xj4vc2j54wf3j5vlszxmshcee37dzcpqxdcd.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.native_batch_norm_backward
# aten._native_batch_norm_legit_functional => convert_element_type_213
# aten.native_batch_norm_backward => convert_element_type_590, mul_799, sub_190, sum_76, sum_77
triton_red_fused__native_batch_norm_legit_functional_native_batch_norm_backward_106 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@reduction(
size_hints=[512, 8192],
reduction_hint=ReductionHint.INNER,
filename=__file__,
meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: 'i32', 6: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_native_batch_norm_backward_106(in_ptr0, in_ptr1, in_ptr2, out_ptr0, out_ptr1, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
xnumel = 416
rnumel = 6272
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
xmask = xindex < xnumel
rbase = tl.arange(0, RBLOCK)[None, :]
x0 = xindex % 104
x1 = (xindex // 104)
_tmp2 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
x3 = xindex
tmp5 = tl.load(in_ptr2 + (x0), xmask)
_tmp8 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
for roffset in range(0, rnumel, RBLOCK):
rindex = roffset + rbase
rmask = rindex < rnumel
r2 = rindex
tmp0 = tl.load(in_ptr0 + ((196*x0) + (20384*(r2 // 196)) + (652288*x1) + (r2 % 196)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp3 = tl.load(in_ptr1 + ((196*x0) + (20384*(r2 // 196)) + (652288*x1) + (r2 % 196)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
_tmp2 = tl.where(rmask & xmask, _tmp2 + tmp1, _tmp2)
tmp4 = tmp3.to(tl.float32)
tmp6 = tmp4 - tmp5
tmp7 = tmp1 * tmp6
_tmp8 = tl.where(rmask & xmask, _tmp8 + tmp7, _tmp8)
tmp2 = tl.sum(_tmp2, 1)[:, None]
tl.store(out_ptr0 + x3, tmp2, xmask)
tmp8 = tl.sum(_tmp8, 1)[:, None]
tl.store(out_ptr1 + x3, tmp8, xmask)
def get_args():
arg_0 = rand_strided((128, 104, 14, 14), (20384, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 104, 14, 14), (20384, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_2 = rand_strided((1, 104, 1, 1), (104, 1, 1, 1), device='cuda:0', dtype=torch.float32)
arg_3 = rand_strided((104, 4), (1, 104), device='cuda:0', dtype=torch.float32)
arg_4 = rand_strided((104, 4), (1, 104), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1, arg_2, arg_3, arg_4,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_red_fused__native_batch_norm_legit_functional_native_batch_norm_backward_106.run(*args, 416, 6272, grid=grid(416), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_red_fused__native_batch_norm_legit_functional_native_batch_norm_backward_106.benchmark_all_configs(*args, 416, 6272, grid=grid(416))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/at/catlmgmveca6eegfxgggc25rzm3umawxz52w4d3665iwc5nkqdhv.py
# Original ATen: aten.native_batch_norm_backward
# aten.native_batch_norm_backward => convert_element_type_590, sum_76
triton_per_fused_native_batch_norm_backward_107 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import persistent_reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@persistent_reduction(
size_hints=[128, 4],
reduction_hint=ReductionHint.INNER,
filename=__file__,
meta={'signature': {0: '*fp32', 1: '*fp32', 2: 'i32', 3: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1), equal_to_1=())]}
)
@triton.jit
def triton_per_fused_native_batch_norm_backward_107(in_ptr0, out_ptr0, xnumel, rnumel, XBLOCK : tl.constexpr):
xnumel = 104
rnumel = 4
RBLOCK: tl.constexpr = 4
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
xmask = xindex < xnumel
rindex = tl.arange(0, RBLOCK)[None, :]
rmask = rindex < rnumel
r1 = rindex
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0 + (104*r1)), rmask & xmask, other=0)
tmp2 = tl.where(rmask & xmask, tmp0, 0)
tmp3 = tl.sum(tmp2, 1)[:, None]
tl.store(out_ptr0 + x0, tmp3, xmask)
def get_args():
arg_0 = rand_strided((104, 4), (1, 104), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((104,), (1,), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_per_fused_native_batch_norm_backward_107.run(*args, 104, 4, grid=grid(104), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_per_fused_native_batch_norm_backward_107.benchmark_all_configs(*args, 104, 4, grid=grid(104))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/sw/cswvpkxc2jcvqnphnzcluqix5sdpoevu3y2llapfjncqthtvmuif.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.native_batch_norm_backward
# aten._native_batch_norm_legit_functional => convert_element_type_213
# aten.native_batch_norm_backward => convert_element_type_590, mul_799, mul_807, sub_190, sum_77
triton_per_fused__native_batch_norm_legit_functional_native_batch_norm_backward_108 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import persistent_reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@persistent_reduction(
size_hints=[128, 4],
reduction_hint=ReductionHint.INNER,
filename=__file__,
meta={'signature': {0: '*fp32', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: 'i32', 5: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3), equal_to_1=())]}
)
@triton.jit
def triton_per_fused__native_batch_norm_legit_functional_native_batch_norm_backward_108(in_ptr0, in_ptr1, out_ptr0, out_ptr1, xnumel, rnumel, XBLOCK : tl.constexpr):
xnumel = 104
rnumel = 4
RBLOCK: tl.constexpr = 4
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
xmask = xindex < xnumel
rindex = tl.arange(0, RBLOCK)[None, :]
rmask = rindex < rnumel
r1 = rindex
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0 + (104*r1)), rmask & xmask, other=0)
tmp4 = tl.load(in_ptr1 + (x0), xmask)
tmp2 = tl.where(rmask & xmask, tmp0, 0)
tmp3 = tl.sum(tmp2, 1)[:, None]
tmp5 = tmp3 * tmp4
tl.store(out_ptr1 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp5, xmask)
tl.store(out_ptr0 + x0, tmp3, xmask)
def get_args():
arg_0 = rand_strided((104, 4), (1, 104), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((104,), (1,), device='cuda:0', dtype=torch.float32)
arg_2 = rand_strided((104,), (1,), device='cuda:0', dtype=torch.float32)
arg_3 = rand_strided((104,), (1,), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1, arg_2, arg_3,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_per_fused__native_batch_norm_legit_functional_native_batch_norm_backward_108.run(*args, 104, 4, grid=grid(104), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_per_fused__native_batch_norm_legit_functional_native_batch_norm_backward_108.benchmark_all_configs(*args, 104, 4, grid=grid(104))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/6d/c6dicjfqwciclb5imlva4adfrsslgvcv4kzchyl4zcmedltgvjfk.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.native_batch_norm_backward
# aten._native_batch_norm_legit_functional => convert_element_type_213
# aten.native_batch_norm_backward => convert_element_type_590, convert_element_type_592, mul_805, mul_806, sub_190, sub_192, sub_193
triton_poi_fused__native_batch_norm_legit_functional_native_batch_norm_backward_109 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp32', 6: '*fp32', 7: '*fp16', 8: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8), equal_to_1=())]})
@triton.jit
def triton_poi_fused__native_batch_norm_legit_functional_native_batch_norm_backward_109(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, in_ptr6, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 2609152
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x3 = xindex
x1 = (xindex // 196) % 104
tmp0 = tl.load(in_ptr0 + (x3), None).to(tl.float32)
tmp2 = tl.load(in_ptr1 + (x3), None).to(tl.float32)
tmp4 = tl.load(in_ptr2 + (x1), None)
tmp6 = tl.load(in_ptr3 + (x1), None)
tmp9 = tl.load(in_ptr4 + (x1), None)
tmp14 = tl.load(in_ptr5 + (x1), None)
tmp17 = tl.load(in_ptr6 + (x1), None)
tmp1 = tmp0.to(tl.float32)
tmp3 = tmp2.to(tl.float32)
tmp5 = tmp3 - tmp4
tmp7 = 3.985969387755102e-05
tmp8 = tmp6 * tmp7
tmp10 = tmp9 * tmp9
tmp11 = tmp8 * tmp10
tmp12 = tmp5 * tmp11
tmp13 = tmp1 - tmp12
tmp15 = tmp14 * tmp7
tmp16 = tmp13 - tmp15
tmp18 = tmp9 * tmp17
tmp19 = tmp16 * tmp18
tmp20 = tmp19.to(tl.float32)
tl.store(out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp20, None)
def get_args():
arg_0 = rand_strided((128, 104, 14, 14), (20384, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 104, 14, 14), (20384, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_2 = rand_strided((1, 104, 1, 1), (104, 1, 1, 1), device='cuda:0', dtype=torch.float32)
arg_3 = rand_strided((104,), (1,), device='cuda:0', dtype=torch.float32)
arg_4 = rand_strided((104,), (1,), device='cuda:0', dtype=torch.float32)
arg_5 = rand_strided((104,), (1,), device='cuda:0', dtype=torch.float32)
arg_6 = rand_strided((104,), (1,), device='cuda:0', dtype=torch.float32)
arg_7 = rand_strided((128, 104, 14, 14), (20384, 196, 14, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__native_batch_norm_legit_functional_native_batch_norm_backward_109.run(*args, 2609152, grid=grid(2609152), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__native_batch_norm_legit_functional_native_batch_norm_backward_109.benchmark_all_configs(*args, 2609152, grid=grid(2609152))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/u6/cu6gj4lj3w4o4rbwgegqtcfzcryyabv5bfn4fymj6h576kucjrrk.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_593
triton_poi_fused__to_copy_110 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[16384], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_110(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 16224
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((52, 312, 1, 1), (312, 1, 1, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((52, 312, 1, 1), (312, 1, 1, 1), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_110.run(*args, 16224, grid=grid(16224), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_110.benchmark_all_configs(*args, 16224, grid=grid(16224))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/cq/ccq5bqvz5gysfuffc2onzn5q2itdz5cceokvaxh3r7x3536x7ffr.py
# Original ATen: aten.cat
# aten.cat => cat_57
triton_poi_fused_cat_111 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[8388608], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_cat_111(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 7827456
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x2 = xindex
x0 = xindex % 61152
x1 = (xindex // 61152)
tmp0 = tl.load(in_ptr0 + (x2), None).to(tl.float32)
tl.store(out_ptr0 + (x0 + (122304*x1) + tl.zeros([XBLOCK], tl.int32)), tmp0, None)
def get_args():
arg_0 = rand_strided((128, 312, 14, 14), (61152, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 312, 14, 14), (122304, 196, 14, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_cat_111.run(*args, 7827456, grid=grid(7827456), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_cat_111.benchmark_all_configs(*args, 7827456, grid=grid(7827456))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/wo/cwolgc4dw6izbizjkqhmxp5s3e2v7ufjibf3admpysa3tcfcvbwi.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_598
triton_poi_fused__to_copy_112 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[16384], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_112(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 16224
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((624, 26, 1, 1), (26, 1, 1, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((624, 26, 1, 1), (26, 1, 1, 1), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_112.run(*args, 16224, grid=grid(16224), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_112.benchmark_all_configs(*args, 16224, grid=grid(16224))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/zs/czsqkpgzlfpo7vf22txwb4vktp6li35xdtdplz23vaooelx52mgz.py
# Original ATen: aten.add, aten.clone, aten.fill, aten.mul, aten.sigmoid, aten.sub
# aten.add => add_342
# aten.clone => clone_23
# aten.fill => full_like_24
# aten.mul => mul_812, mul_813, mul_814
# aten.sigmoid => sigmoid_88
# aten.sub => sub_195
# NOTE(review): Inductor-generated Triton kernel held as a raw source string;
# AsyncCompile compiles it lazily, so nothing inside the string executes at
# import time. The embedded module also ships a standalone benchmark harness
# (get_args / call / benchmark_all_configs / __main__). Do not edit the string
# body — its bytes are the program that gets compiled.
# Semantics (from the kernel body): in-place update over 3328 fp16 values
# (get_args: 128x26x1x1):
#   grad *= s * (1 + x * (1 - s)),  where s = sigmoid(x)
# which is d/dx[x * sigmoid(x)] applied to the incoming gradient;
# in_out_ptr0 is both read and overwritten (see mutated_arg_names).
triton_poi_fused_add_clone_fill_mul_sigmoid_sub_113 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4096], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_add_clone_fill_mul_sigmoid_sub_113(in_out_ptr0, in_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 3328
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_out_ptr0 + (x0), xmask).to(tl.float32)
tmp1 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
tmp2 = tl.sigmoid(tmp1)
tmp3 = 1.0
tmp4 = tmp3 - tmp2
tmp5 = tmp1 * tmp4
tmp6 = tmp5 + tmp3
tmp7 = tmp2 * tmp6
tmp8 = tmp0 * tmp7
tl.store(in_out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp8, xmask)
def get_args():
arg_0 = rand_strided((128, 26, 1, 1), (26, 1, 1, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 26, 1, 1), (26, 1, 1, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_add_clone_fill_mul_sigmoid_sub_113.run(*args, 3328, grid=grid(3328), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_add_clone_fill_mul_sigmoid_sub_113.benchmark_all_configs(*args, 3328, grid=grid(3328))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/br/cbrdby4wbj6lkywp7ihnm6qs7bmx7qo2e45i7oxtqq4xbl3oog4o.py
# Original ATen: aten._to_copy, aten.convolution_backward
# aten._to_copy => convert_element_type_601
# aten.convolution_backward => sum_80
# NOTE(review): Inductor-generated kernel source string, compiled lazily by
# AsyncCompile; do not edit the string body.
# Semantics: a persistent reduction — one program per output element. For each
# of the 26 channels it sums the fp16 gradient over the 128-element batch axis
# (input layout (128, 26)) and stores the per-channel fp32 sum. Per the fused
# op comment above this is presumably the convolution bias gradient (sum_80) —
# confirm against the caller.
triton_per_fused__to_copy_convolution_backward_114 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import persistent_reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@persistent_reduction(
size_hints=[32, 128],
reduction_hint=ReductionHint.INNER,
filename=__file__,
meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32', 3: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 3), equal_to_1=())]}
)
@triton.jit
def triton_per_fused__to_copy_convolution_backward_114(in_ptr0, out_ptr1, xnumel, rnumel, XBLOCK : tl.constexpr):
xnumel = 26
rnumel = 128
RBLOCK: tl.constexpr = 128
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
xmask = xindex < xnumel
rindex = tl.arange(0, RBLOCK)[None, :]
rmask = rindex < rnumel
r1 = rindex
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0 + (26*r1)), rmask & xmask, other=0).to(tl.float32)
tmp2 = tl.where(rmask & xmask, tmp0, 0)
tmp3 = tl.sum(tmp2, 1)[:, None]
tmp4 = tmp3.to(tl.float32)
tl.store(out_ptr1 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp4, xmask)
def get_args():
arg_0 = rand_strided((128, 26, 1, 1), (26, 1, 1, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((26,), (1,), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_per_fused__to_copy_convolution_backward_114.run(*args, 26, 128, grid=grid(26), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_per_fused__to_copy_convolution_backward_114.benchmark_all_configs(*args, 26, 128, grid=grid(26))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/4u/c4uahgnz2ykwhxsj7bueioargms6lhe32ffh6vzmxhridx2iurdm.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_600
# NOTE(review): Inductor-generated kernel source string, compiled lazily by
# AsyncCompile; do not edit the string body.
# Semantics: plain elementwise fp16 -> fp32 cast of 16224 contiguous values
# (get_args: a (26, 624, 1, 1) tensor, presumably a conv weight — confirm
# against the caller). The double .to(tl.float32) is redundant but harmless
# generated code.
triton_poi_fused__to_copy_115 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[16384], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_115(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 16224
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((26, 624, 1, 1), (624, 1, 1, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((26, 624, 1, 1), (624, 1, 1, 1), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_115.run(*args, 16224, grid=grid(16224), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_115.benchmark_all_configs(*args, 16224, grid=grid(16224))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/ke/cke6ikrfgpb6sjssa2vwqbbb346zboallywrrkyxxcehakhefz5y.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.add, aten.clone, aten.div, aten.fill, aten.mul, aten.native_batch_norm_backward, aten.sigmoid, aten.sub
# aten._native_batch_norm_legit_functional => convert_element_type_201
# aten.add => add_343, add_344
# aten.clone => clone_22
# aten.div => div_9
# aten.fill => full_like_25
# aten.mul => mul_809, mul_815, mul_816, mul_817
# aten.native_batch_norm_backward => convert_element_type_602, mul_824, sub_197, sub_199, sub_200
# aten.sigmoid => sigmoid_31, sigmoid_89
# aten.sub => sub_196
# NOTE(review): Inductor-generated kernel source string, compiled lazily by
# AsyncCompile; do not edit the string body.
# Semantics: one fused pointwise pass over 15,654,912 elements of a
# (128, 624, 14, 14) fp16 activation gradient. Per element it:
#   1. combines grad*sigmoid(gate) with a broadcast per-(n,c) term divided by
#      196 (= 14*14, presumably the average-pool backward — confirm);
#   2. multiplies by s*(1 + x*(1-s)), s = sigmoid(x) — d/dx[x*sigmoid(x)];
#   3. applies the batch-norm input-gradient terms using per-channel
#      mean/sum/invstd buffers (in_ptr5..in_ptr8). The constant
#      3.985969387755102e-05 equals 1/(128*196), the reduction count.
# Output is fp32 (out_ptr0); indices: x1 = channel, x4 = (batch, channel).
triton_poi_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_116 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[16777216], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp16', 4: '*fp16', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: '*fp32', 9: '*fp32', 10: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10), equal_to_1=())]})
@triton.jit
def triton_poi_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_116(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, in_ptr6, in_ptr7, in_ptr8, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 15654912
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x3 = xindex
x4 = (xindex // 196)
x1 = (xindex // 196) % 624
tmp0 = tl.load(in_ptr0 + (x3), None).to(tl.float32)
tmp1 = tl.load(in_ptr1 + (x4), None).to(tl.float32)
tmp4 = tl.load(in_ptr2 + (x4), None).to(tl.float32)
tmp8 = tl.load(in_ptr3 + (x3), None).to(tl.float32)
tmp17 = tl.load(in_ptr4 + (x3), None).to(tl.float32)
tmp19 = tl.load(in_ptr5 + (x1), None)
tmp21 = tl.load(in_ptr6 + (x1), None)
tmp24 = tl.load(in_ptr7 + (x1), None)
tmp29 = tl.load(in_ptr8 + (x1), None)
tmp2 = tl.sigmoid(tmp1)
tmp3 = tmp0 * tmp2
tmp5 = 196.0
tmp6 = tmp4 / tmp5
tmp7 = tmp3 + tmp6
tmp9 = tl.sigmoid(tmp8)
tmp10 = 1.0
tmp11 = tmp10 - tmp9
tmp12 = tmp8 * tmp11
tmp13 = tmp12 + tmp10
tmp14 = tmp9 * tmp13
tmp15 = tmp7 * tmp14
tmp16 = tmp15.to(tl.float32)
tmp18 = tmp17.to(tl.float32)
tmp20 = tmp18 - tmp19
tmp22 = 3.985969387755102e-05
tmp23 = tmp21 * tmp22
tmp25 = tmp24 * tmp24
tmp26 = tmp23 * tmp25
tmp27 = tmp20 * tmp26
tmp28 = tmp16 - tmp27
tmp30 = tmp29 * tmp22
tmp31 = tmp28 - tmp30
tl.store(out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp31, None)
def get_args():
arg_0 = rand_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 624, 1, 1), (624, 1, 1, 1), device='cuda:0', dtype=torch.float16)
arg_2 = rand_strided((128, 624, 1, 1), (624, 1, 1, 1), device='cuda:0', dtype=torch.float16)
arg_3 = rand_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_4 = rand_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_5 = rand_strided((1, 624, 1, 1), (624, 1, 1, 1), device='cuda:0', dtype=torch.float32)
arg_6 = rand_strided((624,), (1,), device='cuda:0', dtype=torch.float32)
arg_7 = rand_strided((624,), (1,), device='cuda:0', dtype=torch.float32)
arg_8 = rand_strided((624,), (1,), device='cuda:0', dtype=torch.float32)
arg_9 = rand_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7, arg_8, arg_9,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_116.run(*args, 15654912, grid=grid(15654912), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_116.benchmark_all_configs(*args, 15654912, grid=grid(15654912))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/v7/cv7sl67sz2xaf4l6bv2yby2sj22h2574hqzu7lld6chl75bgzok7.py
# Original ATen: aten.convolution_backward
# aten.convolution_backward => convolution_backward_75
# NOTE(review): Inductor-generated kernel source string, compiled lazily by
# AsyncCompile; do not edit the string body.
# Semantics: extracts channels 468..623 of a (128, 624, 14, 14) fp32 gradient
# (input offset 91728 = 468*196, per-channel param offset +468), scales each
# element by the product of two per-channel fp32 vectors (in_ptr1 * in_ptr2),
# and stores the result as a contiguous (128, 156, 14, 14) fp16 tensor — the
# grad slice fed into one group's convolution backward.
triton_poi_fused_convolution_backward_117 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp32', 2: '*fp32', 3: '*fp16', 4: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4), equal_to_1=())]})
@triton.jit
def triton_poi_fused_convolution_backward_117(in_ptr0, in_ptr1, in_ptr2, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 3913728
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x2 = (xindex // 30576)
x3 = xindex % 30576
x1 = (xindex // 196) % 156
x4 = xindex
tmp0 = tl.load(in_ptr0 + (91728 + x3 + (122304*x2)), None)
tmp1 = tl.load(in_ptr1 + (468 + x1), None)
tmp2 = tl.load(in_ptr2 + (468 + x1), None)
tmp3 = tmp1 * tmp2
tmp4 = tmp0 * tmp3
tmp5 = tmp4.to(tl.float32)
tl.store(out_ptr0 + (x4 + tl.zeros([XBLOCK], tl.int32)), tmp5, None)
def get_args():
arg_0 = rand_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((624,), (1,), device='cuda:0', dtype=torch.float32)
arg_2 = rand_strided((624,), (1,), device='cuda:0', dtype=torch.float32)
arg_3 = rand_strided((128, 156, 14, 14), (30576, 196, 14, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1, arg_2, arg_3,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_convolution_backward_117.run(*args, 3913728, grid=grid(3913728), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_convolution_backward_117.benchmark_all_configs(*args, 3913728, grid=grid(3913728))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/sn/csnxk2aehntwbtuxvtyjfxxx5pcnnxto57kxv3m57bgn74rxyzdo.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_605
# NOTE(review): Inductor-generated kernel source string, compiled lazily by
# AsyncCompile; do not edit the string body.
# Semantics: elementwise fp16 -> fp32 cast of 12636 values (get_args: a
# (156, 1, 9, 9) tensor — presumably a depthwise 9x9 conv weight, matching
# the 156-channel grad slices produced nearby; confirm against the caller).
triton_poi_fused__to_copy_118 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[16384], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_118(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 12636
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((156, 1, 9, 9), (81, 81, 9, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((156, 1, 9, 9), (81, 81, 9, 1), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_118.run(*args, 12636, grid=grid(12636), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_118.benchmark_all_configs(*args, 12636, grid=grid(12636))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/b2/cb2zxjnunwnxsrlbgr5rnqihgpyuvgs44v74hreykirhvteg3imr.py
# Original ATen: aten.convolution_backward
# aten.convolution_backward => convolution_backward_76
# NOTE(review): Inductor-generated kernel source string, compiled lazily by
# AsyncCompile; do not edit the string body.
# Semantics: same pattern as kernel 117 but for channels 312..467 of the
# (128, 624, 14, 14) fp32 gradient (input offset 61152 = 312*196, per-channel
# param offset +312): scale by in_ptr1*in_ptr2 per channel, cast to fp16, and
# write a contiguous (128, 156, 14, 14) slice for a group's conv backward.
triton_poi_fused_convolution_backward_119 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp32', 2: '*fp32', 3: '*fp16', 4: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4), equal_to_1=())]})
@triton.jit
def triton_poi_fused_convolution_backward_119(in_ptr0, in_ptr1, in_ptr2, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 3913728
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x2 = (xindex // 30576)
x3 = xindex % 30576
x1 = (xindex // 196) % 156
x4 = xindex
tmp0 = tl.load(in_ptr0 + (61152 + x3 + (122304*x2)), None)
tmp1 = tl.load(in_ptr1 + (312 + x1), None)
tmp2 = tl.load(in_ptr2 + (312 + x1), None)
tmp3 = tmp1 * tmp2
tmp4 = tmp0 * tmp3
tmp5 = tmp4.to(tl.float32)
tl.store(out_ptr0 + (x4 + tl.zeros([XBLOCK], tl.int32)), tmp5, None)
def get_args():
arg_0 = rand_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((624,), (1,), device='cuda:0', dtype=torch.float32)
arg_2 = rand_strided((624,), (1,), device='cuda:0', dtype=torch.float32)
arg_3 = rand_strided((128, 156, 14, 14), (30576, 196, 14, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1, arg_2, arg_3,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_convolution_backward_119.run(*args, 3913728, grid=grid(3913728), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_convolution_backward_119.benchmark_all_configs(*args, 3913728, grid=grid(3913728))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/mq/cmqf6tn342yiwkdctvgld7rocxgxgv3azqztra5gqfjonbr5lfkj.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_606
# NOTE(review): Inductor-generated kernel source string, compiled lazily by
# AsyncCompile; do not edit the string body.
# Semantics: elementwise fp16 -> fp32 cast of 7644 values (get_args: a
# (156, 1, 7, 7) tensor — presumably the 7x7 depthwise conv weight of the
# same mixed-kernel group as kernels 118/122/124; confirm against the caller).
triton_poi_fused__to_copy_120 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[8192], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_120(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 7644
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((156, 1, 7, 7), (49, 49, 7, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((156, 1, 7, 7), (49, 49, 7, 1), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_120.run(*args, 7644, grid=grid(7644), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_120.benchmark_all_configs(*args, 7644, grid=grid(7644))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/c6/cc6x33tgwrw2tamqam7mfhx5ek6rlk7b3nnl7psxxxxvdr23nguf.py
# Original ATen: aten.convolution_backward
# aten.convolution_backward => convolution_backward_77
# NOTE(review): Inductor-generated kernel source string, compiled lazily by
# AsyncCompile; do not edit the string body.
# Semantics: same pattern as kernels 117/119 but for channels 156..311 of the
# (128, 624, 14, 14) fp32 gradient (input offset 30576 = 156*196, per-channel
# param offset +156): scale by in_ptr1*in_ptr2 per channel, cast to fp16, and
# write a contiguous (128, 156, 14, 14) slice for a group's conv backward.
triton_poi_fused_convolution_backward_121 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp32', 2: '*fp32', 3: '*fp16', 4: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4), equal_to_1=())]})
@triton.jit
def triton_poi_fused_convolution_backward_121(in_ptr0, in_ptr1, in_ptr2, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 3913728
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x2 = (xindex // 30576)
x3 = xindex % 30576
x1 = (xindex // 196) % 156
x4 = xindex
tmp0 = tl.load(in_ptr0 + (30576 + x3 + (122304*x2)), None)
tmp1 = tl.load(in_ptr1 + (156 + x1), None)
tmp2 = tl.load(in_ptr2 + (156 + x1), None)
tmp3 = tmp1 * tmp2
tmp4 = tmp0 * tmp3
tmp5 = tmp4.to(tl.float32)
tl.store(out_ptr0 + (x4 + tl.zeros([XBLOCK], tl.int32)), tmp5, None)
def get_args():
arg_0 = rand_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((624,), (1,), device='cuda:0', dtype=torch.float32)
arg_2 = rand_strided((624,), (1,), device='cuda:0', dtype=torch.float32)
arg_3 = rand_strided((128, 156, 14, 14), (30576, 196, 14, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1, arg_2, arg_3,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_convolution_backward_121.run(*args, 3913728, grid=grid(3913728), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_convolution_backward_121.benchmark_all_configs(*args, 3913728, grid=grid(3913728))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/bo/cbod67uuatalyrpqcdu3oy7sbucan6ecl2ia4zk35wtdojhhxonj.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_607
# NOTE(review): Inductor-generated kernel source string, compiled lazily by
# AsyncCompile; do not edit the string body.
# Semantics: elementwise fp16 -> fp32 cast of 3900 values (get_args: a
# (156, 1, 5, 5) tensor — presumably the 5x5 depthwise conv weight of the
# same mixed-kernel group as kernels 118/120/124; confirm against the caller).
triton_poi_fused__to_copy_122 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4096], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_122(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 3900
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((156, 1, 5, 5), (25, 25, 5, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((156, 1, 5, 5), (25, 25, 5, 1), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_122.run(*args, 3900, grid=grid(3900), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_122.benchmark_all_configs(*args, 3900, grid=grid(3900))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/rc/crclip5jefet3ymndwzega4qujbberu5gvtz4mjx4deqjbqd4sg3.py
# Original ATen: aten.convolution_backward
# aten.convolution_backward => convolution_backward_78
# NOTE(review): Inductor-generated kernel source string, compiled lazily by
# AsyncCompile; do not edit the string body.
# Semantics: same pattern as kernels 117/119/121 but for the first slice,
# channels 0..155 of the (128, 624, 14, 14) fp32 gradient (no input or
# per-channel offset): scale by in_ptr1*in_ptr2 per channel, cast to fp16,
# and write a contiguous (128, 156, 14, 14) slice for a group's conv backward.
triton_poi_fused_convolution_backward_123 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp32', 2: '*fp32', 3: '*fp16', 4: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4), equal_to_1=())]})
@triton.jit
def triton_poi_fused_convolution_backward_123(in_ptr0, in_ptr1, in_ptr2, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 3913728
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x2 = (xindex // 30576)
x3 = xindex % 30576
x1 = (xindex // 196) % 156
x4 = xindex
tmp0 = tl.load(in_ptr0 + (x3 + (122304*x2)), None)
tmp1 = tl.load(in_ptr1 + (x1), None)
tmp2 = tl.load(in_ptr2 + (x1), None)
tmp3 = tmp1 * tmp2
tmp4 = tmp0 * tmp3
tmp5 = tmp4.to(tl.float32)
tl.store(out_ptr0 + (x4 + tl.zeros([XBLOCK], tl.int32)), tmp5, None)
def get_args():
arg_0 = rand_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((624,), (1,), device='cuda:0', dtype=torch.float32)
arg_2 = rand_strided((624,), (1,), device='cuda:0', dtype=torch.float32)
arg_3 = rand_strided((128, 156, 14, 14), (30576, 196, 14, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1, arg_2, arg_3,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_convolution_backward_123.run(*args, 3913728, grid=grid(3913728), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_convolution_backward_123.benchmark_all_configs(*args, 3913728, grid=grid(3913728))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/eu/ceukfvbcwohfoz2lavfjfb2ycp454er54mrt4pq2hnohxkqujrr3.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_608
# NOTE(review): Inductor-generated kernel source string, compiled lazily by
# AsyncCompile; do not edit the string body.
# Semantics: elementwise fp16 -> fp32 cast of 1404 values (get_args: a
# (156, 1, 3, 3) tensor — presumably the 3x3 depthwise conv weight of the
# same mixed-kernel group as kernels 118/120/122; confirm against the caller).
triton_poi_fused__to_copy_124 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[2048], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_124(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 1404
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((156, 1, 3, 3), (9, 9, 3, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((156, 1, 3, 3), (9, 9, 3, 1), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_124.run(*args, 1404, grid=grid(1404), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_124.benchmark_all_configs(*args, 1404, grid=grid(1404))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/mj/cmjohiwxqtsa56w5lbbo3aubkq7zy3joom3fttzn7hijt3v2ims3.py
# Original ATen: aten.cat
# aten.cat => cat_58
# NOTE(review): Inductor-generated kernel source string, compiled lazily by
# AsyncCompile; do not edit the string body.
# Semantics: copies a contiguous (128, 156, 14, 14) fp16 tensor into a larger
# buffer whose per-batch stride is 122304 (= 624*196), writing at offset 0 of
# each batch row — i.e. it fills the first 156-channel slot of a 624-channel
# destination; together with similar kernels writing the other slots this
# realizes the aten.cat along the channel dimension.
triton_poi_fused_cat_125 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_cat_125(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 3913728
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x2 = xindex
x0 = xindex % 30576
x1 = (xindex // 30576)
tmp0 = tl.load(in_ptr0 + (x2), None).to(tl.float32)
tl.store(out_ptr0 + (x0 + (122304*x1) + tl.zeros([XBLOCK], tl.int32)), tmp0, None)
def get_args():
arg_0 = rand_strided((128, 156, 14, 14), (30576, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 156, 14, 14), (122304, 196, 14, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_cat_125.run(*args, 3913728, grid=grid(3913728), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_cat_125.benchmark_all_configs(*args, 3913728, grid=grid(3913728))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/rn/crn2e5wpsvghyj2fewhvokcjgyf5y4tcls4p5a3zgbxt3jo63qp5.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_612
triton_poi_fused__to_copy_126 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[16384], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_126(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 16224
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((312, 52, 1, 1), (52, 1, 1, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((312, 52, 1, 1), (52, 1, 1, 1), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_126.run(*args, 16224, grid=grid(16224), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_126.benchmark_all_configs(*args, 16224, grid=grid(16224))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/if/cifaxtccprphcf3f7mkgsgusdq7va54imzb4ozr5t5rqz26r4xz3.py
# Original ATen: aten.cat
# aten.cat => cat_59
triton_poi_fused_cat_127 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[2097152], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_cat_127(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 1304576
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x2 = xindex
x0 = xindex % 10192
x1 = (xindex // 10192)
tmp0 = tl.load(in_ptr0 + (x2), None).to(tl.float32)
tl.store(out_ptr0 + (x0 + (20384*x1) + tl.zeros([XBLOCK], tl.int32)), tmp0, None)
def get_args():
arg_0 = rand_strided((128, 52, 14, 14), (10192, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 52, 14, 14), (20384, 196, 14, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_cat_127.run(*args, 1304576, grid=grid(1304576), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_cat_127.benchmark_all_configs(*args, 1304576, grid=grid(1304576))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/qf/cqfc2p7t5thx2bnsyx6ihjsqv65lizzzkgfndnkunfne6ahlxrbx.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.add, aten.native_batch_norm_backward
# aten._native_batch_norm_legit_functional => convert_element_type_189
# aten.add => add_346
# aten.native_batch_norm_backward => convert_element_type_614, mul_839, sub_206, sum_85, sum_86
triton_red_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_128 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@reduction(
size_hints=[512, 8192],
reduction_hint=ReductionHint.INNER,
filename=__file__,
meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp32', 4: '*fp32', 5: '*fp32', 6: 'i32', 7: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_128(in_ptr0, in_ptr1, in_ptr2, in_ptr3, out_ptr0, out_ptr1, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
xnumel = 416
rnumel = 6272
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
xmask = xindex < xnumel
rbase = tl.arange(0, RBLOCK)[None, :]
x0 = xindex % 104
x1 = (xindex // 104)
_tmp4 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
x3 = xindex
tmp7 = tl.load(in_ptr3 + (x0), xmask)
_tmp10 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
for roffset in range(0, rnumel, RBLOCK):
rindex = roffset + rbase
rmask = rindex < rnumel
r2 = rindex
tmp0 = tl.load(in_ptr0 + ((196*x0) + (20384*(r2 // 196)) + (652288*x1) + (r2 % 196)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp1 = tl.load(in_ptr1 + ((196*x0) + (20384*(r2 // 196)) + (652288*x1) + (r2 % 196)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp5 = tl.load(in_ptr2 + ((196*x0) + (20384*(r2 // 196)) + (652288*x1) + (r2 % 196)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp2 = tmp0 + tmp1
tmp3 = tmp2.to(tl.float32)
_tmp4 = tl.where(rmask & xmask, _tmp4 + tmp3, _tmp4)
tmp6 = tmp5.to(tl.float32)
tmp8 = tmp6 - tmp7
tmp9 = tmp3 * tmp8
_tmp10 = tl.where(rmask & xmask, _tmp10 + tmp9, _tmp10)
tmp4 = tl.sum(_tmp4, 1)[:, None]
tl.store(out_ptr0 + x3, tmp4, xmask)
tmp10 = tl.sum(_tmp10, 1)[:, None]
tl.store(out_ptr1 + x3, tmp10, xmask)
def get_args():
arg_0 = rand_strided((128, 104, 14, 14), (20384, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 104, 14, 14), (20384, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_2 = rand_strided((128, 104, 14, 14), (20384, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_3 = rand_strided((1, 104, 1, 1), (104, 1, 1, 1), device='cuda:0', dtype=torch.float32)
arg_4 = rand_strided((104, 4), (1, 104), device='cuda:0', dtype=torch.float32)
arg_5 = rand_strided((104, 4), (1, 104), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_red_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_128.run(*args, 416, 6272, grid=grid(416), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_red_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_128.benchmark_all_configs(*args, 416, 6272, grid=grid(416))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/55/c55duaucwd5mqi2jcpis5psliz6udgc3zunl4l6gsysumyukuodb.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.add, aten.native_batch_norm_backward
# aten._native_batch_norm_legit_functional => convert_element_type_189
# aten.add => add_346
# aten.native_batch_norm_backward => convert_element_type_614, convert_element_type_616, mul_845, mul_846, sub_206, sub_208, sub_209
triton_poi_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_129 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp32', 4: '*fp32', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: '*fp16', 9: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9), equal_to_1=())]})
@triton.jit
def triton_poi_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_129(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, in_ptr6, in_ptr7, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 2609152
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x3 = xindex
x1 = (xindex // 196) % 104
tmp0 = tl.load(in_ptr0 + (x3), None).to(tl.float32)
tmp1 = tl.load(in_ptr1 + (x3), None).to(tl.float32)
tmp4 = tl.load(in_ptr2 + (x3), None).to(tl.float32)
tmp6 = tl.load(in_ptr3 + (x1), None)
tmp8 = tl.load(in_ptr4 + (x1), None)
tmp11 = tl.load(in_ptr5 + (x1), None)
tmp16 = tl.load(in_ptr6 + (x1), None)
tmp19 = tl.load(in_ptr7 + (x1), None)
tmp2 = tmp0 + tmp1
tmp3 = tmp2.to(tl.float32)
tmp5 = tmp4.to(tl.float32)
tmp7 = tmp5 - tmp6
tmp9 = 3.985969387755102e-05
tmp10 = tmp8 * tmp9
tmp12 = tmp11 * tmp11
tmp13 = tmp10 * tmp12
tmp14 = tmp7 * tmp13
tmp15 = tmp3 - tmp14
tmp17 = tmp16 * tmp9
tmp18 = tmp15 - tmp17
tmp20 = tmp11 * tmp19
tmp21 = tmp18 * tmp20
tmp22 = tmp21.to(tl.float32)
tl.store(out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp22, None)
def get_args():
arg_0 = rand_strided((128, 104, 14, 14), (20384, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 104, 14, 14), (20384, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_2 = rand_strided((128, 104, 14, 14), (20384, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_3 = rand_strided((1, 104, 1, 1), (104, 1, 1, 1), device='cuda:0', dtype=torch.float32)
arg_4 = rand_strided((104,), (1,), device='cuda:0', dtype=torch.float32)
arg_5 = rand_strided((104,), (1,), device='cuda:0', dtype=torch.float32)
arg_6 = rand_strided((104,), (1,), device='cuda:0', dtype=torch.float32)
arg_7 = rand_strided((104,), (1,), device='cuda:0', dtype=torch.float32)
arg_8 = rand_strided((128, 104, 14, 14), (20384, 196, 14, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7, arg_8,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_129.run(*args, 2609152, grid=grid(2609152), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_129.benchmark_all_configs(*args, 2609152, grid=grid(2609152))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/gi/cgidg4l74j3mxppzfsjtpi42hq4ihs47f6epftki5i5ywvfqu6wv.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.add, aten.native_batch_norm_backward
# aten._native_batch_norm_legit_functional => convert_element_type_165
# aten.add => add_346, add_351
# aten.native_batch_norm_backward => convert_element_type_638, mul_879, sub_222, sum_94, sum_95
triton_red_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_130 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@reduction(
size_hints=[512, 8192],
reduction_hint=ReductionHint.INNER,
filename=__file__,
meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp16', 4: '*fp32', 5: '*fp32', 6: '*fp32', 7: 'i32', 8: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_130(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, out_ptr0, out_ptr1, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
xnumel = 416
rnumel = 6272
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
xmask = xindex < xnumel
rbase = tl.arange(0, RBLOCK)[None, :]
x0 = xindex % 104
x1 = (xindex // 104)
_tmp6 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
x3 = xindex
tmp9 = tl.load(in_ptr4 + (x0), xmask)
_tmp12 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
for roffset in range(0, rnumel, RBLOCK):
rindex = roffset + rbase
rmask = rindex < rnumel
r2 = rindex
tmp0 = tl.load(in_ptr0 + ((196*x0) + (20384*(r2 // 196)) + (652288*x1) + (r2 % 196)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp1 = tl.load(in_ptr1 + ((196*x0) + (20384*(r2 // 196)) + (652288*x1) + (r2 % 196)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp3 = tl.load(in_ptr2 + ((196*x0) + (20384*(r2 // 196)) + (652288*x1) + (r2 % 196)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp7 = tl.load(in_ptr3 + ((196*x0) + (20384*(r2 // 196)) + (652288*x1) + (r2 % 196)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp2 = tmp0 + tmp1
tmp4 = tmp2 + tmp3
tmp5 = tmp4.to(tl.float32)
_tmp6 = tl.where(rmask & xmask, _tmp6 + tmp5, _tmp6)
tmp8 = tmp7.to(tl.float32)
tmp10 = tmp8 - tmp9
tmp11 = tmp5 * tmp10
_tmp12 = tl.where(rmask & xmask, _tmp12 + tmp11, _tmp12)
tmp6 = tl.sum(_tmp6, 1)[:, None]
tl.store(out_ptr0 + x3, tmp6, xmask)
tmp12 = tl.sum(_tmp12, 1)[:, None]
tl.store(out_ptr1 + x3, tmp12, xmask)
def get_args():
arg_0 = rand_strided((128, 104, 14, 14), (20384, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 104, 14, 14), (20384, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_2 = rand_strided((128, 104, 14, 14), (20384, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_3 = rand_strided((128, 104, 14, 14), (20384, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_4 = rand_strided((1, 104, 1, 1), (104, 1, 1, 1), device='cuda:0', dtype=torch.float32)
arg_5 = rand_strided((104, 4), (1, 104), device='cuda:0', dtype=torch.float32)
arg_6 = rand_strided((104, 4), (1, 104), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_red_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_130.run(*args, 416, 6272, grid=grid(416), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_red_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_130.benchmark_all_configs(*args, 416, 6272, grid=grid(416))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/nf/cnfybinrqgwpzwkognoekbtqzfldbno4q6jduhaywjxls5lwqbsd.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.add, aten.native_batch_norm_backward
# aten._native_batch_norm_legit_functional => convert_element_type_165
# aten.add => add_346, add_351
# aten.native_batch_norm_backward => convert_element_type_638, mul_885, mul_886, sub_222, sub_224, sub_225
triton_poi_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_131 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp16', 4: '*fp32', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: '*fp32', 9: '*fp32', 10: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10), equal_to_1=())]})
@triton.jit
def triton_poi_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_131(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, in_ptr6, in_ptr7, in_ptr8, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 2609152
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x3 = xindex
x1 = (xindex // 196) % 104
tmp0 = tl.load(in_ptr0 + (x3), None).to(tl.float32)
tmp1 = tl.load(in_ptr1 + (x3), None).to(tl.float32)
tmp3 = tl.load(in_ptr2 + (x3), None).to(tl.float32)
tmp6 = tl.load(in_ptr3 + (x3), None).to(tl.float32)
tmp8 = tl.load(in_ptr4 + (x1), None)
tmp10 = tl.load(in_ptr5 + (x1), None)
tmp13 = tl.load(in_ptr6 + (x1), None)
tmp18 = tl.load(in_ptr7 + (x1), None)
tmp21 = tl.load(in_ptr8 + (x1), None)
tmp2 = tmp0 + tmp1
tmp4 = tmp2 + tmp3
tmp5 = tmp4.to(tl.float32)
tmp7 = tmp6.to(tl.float32)
tmp9 = tmp7 - tmp8
tmp11 = 3.985969387755102e-05
tmp12 = tmp10 * tmp11
tmp14 = tmp13 * tmp13
tmp15 = tmp12 * tmp14
tmp16 = tmp9 * tmp15
tmp17 = tmp5 - tmp16
tmp19 = tmp18 * tmp11
tmp20 = tmp17 - tmp19
tmp22 = tmp13 * tmp21
tmp23 = tmp20 * tmp22
tl.store(out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp23, None)
def get_args():
arg_0 = rand_strided((128, 104, 14, 14), (20384, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 104, 14, 14), (20384, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_2 = rand_strided((128, 104, 14, 14), (20384, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_3 = rand_strided((128, 104, 14, 14), (20384, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_4 = rand_strided((1, 104, 1, 1), (104, 1, 1, 1), device='cuda:0', dtype=torch.float32)
arg_5 = rand_strided((104,), (1,), device='cuda:0', dtype=torch.float32)
arg_6 = rand_strided((104,), (1,), device='cuda:0', dtype=torch.float32)
arg_7 = rand_strided((104,), (1,), device='cuda:0', dtype=torch.float32)
arg_8 = rand_strided((104,), (1,), device='cuda:0', dtype=torch.float32)
arg_9 = rand_strided((128, 104, 14, 14), (20384, 196, 14, 1), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7, arg_8, arg_9,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_131.run(*args, 2609152, grid=grid(2609152), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_131.benchmark_all_configs(*args, 2609152, grid=grid(2609152))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/e6/ce6moc5iqgt57e7eqjuf5zhxhh5tsl3aixs4uw2xn3bhmzxi4yko.py
# Original ATen: aten.convolution_backward
# aten.convolution_backward => convolution_backward_91
triton_poi_fused_convolution_backward_132 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[2097152], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_convolution_backward_132(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 1304576
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex % 10192
x1 = (xindex // 10192)
x2 = xindex
tmp0 = tl.load(in_ptr0 + (10192 + x0 + (20384*x1)), None)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp1, None)
def get_args():
arg_0 = rand_strided((128, 104, 14, 14), (20384, 196, 14, 1), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((128, 52, 14, 14), (10192, 196, 14, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_convolution_backward_132.run(*args, 1304576, grid=grid(1304576), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_convolution_backward_132.benchmark_all_configs(*args, 1304576, grid=grid(1304576))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/dk/cdkheqxbj4lk4wbn4mfiw2m45ufmxv76zkupgnoqujmqz7cf3flk.py
# Original ATen: aten.convolution_backward
# aten.convolution_backward => convolution_backward_92
triton_poi_fused_convolution_backward_133 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[2097152], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_convolution_backward_133(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 1304576
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex % 10192
x1 = (xindex // 10192)
x2 = xindex
tmp0 = tl.load(in_ptr0 + (x0 + (20384*x1)), None)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp1, None)
def get_args():
arg_0 = rand_strided((128, 104, 14, 14), (20384, 196, 14, 1), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((128, 52, 14, 14), (10192, 196, 14, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_convolution_backward_133.run(*args, 1304576, grid=grid(1304576), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_convolution_backward_133.benchmark_all_configs(*args, 1304576, grid=grid(1304576))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/w3/cw3zbb3x7kiu4ifygfhy3eaz6tyephtwgkxcx7rkmsrsvtrbrcix.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.add, aten.native_batch_norm_backward
# aten._native_batch_norm_legit_functional => convert_element_type_141
# aten.add => add_346, add_351, add_356
# aten.native_batch_norm_backward => convert_element_type_662, mul_919, sub_238, sum_103, sum_104
triton_red_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_134 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@reduction(
size_hints=[512, 8192],
reduction_hint=ReductionHint.INNER,
filename=__file__,
meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp16', 4: '*fp16', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: 'i32', 9: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_134(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, out_ptr0, out_ptr1, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
xnumel = 416
rnumel = 6272
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
xmask = xindex < xnumel
rbase = tl.arange(0, RBLOCK)[None, :]
x0 = xindex % 104
x1 = (xindex // 104)
_tmp8 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
x3 = xindex
tmp11 = tl.load(in_ptr5 + (x0), xmask)
_tmp14 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
for roffset in range(0, rnumel, RBLOCK):
rindex = roffset + rbase
rmask = rindex < rnumel
r2 = rindex
tmp0 = tl.load(in_ptr0 + ((196*x0) + (20384*(r2 // 196)) + (652288*x1) + (r2 % 196)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp1 = tl.load(in_ptr1 + ((196*x0) + (20384*(r2 // 196)) + (652288*x1) + (r2 % 196)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp3 = tl.load(in_ptr2 + ((196*x0) + (20384*(r2 // 196)) + (652288*x1) + (r2 % 196)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp5 = tl.load(in_ptr3 + ((196*x0) + (20384*(r2 // 196)) + (652288*x1) + (r2 % 196)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp9 = tl.load(in_ptr4 + ((196*x0) + (20384*(r2 // 196)) + (652288*x1) + (r2 % 196)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp2 = tmp0 + tmp1
tmp4 = tmp2 + tmp3
tmp6 = tmp4 + tmp5
tmp7 = tmp6.to(tl.float32)
_tmp8 = tl.where(rmask & xmask, _tmp8 + tmp7, _tmp8)
tmp10 = tmp9.to(tl.float32)
tmp12 = tmp10 - tmp11
tmp13 = tmp7 * tmp12
_tmp14 = tl.where(rmask & xmask, _tmp14 + tmp13, _tmp14)
tmp8 = tl.sum(_tmp8, 1)[:, None]
tl.store(out_ptr0 + x3, tmp8, xmask)
tmp14 = tl.sum(_tmp14, 1)[:, None]
tl.store(out_ptr1 + x3, tmp14, xmask)
def get_args():
arg_0 = rand_strided((128, 104, 14, 14), (20384, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 104, 14, 14), (20384, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_2 = rand_strided((128, 104, 14, 14), (20384, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_3 = rand_strided((128, 104, 14, 14), (20384, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_4 = rand_strided((128, 104, 14, 14), (20384, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_5 = rand_strided((1, 104, 1, 1), (104, 1, 1, 1), device='cuda:0', dtype=torch.float32)
arg_6 = rand_strided((104, 4), (1, 104), device='cuda:0', dtype=torch.float32)
arg_7 = rand_strided((104, 4), (1, 104), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_red_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_134.run(*args, 416, 6272, grid=grid(416), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_red_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_134.benchmark_all_configs(*args, 416, 6272, grid=grid(416))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/ej/cejoyk2b6v7darhrxgzp23ovzotirkownmjt7wscn7uwlctg5oxk.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.add, aten.convolution_backward, aten.native_batch_norm_backward
# aten._native_batch_norm_legit_functional => convert_element_type_141
# aten.add => add_346, add_351, add_356
# aten.convolution_backward => convolution_backward_101
# aten.native_batch_norm_backward => convert_element_type_662, convert_element_type_664, mul_925, mul_926, sub_238, sub_240, sub_241
triton_poi_fused__native_batch_norm_legit_functional_add_convolution_backward_native_batch_norm_backward_135 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp16', 4: '*fp16', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: '*fp32', 9: '*fp32', 10: '*fp16', 11: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11), equal_to_1=())]})
@triton.jit
def triton_poi_fused__native_batch_norm_legit_functional_add_convolution_backward_native_batch_norm_backward_135(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, in_ptr6, in_ptr7, in_ptr8, in_ptr9, out_ptr1, xnumel, XBLOCK : tl.constexpr):
xnumel = 2609152
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x3 = xindex
x1 = (xindex // 196) % 104
tmp0 = tl.load(in_ptr0 + (x3), None).to(tl.float32)
tmp1 = tl.load(in_ptr1 + (x3), None).to(tl.float32)
tmp3 = tl.load(in_ptr2 + (x3), None).to(tl.float32)
tmp5 = tl.load(in_ptr3 + (x3), None).to(tl.float32)
tmp8 = tl.load(in_ptr4 + (x3), None).to(tl.float32)
tmp10 = tl.load(in_ptr5 + (x1), None)
tmp12 = tl.load(in_ptr6 + (x1), None)
tmp15 = tl.load(in_ptr7 + (x1), None)
tmp20 = tl.load(in_ptr8 + (x1), None)
tmp23 = tl.load(in_ptr9 + (x1), None)
tmp2 = tmp0 + tmp1
tmp4 = tmp2 + tmp3
tmp6 = tmp4 + tmp5
tmp7 = tmp6.to(tl.float32)
tmp9 = tmp8.to(tl.float32)
tmp11 = tmp9 - tmp10
tmp13 = 3.985969387755102e-05
tmp14 = tmp12 * tmp13
tmp16 = tmp15 * tmp15
tmp17 = tmp14 * tmp16
tmp18 = tmp11 * tmp17
tmp19 = tmp7 - tmp18
tmp21 = tmp20 * tmp13
tmp22 = tmp19 - tmp21
tmp24 = tmp15 * tmp23
tmp25 = tmp22 * tmp24
tmp26 = tmp25.to(tl.float32)
tl.store(out_ptr1 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp26, None)
def get_args():
arg_0 = rand_strided((128, 104, 14, 14), (20384, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 104, 14, 14), (20384, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_2 = rand_strided((128, 104, 14, 14), (20384, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_3 = rand_strided((128, 104, 14, 14), (20384, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_4 = rand_strided((128, 104, 14, 14), (20384, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_5 = rand_strided((1, 104, 1, 1), (104, 1, 1, 1), device='cuda:0', dtype=torch.float32)
arg_6 = rand_strided((104,), (1,), device='cuda:0', dtype=torch.float32)
arg_7 = rand_strided((104,), (1,), device='cuda:0', dtype=torch.float32)
arg_8 = rand_strided((104,), (1,), device='cuda:0', dtype=torch.float32)
arg_9 = rand_strided((104,), (1,), device='cuda:0', dtype=torch.float32)
arg_10 = rand_strided((128, 104, 14, 14), (20384, 196, 14, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7, arg_8, arg_9, arg_10,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__native_batch_norm_legit_functional_add_convolution_backward_native_batch_norm_backward_135.run(*args, 2609152, grid=grid(2609152), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__native_batch_norm_legit_functional_add_convolution_backward_native_batch_norm_backward_135.benchmark_all_configs(*args, 2609152, grid=grid(2609152))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/5h/c5hpdahtkfn4l3dhqtwm7jlr542h6muhvdgkqrxb44tec4smg4fm.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_665
triton_poi_fused__to_copy_136 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[65536], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_136(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 34944
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((104, 336, 1, 1), (336, 1, 1, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((104, 336, 1, 1), (336, 1, 1, 1), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_136.run(*args, 34944, grid=grid(34944), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_136.benchmark_all_configs(*args, 34944, grid=grid(34944))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/bc/cbch6pezjexqee4mvwdnar33u6mvs47umwhwas6d2blv7733qelg.py
# Original ATen: aten.mul, aten.sigmoid, aten.sigmoid_backward, aten.silu, aten.sum
# aten.mul => mul_928
# aten.sigmoid => sigmoid_19
# aten.sigmoid_backward => convert_element_type_666, convert_element_type_667, convert_element_type_668, mul_930, mul_931, sub_242
# aten.silu => convert_element_type_132, convert_element_type_133, mul_178, sigmoid_17
# aten.sum => sum_105
triton_per_fused_mul_sigmoid_sigmoid_backward_silu_sum_137 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import persistent_reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@persistent_reduction(
size_hints=[65536, 256],
reduction_hint=ReductionHint.INNER,
filename=__file__,
meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp16', 4: 'i32', 5: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4), equal_to_1=())]}
)
@triton.jit
def triton_per_fused_mul_sigmoid_sigmoid_backward_silu_sum_137(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, xnumel, rnumel, XBLOCK : tl.constexpr):
xnumel = 43008
rnumel = 196
RBLOCK: tl.constexpr = 256
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
xmask = xindex < xnumel
rindex = tl.arange(0, RBLOCK)[None, :]
rmask = rindex < rnumel
r1 = rindex
x0 = xindex
tmp0 = tl.load(in_ptr0 + (r1 + (196*x0)), rmask, other=0).to(tl.float32)
tmp1 = tl.load(in_ptr1 + (r1 + (196*x0)), rmask, other=0).to(tl.float32)
tmp11 = tl.load(in_ptr2 + (x0), None).to(tl.float32)
tmp2 = tmp1.to(tl.float32)
tmp3 = tl.sigmoid(tmp2)
tmp4 = tmp2 * tmp3
tmp5 = tmp4.to(tl.float32)
tmp6 = tmp0 * tmp5
tmp8 = tl.where(rmask, tmp6, 0)
tmp9 = tl.sum(tmp8, 1)[:, None]
tmp10 = tmp9.to(tl.float32)
tmp12 = tl.sigmoid(tmp11)
tmp13 = tmp12.to(tl.float32)
tmp14 = 1.0
tmp15 = tmp14 - tmp13
tmp16 = tmp13 * tmp15
tmp17 = tmp10 * tmp16
tmp18 = tmp17.to(tl.float32)
tl.store(in_out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp18, None)
def get_args():
arg_0 = rand_strided((128, 336, 1, 1), (336, 1, 1, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 336, 14, 14), (65856, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_2 = rand_strided((128, 336, 14, 14), (65856, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_3 = rand_strided((128, 336, 1, 1), (336, 1, 1, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1, arg_2, arg_3,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_per_fused_mul_sigmoid_sigmoid_backward_silu_sum_137.run(*args, 43008, 196, grid=grid(43008), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_per_fused_mul_sigmoid_sigmoid_backward_silu_sum_137.benchmark_all_configs(*args, 43008, 196, grid=grid(43008))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/lw/clwggjp2glb6lt7ovt2lc3pzpsocchmesscnc7d3a6zsnanmt43m.py
# Original ATen: aten._to_copy, aten.convolution_backward
# aten._to_copy => convert_element_type_670
# aten.convolution_backward => sum_106
triton_per_fused__to_copy_convolution_backward_138 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import persistent_reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@persistent_reduction(
size_hints=[512, 128],
reduction_hint=ReductionHint.INNER,
filename=__file__,
meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32', 3: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3), equal_to_1=())]}
)
@triton.jit
def triton_per_fused__to_copy_convolution_backward_138(in_ptr0, out_ptr1, xnumel, rnumel, XBLOCK : tl.constexpr):
xnumel = 336
rnumel = 128
RBLOCK: tl.constexpr = 128
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
xmask = xindex < xnumel
rindex = tl.arange(0, RBLOCK)[None, :]
rmask = rindex < rnumel
r1 = rindex
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0 + (336*r1)), rmask & xmask, other=0).to(tl.float32)
tmp2 = tl.where(rmask & xmask, tmp0, 0)
tmp3 = tl.sum(tmp2, 1)[:, None]
tmp4 = tmp3.to(tl.float32)
tl.store(out_ptr1 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp4, xmask)
def get_args():
arg_0 = rand_strided((128, 336, 1, 1), (336, 1, 1, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_per_fused__to_copy_convolution_backward_138.run(*args, 336, 128, grid=grid(336), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_per_fused__to_copy_convolution_backward_138.benchmark_all_configs(*args, 336, 128, grid=grid(336))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/ny/cnyioznughbctzhfqmivqbazouomcdvadukxtk2rxq6q22lk25nc.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_669
triton_poi_fused__to_copy_139 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[8192], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_139(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 4704
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((336, 14, 1, 1), (14, 1, 1, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((336, 14, 1, 1), (14, 1, 1, 1), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_139.run(*args, 4704, grid=grid(4704), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_139.benchmark_all_configs(*args, 4704, grid=grid(4704))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/xy/cxy6onpt27zwbftqiiz3rmsdtwpb6ifqj55hnehr2kljkwmm2una.py
# Original ATen: aten.add, aten.clone, aten.fill, aten.mul, aten.sigmoid, aten.sub
# aten.add => add_357
# aten.clone => clone_14
# aten.fill => full_like_33
# aten.mul => mul_932, mul_933, mul_934
# aten.sigmoid => sigmoid_97
# aten.sub => sub_243
triton_poi_fused_add_clone_fill_mul_sigmoid_sub_140 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[2048], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_add_clone_fill_mul_sigmoid_sub_140(in_out_ptr0, in_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 1792
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_out_ptr0 + (x0), xmask).to(tl.float32)
tmp1 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
tmp2 = tl.sigmoid(tmp1)
tmp3 = 1.0
tmp4 = tmp3 - tmp2
tmp5 = tmp1 * tmp4
tmp6 = tmp5 + tmp3
tmp7 = tmp2 * tmp6
tmp8 = tmp0 * tmp7
tl.store(in_out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp8, xmask)
def get_args():
arg_0 = rand_strided((128, 14, 1, 1), (14, 1, 1, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 14, 1, 1), (14, 1, 1, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_add_clone_fill_mul_sigmoid_sub_140.run(*args, 1792, grid=grid(1792), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_add_clone_fill_mul_sigmoid_sub_140.benchmark_all_configs(*args, 1792, grid=grid(1792))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/af/cafrvpfoaadhdlxhldphg5leeyz2enwfb64ifixaudfchrcwkgns.py
# Original ATen: aten._to_copy, aten.convolution_backward
# aten._to_copy => convert_element_type_672
# aten.convolution_backward => sum_107
triton_per_fused__to_copy_convolution_backward_141 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import persistent_reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@persistent_reduction(
size_hints=[16, 128],
reduction_hint=ReductionHint.INNER,
filename=__file__,
meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32', 3: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 3), equal_to_1=())]}
)
@triton.jit
def triton_per_fused__to_copy_convolution_backward_141(in_ptr0, out_ptr1, xnumel, rnumel, XBLOCK : tl.constexpr):
xnumel = 14
rnumel = 128
RBLOCK: tl.constexpr = 128
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
xmask = xindex < xnumel
rindex = tl.arange(0, RBLOCK)[None, :]
rmask = rindex < rnumel
r1 = rindex
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0 + (14*r1)), rmask & xmask, other=0).to(tl.float32)
tmp2 = tl.where(rmask & xmask, tmp0, 0)
tmp3 = tl.sum(tmp2, 1)[:, None]
tmp4 = tmp3.to(tl.float32)
tl.store(out_ptr1 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp4, xmask)
def get_args():
arg_0 = rand_strided((128, 14, 1, 1), (14, 1, 1, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((14,), (1,), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_per_fused__to_copy_convolution_backward_141.run(*args, 14, 128, grid=grid(14), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_per_fused__to_copy_convolution_backward_141.benchmark_all_configs(*args, 14, 128, grid=grid(14))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/ub/cubt2wopxldea56edxrzorl54nnvgwjendwqgv23wj6it7xjbynx.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_671
triton_poi_fused__to_copy_142 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[8192], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_142(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 4704
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((14, 336, 1, 1), (336, 1, 1, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((14, 336, 1, 1), (336, 1, 1, 1), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_142.run(*args, 4704, grid=grid(4704), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_142.benchmark_all_configs(*args, 4704, grid=grid(4704))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/pw/cpwcwf6uhmujsg7jk3lvfx67kggnjin7oj2locjyrnecqe2jebap.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.add, aten.clone, aten.div, aten.fill, aten.mul, aten.native_batch_norm_backward, aten.sigmoid, aten.sub
# aten._native_batch_norm_legit_functional => convert_element_type_130
# aten.add => add_358, add_359
# aten.clone => clone_13
# aten.div => div_12
# aten.fill => full_like_34
# aten.mul => mul_929, mul_935, mul_936, mul_937
# aten.native_batch_norm_backward => convert_element_type_673, mul_938, mul_946, sub_245, sum_108, sum_109
# aten.sigmoid => sigmoid_19, sigmoid_98
# aten.sub => sub_244
triton_red_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_143 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@reduction(
size_hints=[512, 32768],
reduction_hint=ReductionHint.INNER,
filename=__file__,
meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp16', 4: '*fp16', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: '*fp32', 9: '*fp32', 10: 'i32', 11: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_143(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, in_ptr6, out_ptr0, out_ptr1, out_ptr2, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
xnumel = 336
rnumel = 25088
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
xmask = xindex < xnumel
rbase = tl.arange(0, RBLOCK)[None, :]
x0 = xindex
_tmp17 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
tmp20 = tl.load(in_ptr5 + (x0), xmask)
_tmp23 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
for roffset in range(0, rnumel, RBLOCK):
rindex = roffset + rbase
rmask = rindex < rnumel
r1 = rindex % 196
r2 = (rindex // 196)
tmp0 = tl.load(in_ptr0 + (r1 + (196*x0) + (65856*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp1 = tl.load(in_ptr1 + (x0 + (336*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp4 = tl.load(in_ptr2 + (x0 + (336*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp8 = tl.load(in_ptr3 + (r1 + (196*x0) + (65856*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp18 = tl.load(in_ptr4 + (r1 + (196*x0) + (65856*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp2 = tl.sigmoid(tmp1)
tmp3 = tmp0 * tmp2
tmp5 = 196.0
tmp6 = tmp4 / tmp5
tmp7 = tmp3 + tmp6
tmp9 = tl.sigmoid(tmp8)
tmp10 = 1.0
tmp11 = tmp10 - tmp9
tmp12 = tmp8 * tmp11
tmp13 = tmp12 + tmp10
tmp14 = tmp9 * tmp13
tmp15 = tmp7 * tmp14
tmp16 = tmp15.to(tl.float32)
_tmp17 = tl.where(rmask & xmask, _tmp17 + tmp16, _tmp17)
tmp19 = tmp18.to(tl.float32)
tmp21 = tmp19 - tmp20
tmp22 = tmp16 * tmp21
_tmp23 = tl.where(rmask & xmask, _tmp23 + tmp22, _tmp23)
tmp17 = tl.sum(_tmp17, 1)[:, None]
tl.store(out_ptr0 + x0, tmp17, xmask)
tmp23 = tl.sum(_tmp23, 1)[:, None]
tl.store(out_ptr1 + x0, tmp23, xmask)
tmp24 = tl.load(in_ptr6 + (x0), xmask)
tmp25 = tmp23 * tmp24
tl.store(out_ptr2 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp25, xmask)
def get_args():
arg_0 = rand_strided((128, 336, 14, 14), (65856, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 336, 1, 1), (336, 1, 1, 1), device='cuda:0', dtype=torch.float16)
arg_2 = rand_strided((128, 336, 1, 1), (336, 1, 1, 1), device='cuda:0', dtype=torch.float16)
arg_3 = rand_strided((128, 336, 14, 14), (65856, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_4 = rand_strided((128, 336, 14, 14), (65856, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_5 = rand_strided((1, 336, 1, 1), (336, 1, 1, 1), device='cuda:0', dtype=torch.float32)
arg_6 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32)
arg_7 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32)
arg_8 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32)
arg_9 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7, arg_8, arg_9,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_red_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_143.run(*args, 336, 25088, grid=grid(336), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_red_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_143.benchmark_all_configs(*args, 336, 25088, grid=grid(336))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/7p/c7psulr34da3bohr63ebglp7ja7h35fmmewyuanlwy56v67tyr3o.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.add, aten.clone, aten.div, aten.fill, aten.mul, aten.native_batch_norm_backward, aten.sigmoid, aten.sub
# aten._native_batch_norm_legit_functional => convert_element_type_130
# aten.add => add_358, add_359
# aten.clone => clone_13
# aten.div => div_12
# aten.fill => full_like_34
# aten.mul => mul_929, mul_935, mul_936, mul_937
# aten.native_batch_norm_backward => convert_element_type_673, mul_944, sub_245, sub_247, sub_248
# aten.sigmoid => sigmoid_19, sigmoid_98
# aten.sub => sub_244
triton_poi_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_144 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[16777216], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp16', 4: '*fp16', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: '*fp32', 9: '*fp32', 10: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10), equal_to_1=())]})
@triton.jit
def triton_poi_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_144(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, in_ptr6, in_ptr7, in_ptr8, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 8429568
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x3 = xindex
x4 = (xindex // 196)
x1 = (xindex // 196) % 336
tmp0 = tl.load(in_ptr0 + (x3), None).to(tl.float32)
tmp1 = tl.load(in_ptr1 + (x4), None).to(tl.float32)
tmp4 = tl.load(in_ptr2 + (x4), None).to(tl.float32)
tmp8 = tl.load(in_ptr3 + (x3), None).to(tl.float32)
tmp17 = tl.load(in_ptr4 + (x3), None).to(tl.float32)
tmp19 = tl.load(in_ptr5 + (x1), None)
tmp21 = tl.load(in_ptr6 + (x1), None)
tmp24 = tl.load(in_ptr7 + (x1), None)
tmp29 = tl.load(in_ptr8 + (x1), None)
tmp2 = tl.sigmoid(tmp1)
tmp3 = tmp0 * tmp2
tmp5 = 196.0
tmp6 = tmp4 / tmp5
tmp7 = tmp3 + tmp6
tmp9 = tl.sigmoid(tmp8)
tmp10 = 1.0
tmp11 = tmp10 - tmp9
tmp12 = tmp8 * tmp11
tmp13 = tmp12 + tmp10
tmp14 = tmp9 * tmp13
tmp15 = tmp7 * tmp14
tmp16 = tmp15.to(tl.float32)
tmp18 = tmp17.to(tl.float32)
tmp20 = tmp18 - tmp19
tmp22 = 3.985969387755102e-05
tmp23 = tmp21 * tmp22
tmp25 = tmp24 * tmp24
tmp26 = tmp23 * tmp25
tmp27 = tmp20 * tmp26
tmp28 = tmp16 - tmp27
tmp30 = tmp29 * tmp22
tmp31 = tmp28 - tmp30
tl.store(out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp31, None)
def get_args():
arg_0 = rand_strided((128, 336, 14, 14), (65856, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 336, 1, 1), (336, 1, 1, 1), device='cuda:0', dtype=torch.float16)
arg_2 = rand_strided((128, 336, 1, 1), (336, 1, 1, 1), device='cuda:0', dtype=torch.float16)
arg_3 = rand_strided((128, 336, 14, 14), (65856, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_4 = rand_strided((128, 336, 14, 14), (65856, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_5 = rand_strided((1, 336, 1, 1), (336, 1, 1, 1), device='cuda:0', dtype=torch.float32)
arg_6 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32)
arg_7 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32)
arg_8 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32)
arg_9 = rand_strided((128, 336, 14, 14), (65856, 196, 14, 1), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7, arg_8, arg_9,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_144.run(*args, 8429568, grid=grid(8429568), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_144.benchmark_all_configs(*args, 8429568, grid=grid(8429568))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/id/cidvqsl7tausuxdym44zowgtyle4qgcqwkmvhutx6ld6q33q6idp.py
# Original ATen: aten.convolution_backward
# aten.convolution_backward => convolution_backward_104
triton_poi_fused_convolution_backward_145 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp32', 2: '*fp32', 3: '*fp16', 4: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4), equal_to_1=())]})
@triton.jit
def triton_poi_fused_convolution_backward_145(in_ptr0, in_ptr1, in_ptr2, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 2809856
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x2 = (xindex // 21952)
x3 = xindex % 21952
x1 = (xindex // 196) % 112
x4 = xindex
tmp0 = tl.load(in_ptr0 + (43904 + x3 + (65856*x2)), None)
tmp1 = tl.load(in_ptr1 + (224 + x1), None)
tmp2 = tl.load(in_ptr2 + (224 + x1), None)
tmp3 = tmp1 * tmp2
tmp4 = tmp0 * tmp3
tmp5 = tmp4.to(tl.float32)
tl.store(out_ptr0 + (x4 + tl.zeros([XBLOCK], tl.int32)), tmp5, None)
def get_args():
arg_0 = rand_strided((128, 336, 14, 14), (65856, 196, 14, 1), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32)
arg_2 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32)
arg_3 = rand_strided((128, 112, 14, 14), (21952, 196, 14, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1, arg_2, arg_3,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_convolution_backward_145.run(*args, 2809856, grid=grid(2809856), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_convolution_backward_145.benchmark_all_configs(*args, 2809856, grid=grid(2809856))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/fd/cfdopkndut6mz6wq6bvbwuuulhnrmhdrv5rzyqs6re2kufkzwbyi.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_676
triton_poi_fused__to_copy_146 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[8192], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_146(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 5488
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((112, 1, 7, 7), (49, 49, 7, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((112, 1, 7, 7), (49, 49, 7, 1), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_146.run(*args, 5488, grid=grid(5488), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_146.benchmark_all_configs(*args, 5488, grid=grid(5488))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/th/cthuea2iprfudqgjdcjb4l6mtcfp43ogq3yml625vwzkim7u4s2l.py
# Original ATen: aten.convolution_backward
# aten.convolution_backward => convolution_backward_105
triton_poi_fused_convolution_backward_147 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp32', 2: '*fp32', 3: '*fp16', 4: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4), equal_to_1=())]})
@triton.jit
def triton_poi_fused_convolution_backward_147(in_ptr0, in_ptr1, in_ptr2, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 2809856
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x2 = (xindex // 21952)
x3 = xindex % 21952
x1 = (xindex // 196) % 112
x4 = xindex
tmp0 = tl.load(in_ptr0 + (21952 + x3 + (65856*x2)), None)
tmp1 = tl.load(in_ptr1 + (112 + x1), None)
tmp2 = tl.load(in_ptr2 + (112 + x1), None)
tmp3 = tmp1 * tmp2
tmp4 = tmp0 * tmp3
tmp5 = tmp4.to(tl.float32)
tl.store(out_ptr0 + (x4 + tl.zeros([XBLOCK], tl.int32)), tmp5, None)
def get_args():
arg_0 = rand_strided((128, 336, 14, 14), (65856, 196, 14, 1), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32)
arg_2 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32)
arg_3 = rand_strided((128, 112, 14, 14), (21952, 196, 14, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1, arg_2, arg_3,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_convolution_backward_147.run(*args, 2809856, grid=grid(2809856), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_convolution_backward_147.benchmark_all_configs(*args, 2809856, grid=grid(2809856))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/3x/c3xs7plvafsbnfj7oqnbllamz4ftcowca37hxufdgln6b4kvegly.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_677
triton_poi_fused__to_copy_148 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4096], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_148(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 2800
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((112, 1, 5, 5), (25, 25, 5, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((112, 1, 5, 5), (25, 25, 5, 1), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_148.run(*args, 2800, grid=grid(2800), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_148.benchmark_all_configs(*args, 2800, grid=grid(2800))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/zr/czrwimvnigaum3atz3dnqh3vpqefjj27el4vvkm52xud7pku2qdu.py
# Original ATen: aten.convolution_backward
# aten.convolution_backward => convolution_backward_106
triton_poi_fused_convolution_backward_149 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp32', 2: '*fp32', 3: '*fp16', 4: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4), equal_to_1=())]})
@triton.jit
def triton_poi_fused_convolution_backward_149(in_ptr0, in_ptr1, in_ptr2, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 2809856
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x2 = (xindex // 21952)
x3 = xindex % 21952
x1 = (xindex // 196) % 112
x4 = xindex
tmp0 = tl.load(in_ptr0 + (x3 + (65856*x2)), None)
tmp1 = tl.load(in_ptr1 + (x1), None)
tmp2 = tl.load(in_ptr2 + (x1), None)
tmp3 = tmp1 * tmp2
tmp4 = tmp0 * tmp3
tmp5 = tmp4.to(tl.float32)
tl.store(out_ptr0 + (x4 + tl.zeros([XBLOCK], tl.int32)), tmp5, None)
def get_args():
arg_0 = rand_strided((128, 336, 14, 14), (65856, 196, 14, 1), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32)
arg_2 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32)
arg_3 = rand_strided((128, 112, 14, 14), (21952, 196, 14, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1, arg_2, arg_3,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_convolution_backward_149.run(*args, 2809856, grid=grid(2809856), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_convolution_backward_149.benchmark_all_configs(*args, 2809856, grid=grid(2809856))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/2v/c2volwa6uwcetttnruu5jjo66mjrobvup6mcg6k7n4aknrx3iggt.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_678
triton_poi_fused__to_copy_150 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[1024], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_150(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 1008
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((112, 1, 3, 3), (9, 9, 3, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((112, 1, 3, 3), (9, 9, 3, 1), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_150.run(*args, 1008, grid=grid(1008), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_150.benchmark_all_configs(*args, 1008, grid=grid(1008))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/we/cwefssxf5bvvy2y5bt7gv2qzizpcib7seqbqn2voklitwa4hshmd.py
# Original ATen: aten.cat
# aten.cat => cat_66
triton_poi_fused_cat_151 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[16777216], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_cat_151(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 11239424
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x2 = xindex
x0 = xindex % 87808
x1 = (xindex // 87808)
tmp0 = tl.load(in_ptr0 + (x2), None).to(tl.float32)
tl.store(out_ptr0 + (x0 + (263424*x1) + tl.zeros([XBLOCK], tl.int32)), tmp0, None)
def get_args():
arg_0 = rand_strided((128, 112, 28, 28), (87808, 784, 28, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 112, 28, 28), (263424, 784, 28, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_cat_151.run(*args, 11239424, grid=grid(11239424), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_cat_151.benchmark_all_configs(*args, 11239424, grid=grid(11239424))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/qf/cqfo4ck6nf3trupktl4ixfnhezf4tcvnlhrfn63fekgvbdyrr6da.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.mul, aten.native_batch_norm_backward
# aten._native_batch_norm_legit_functional => convert_element_type_123
# aten.mul => mul_949
# aten.native_batch_norm_backward => convert_element_type_679, mul_950, mul_958, sub_250, sum_110, sum_111
triton_red_fused__native_batch_norm_legit_functional_mul_native_batch_norm_backward_152 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@reduction(
size_hints=[512, 131072],
reduction_hint=ReductionHint.INNER,
filename=__file__,
meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp32', 4: '*fp32', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: 'i32', 9: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_mul_native_batch_norm_backward_152(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, out_ptr0, out_ptr1, out_ptr2, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
xnumel = 336
rnumel = 100352
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
xmask = xindex < xnumel
rbase = tl.arange(0, RBLOCK)[None, :]
x0 = xindex
_tmp4 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
tmp7 = tl.load(in_ptr3 + (x0), xmask)
_tmp10 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
for roffset in range(0, rnumel, RBLOCK):
rindex = roffset + rbase
rmask = rindex < rnumel
r1 = rindex % 784
r2 = (rindex // 784)
tmp0 = tl.load(in_ptr0 + (r1 + (784*x0) + (263424*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp1 = tl.load(in_ptr1 + (r1 + (784*x0) + (263424*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp5 = tl.load(in_ptr2 + (r1 + (784*x0) + (263424*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp2 = tmp0 * tmp1
tmp3 = tmp2.to(tl.float32)
_tmp4 = tl.where(rmask & xmask, _tmp4 + tmp3, _tmp4)
tmp6 = tmp5.to(tl.float32)
tmp8 = tmp6 - tmp7
tmp9 = tmp3 * tmp8
_tmp10 = tl.where(rmask & xmask, _tmp10 + tmp9, _tmp10)
tmp4 = tl.sum(_tmp4, 1)[:, None]
tl.store(out_ptr0 + x0, tmp4, xmask)
tmp10 = tl.sum(_tmp10, 1)[:, None]
tl.store(out_ptr1 + x0, tmp10, xmask)
tmp11 = tl.load(in_ptr4 + (x0), xmask)
tmp12 = tmp10 * tmp11
tl.store(out_ptr2 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp12, xmask)
def get_args():
arg_0 = rand_strided((128, 336, 28, 28), (263424, 784, 28, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 336, 28, 28), (263424, 784, 28, 1), device='cuda:0', dtype=torch.float16)
arg_2 = rand_strided((128, 336, 28, 28), (263424, 784, 28, 1), device='cuda:0', dtype=torch.float16)
arg_3 = rand_strided((1, 336, 1, 1), (336, 1, 1, 1), device='cuda:0', dtype=torch.float32)
arg_4 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32)
arg_5 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32)
arg_6 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32)
arg_7 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_red_fused__native_batch_norm_legit_functional_mul_native_batch_norm_backward_152.run(*args, 336, 100352, grid=grid(336), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_red_fused__native_batch_norm_legit_functional_mul_native_batch_norm_backward_152.benchmark_all_configs(*args, 336, 100352, grid=grid(336))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/qc/cqcwwcxya4t45mgfdz46w2lkm3l3dt6ws7mawrdyrpvgbmqxgblv.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.convolution_backward, aten.mul, aten.native_batch_norm_backward
# aten._native_batch_norm_legit_functional => convert_element_type_123
# aten.convolution_backward => convolution_backward_107
# aten.mul => mul_949
# aten.native_batch_norm_backward => convert_element_type_679, convert_element_type_681, mul_956, mul_957, sub_250, sub_252, sub_253
triton_poi_fused__native_batch_norm_legit_functional_convolution_backward_mul_native_batch_norm_backward_153 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[67108864], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp32', 4: '*fp32', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8), equal_to_1=())]})
@triton.jit
def triton_poi_fused__native_batch_norm_legit_functional_convolution_backward_mul_native_batch_norm_backward_153(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, in_ptr6, xnumel, XBLOCK : tl.constexpr):
xnumel = 33718272
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x3 = xindex
x1 = (xindex // 784) % 336
tmp0 = tl.load(in_out_ptr0 + (x3), None).to(tl.float32)
tmp1 = tl.load(in_ptr0 + (x3), None).to(tl.float32)
tmp4 = tl.load(in_ptr1 + (x3), None).to(tl.float32)
tmp6 = tl.load(in_ptr2 + (x1), None)
tmp8 = tl.load(in_ptr3 + (x1), None)
tmp11 = tl.load(in_ptr4 + (x1), None)
tmp16 = tl.load(in_ptr5 + (x1), None)
tmp19 = tl.load(in_ptr6 + (x1), None)
tmp2 = tmp0 * tmp1
tmp3 = tmp2.to(tl.float32)
tmp5 = tmp4.to(tl.float32)
tmp7 = tmp5 - tmp6
tmp9 = 9.964923469387754e-06
tmp10 = tmp8 * tmp9
tmp12 = tmp11 * tmp11
tmp13 = tmp10 * tmp12
tmp14 = tmp7 * tmp13
tmp15 = tmp3 - tmp14
tmp17 = tmp16 * tmp9
tmp18 = tmp15 - tmp17
tmp20 = tmp11 * tmp19
tmp21 = tmp18 * tmp20
tmp22 = tmp21.to(tl.float32)
tl.store(in_out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp22, None)
def get_args():
arg_0 = rand_strided((128, 336, 28, 28), (263424, 784, 28, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 336, 28, 28), (263424, 784, 28, 1), device='cuda:0', dtype=torch.float16)
arg_2 = rand_strided((128, 336, 28, 28), (263424, 784, 28, 1), device='cuda:0', dtype=torch.float16)
arg_3 = rand_strided((1, 336, 1, 1), (336, 1, 1, 1), device='cuda:0', dtype=torch.float32)
arg_4 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32)
arg_5 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32)
arg_6 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32)
arg_7 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__native_batch_norm_legit_functional_convolution_backward_mul_native_batch_norm_backward_153.run(*args, 33718272, grid=grid(33718272), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__native_batch_norm_legit_functional_convolution_backward_mul_native_batch_norm_backward_153.benchmark_all_configs(*args, 33718272, grid=grid(33718272))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/xp/cxpyb2d7tgj3ajjcfk5ruyhc5u2uszkeh76gef4njgyf5kz5ukth.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_682
triton_poi_fused__to_copy_154 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[32768], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_154(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 18816
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((336, 56, 1, 1), (56, 1, 1, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((336, 56, 1, 1), (56, 1, 1, 1), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_154.run(*args, 18816, grid=grid(18816), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_154.benchmark_all_configs(*args, 18816, grid=grid(18816))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/ku/cku7ek7gtwfrqcc34nfk6i7wvwqtzc57tln743w3jpx2qsiyexs7.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.native_batch_norm_backward
# aten._native_batch_norm_legit_functional => convert_element_type_120
# aten.native_batch_norm_backward => convert_element_type_683, mul_959, sub_254, sum_112, sum_113
triton_red_fused__native_batch_norm_legit_functional_native_batch_norm_backward_155 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@reduction(
size_hints=[1024, 8192],
reduction_hint=ReductionHint.INNER,
filename=__file__,
meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: 'i32', 6: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_native_batch_norm_backward_155(in_ptr0, in_ptr1, in_ptr2, out_ptr0, out_ptr1, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
xnumel = 728
rnumel = 7720
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
xmask = xindex < xnumel
rbase = tl.arange(0, RBLOCK)[None, :]
x1 = (xindex // 56)
x0 = xindex % 56
_tmp6 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
x3 = xindex
_tmp13 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
for roffset in range(0, rnumel, RBLOCK):
rindex = roffset + rbase
rmask = rindex < rnumel
r2 = rindex
tmp0 = r2 + (7720*x1)
tmp1 = 100352
tmp2 = tmp0 < tmp1
tmp3 = tl.load(in_ptr0 + ((784*x0) + (43904*(((r2 + (7720*x1)) // 784) % 128)) + ((r2 + (7720*x1)) % 784) + tl.zeros([XBLOCK, RBLOCK], tl.int32)), rmask & tmp2 & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp4 = tmp3.to(tl.float32)
tmp5 = tl.where(tmp2, tmp4, 0)
_tmp6 = tl.where(rmask & xmask, _tmp6 + tmp5, _tmp6)
tmp7 = tl.load(in_ptr1 + ((784*x0) + (43904*(((r2 + (7720*x1)) // 784) % 128)) + ((r2 + (7720*x1)) % 784) + tl.zeros([XBLOCK, RBLOCK], tl.int32)), rmask & tmp2 & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp8 = tmp7.to(tl.float32)
tmp9 = tl.load(in_ptr2 + (x0 + tl.zeros([XBLOCK, RBLOCK], tl.int32)), rmask & tmp2 & xmask, eviction_policy='evict_last', other=0)
tmp10 = tmp8 - tmp9
tmp11 = tmp4 * tmp10
tmp12 = tl.where(tmp2, tmp11, 0)
_tmp13 = tl.where(rmask & xmask, _tmp13 + tmp12, _tmp13)
tmp6 = tl.sum(_tmp6, 1)[:, None]
tl.store(out_ptr0 + x3, tmp6, xmask)
tmp13 = tl.sum(_tmp13, 1)[:, None]
tl.store(out_ptr1 + x3, tmp13, xmask)
def get_args():
arg_0 = rand_strided((128, 56, 28, 28), (43904, 784, 28, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 56, 28, 28), (43904, 784, 28, 1), device='cuda:0', dtype=torch.float16)
arg_2 = rand_strided((1, 56, 1, 1), (56, 1, 1, 1), device='cuda:0', dtype=torch.float32)
arg_3 = rand_strided((56, 13), (1, 56), device='cuda:0', dtype=torch.float32)
arg_4 = rand_strided((56, 13), (1, 56), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1, arg_2, arg_3, arg_4,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_red_fused__native_batch_norm_legit_functional_native_batch_norm_backward_155.run(*args, 728, 7720, grid=grid(728), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_red_fused__native_batch_norm_legit_functional_native_batch_norm_backward_155.benchmark_all_configs(*args, 728, 7720, grid=grid(728))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/uy/cuyabiwct4xwna22y54u6ef6hgcysgudgvw53xgcxhoetvhdud3c.py
# Original ATen: aten.native_batch_norm_backward
# aten.native_batch_norm_backward => convert_element_type_683, sum_112
#
# NOTE(review): Inductor-generated second-stage (persistent) reduction for
# batch-norm backward.  The embedded Triton kernel reduces a (56, 13) fp32
# matrix of per-channel partial sums — produced by a prior split reduction —
# along its 13-column axis into a (56,) fp32 vector (see get_args()).
# The triton source string is compiled verbatim by async_compile.triton(),
# so it must stay byte-identical; comments are added only out here.
triton_per_fused_native_batch_norm_backward_156 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import persistent_reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@persistent_reduction(
size_hints=[64, 16],
reduction_hint=ReductionHint.INNER,
filename=__file__,
meta={'signature': {0: '*fp32', 1: '*fp32', 2: 'i32', 3: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1), equal_to_1=())]}
)
@triton.jit
def triton_per_fused_native_batch_norm_backward_156(in_ptr0, out_ptr0, xnumel, rnumel, XBLOCK : tl.constexpr):
xnumel = 56
rnumel = 13
RBLOCK: tl.constexpr = 16
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
xmask = xindex < xnumel
rindex = tl.arange(0, RBLOCK)[None, :]
rmask = rindex < rnumel
r1 = rindex
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0 + (56*r1)), rmask & xmask, other=0)
tmp2 = tl.where(rmask & xmask, tmp0, 0)
tmp3 = tl.sum(tmp2, 1)[:, None]
tl.store(out_ptr0 + x0, tmp3, xmask)
def get_args():
arg_0 = rand_strided((56, 13), (1, 56), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((56,), (1,), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_per_fused_native_batch_norm_backward_156.run(*args, 56, 13, grid=grid(56), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_per_fused_native_batch_norm_backward_156.benchmark_all_configs(*args, 56, 13, grid=grid(56))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/on/convc5p74ddudqk46e52onvmx3rxjfktebgzjuhxneits4eo7nyc.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.native_batch_norm_backward
# aten._native_batch_norm_legit_functional => convert_element_type_120
# aten.native_batch_norm_backward => convert_element_type_683, mul_959, mul_967, sub_254, sum_113
#
# NOTE(review): second-stage reduction like kernel 156, but with an extra
# per-channel scale: it sums the (56, 13) fp32 partial-sum matrix over the
# 13-column axis (out_ptr0), then multiplies the row sum by in_ptr1[x0]
# (presumably inv_std or a weight-derived factor — confirm against caller)
# and stores that product to out_ptr1.  The string is compiled verbatim by
# async_compile.triton() and must remain byte-identical.
triton_per_fused__native_batch_norm_legit_functional_native_batch_norm_backward_157 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import persistent_reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@persistent_reduction(
size_hints=[64, 16],
reduction_hint=ReductionHint.INNER,
filename=__file__,
meta={'signature': {0: '*fp32', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: 'i32', 5: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3), equal_to_1=())]}
)
@triton.jit
def triton_per_fused__native_batch_norm_legit_functional_native_batch_norm_backward_157(in_ptr0, in_ptr1, out_ptr0, out_ptr1, xnumel, rnumel, XBLOCK : tl.constexpr):
xnumel = 56
rnumel = 13
RBLOCK: tl.constexpr = 16
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
xmask = xindex < xnumel
rindex = tl.arange(0, RBLOCK)[None, :]
rmask = rindex < rnumel
r1 = rindex
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0 + (56*r1)), rmask & xmask, other=0)
tmp4 = tl.load(in_ptr1 + (x0), xmask)
tmp2 = tl.where(rmask & xmask, tmp0, 0)
tmp3 = tl.sum(tmp2, 1)[:, None]
tmp5 = tmp3 * tmp4
tl.store(out_ptr1 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp5, xmask)
tl.store(out_ptr0 + x0, tmp3, xmask)
def get_args():
arg_0 = rand_strided((56, 13), (1, 56), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((56,), (1,), device='cuda:0', dtype=torch.float32)
arg_2 = rand_strided((56,), (1,), device='cuda:0', dtype=torch.float32)
arg_3 = rand_strided((56,), (1,), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1, arg_2, arg_3,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_per_fused__native_batch_norm_legit_functional_native_batch_norm_backward_157.run(*args, 56, 13, grid=grid(56), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_per_fused__native_batch_norm_legit_functional_native_batch_norm_backward_157.benchmark_all_configs(*args, 56, 13, grid=grid(56))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/bj/cbjvyothjvn62eovj2qvbnhqdnxtnyarsgyu3rshwon5wrxfm6me.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.native_batch_norm_backward
# aten._native_batch_norm_legit_functional => convert_element_type_120
# aten.native_batch_norm_backward => convert_element_type_683, convert_element_type_685, mul_965, mul_966, sub_254, sub_256, sub_257
#
# NOTE(review): elementwise grad_input stage of batch-norm backward over a
# (128, 56, 28, 28) fp16 activation (5619712 = 128*56*28*28 elements, fp32
# math internally).  The constant 9.964923469387754e-06 is 1/100352, i.e.
# 1/(128*28*28) — the reciprocal of the per-channel reduction count.
# in_ptr2..in_ptr6 are per-channel (56,) statistics indexed by x1.
# String compiled verbatim by async_compile.triton(); kept byte-identical.
triton_poi_fused__native_batch_norm_legit_functional_native_batch_norm_backward_158 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[8388608], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp32', 6: '*fp32', 7: '*fp16', 8: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8), equal_to_1=())]})
@triton.jit
def triton_poi_fused__native_batch_norm_legit_functional_native_batch_norm_backward_158(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, in_ptr6, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 5619712
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x3 = xindex
x1 = (xindex // 784) % 56
tmp0 = tl.load(in_ptr0 + (x3), None).to(tl.float32)
tmp2 = tl.load(in_ptr1 + (x3), None).to(tl.float32)
tmp4 = tl.load(in_ptr2 + (x1), None)
tmp6 = tl.load(in_ptr3 + (x1), None)
tmp9 = tl.load(in_ptr4 + (x1), None)
tmp14 = tl.load(in_ptr5 + (x1), None)
tmp17 = tl.load(in_ptr6 + (x1), None)
tmp1 = tmp0.to(tl.float32)
tmp3 = tmp2.to(tl.float32)
tmp5 = tmp3 - tmp4
tmp7 = 9.964923469387754e-06
tmp8 = tmp6 * tmp7
tmp10 = tmp9 * tmp9
tmp11 = tmp8 * tmp10
tmp12 = tmp5 * tmp11
tmp13 = tmp1 - tmp12
tmp15 = tmp14 * tmp7
tmp16 = tmp13 - tmp15
tmp18 = tmp9 * tmp17
tmp19 = tmp16 * tmp18
tmp20 = tmp19.to(tl.float32)
tl.store(out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp20, None)
def get_args():
arg_0 = rand_strided((128, 56, 28, 28), (43904, 784, 28, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 56, 28, 28), (43904, 784, 28, 1), device='cuda:0', dtype=torch.float16)
arg_2 = rand_strided((1, 56, 1, 1), (56, 1, 1, 1), device='cuda:0', dtype=torch.float32)
arg_3 = rand_strided((56,), (1,), device='cuda:0', dtype=torch.float32)
arg_4 = rand_strided((56,), (1,), device='cuda:0', dtype=torch.float32)
arg_5 = rand_strided((56,), (1,), device='cuda:0', dtype=torch.float32)
arg_6 = rand_strided((56,), (1,), device='cuda:0', dtype=torch.float32)
arg_7 = rand_strided((128, 56, 28, 28), (43904, 784, 28, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__native_batch_norm_legit_functional_native_batch_norm_backward_158.run(*args, 5619712, grid=grid(5619712), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__native_batch_norm_legit_functional_native_batch_norm_backward_158.benchmark_all_configs(*args, 5619712, grid=grid(5619712))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/ip/cipdkjyk2onxl3m3jynzjqksmm63qjqd3qbbbq47kgdhlig5ktev.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_686
#
# NOTE(review): simple dtype-cast kernel: copies a 4704-element
# (28, 168, 1, 1) tensor from fp16 to fp32 elementwise (likely a conv
# weight being upcast for convolution_backward — confirm against caller).
# String compiled verbatim by async_compile.triton(); kept byte-identical.
triton_poi_fused__to_copy_159 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[8192], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_159(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 4704
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((28, 168, 1, 1), (168, 1, 1, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((28, 168, 1, 1), (168, 1, 1, 1), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_159.run(*args, 4704, grid=grid(4704), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_159.benchmark_all_configs(*args, 4704, grid=grid(4704))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/hk/chko7h4vzts7m5h2r3fcixp646xti7wwu6pppgto4mg3xqc3ieno.py
# Original ATen: aten.cat
# aten.cat => cat_67
#
# NOTE(review): one half of a channel concatenation.  Copies a contiguous
# (128, 168, 28, 28) fp16 tensor (131712 elements per batch) into an output
# whose batch stride is 263424 = 2 * 131712, i.e. each batch's 168-channel
# slab lands at offset 263424*batch within a 336-channel destination.  The
# other half of the cat is presumably written by a sibling kernel.
# String compiled verbatim by async_compile.triton(); kept byte-identical.
triton_poi_fused_cat_160 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[33554432], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_cat_160(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 16859136
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x2 = xindex
x0 = xindex % 131712
x1 = (xindex // 131712)
tmp0 = tl.load(in_ptr0 + (x2), None).to(tl.float32)
tl.store(out_ptr0 + (x0 + (263424*x1) + tl.zeros([XBLOCK], tl.int32)), tmp0, None)
def get_args():
arg_0 = rand_strided((128, 168, 28, 28), (131712, 784, 28, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 168, 28, 28), (263424, 784, 28, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_cat_160.run(*args, 16859136, grid=grid(16859136), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_cat_160.benchmark_all_configs(*args, 16859136, grid=grid(16859136))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/wf/cwfwcfiqllapezhy4wsh7mb3i7a2codkip2cnbeie26v347spvpq.py
# Original ATen: aten.mul, aten.sigmoid, aten.sigmoid_backward, aten.silu, aten.sum
# aten.mul => mul_968
# aten.sigmoid => sigmoid_15
# aten.sigmoid_backward => convert_element_type_688, convert_element_type_689, convert_element_type_690, mul_970, mul_971, sub_258
# aten.silu => convert_element_type_110, convert_element_type_111, mul_153, sigmoid_13
# aten.sum => sum_114
#
# NOTE(review): squeeze-excite gate backward.  For each of the 43008 =
# 128*336 (batch, channel) rows it sums grad * silu(x) over the 784 spatial
# positions (fp32 math), then multiplies by s*(1-s) where s = sigmoid of the
# per-row gate logit in in_ptr2 (the sigmoid-backward factor), and writes
# the fp16 result in place into in_out_ptr0 of shape (128, 336, 1, 1).
# String compiled verbatim by async_compile.triton(); kept byte-identical.
triton_per_fused_mul_sigmoid_sigmoid_backward_silu_sum_161 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import persistent_reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@persistent_reduction(
size_hints=[65536, 1024],
reduction_hint=ReductionHint.INNER,
filename=__file__,
meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp16', 4: 'i32', 5: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5), equal_to_1=())]}
)
@triton.jit
def triton_per_fused_mul_sigmoid_sigmoid_backward_silu_sum_161(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, xnumel, rnumel, XBLOCK : tl.constexpr):
xnumel = 43008
rnumel = 784
RBLOCK: tl.constexpr = 1024
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
xmask = xindex < xnumel
rindex = tl.arange(0, RBLOCK)[None, :]
rmask = rindex < rnumel
r1 = rindex
x0 = xindex
tmp0 = tl.load(in_ptr0 + (r1 + (784*x0)), rmask, other=0).to(tl.float32)
tmp1 = tl.load(in_ptr1 + (r1 + (784*x0)), rmask, other=0).to(tl.float32)
tmp11 = tl.load(in_ptr2 + (x0), None).to(tl.float32)
tmp2 = tmp1.to(tl.float32)
tmp3 = tl.sigmoid(tmp2)
tmp4 = tmp2 * tmp3
tmp5 = tmp4.to(tl.float32)
tmp6 = tmp0 * tmp5
tmp8 = tl.where(rmask, tmp6, 0)
tmp9 = tl.sum(tmp8, 1)[:, None]
tmp10 = tmp9.to(tl.float32)
tmp12 = tl.sigmoid(tmp11)
tmp13 = tmp12.to(tl.float32)
tmp14 = 1.0
tmp15 = tmp14 - tmp13
tmp16 = tmp13 * tmp15
tmp17 = tmp10 * tmp16
tmp18 = tmp17.to(tl.float32)
tl.store(in_out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp18, None)
def get_args():
arg_0 = rand_strided((128, 336, 1, 1), (336, 1, 1, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 336, 28, 28), (263424, 784, 28, 1), device='cuda:0', dtype=torch.float16)
arg_2 = rand_strided((128, 336, 28, 28), (263424, 784, 28, 1), device='cuda:0', dtype=torch.float16)
arg_3 = rand_strided((128, 336, 1, 1), (336, 1, 1, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1, arg_2, arg_3,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_per_fused_mul_sigmoid_sigmoid_backward_silu_sum_161.run(*args, 43008, 784, grid=grid(43008), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_per_fused_mul_sigmoid_sigmoid_backward_silu_sum_161.benchmark_all_configs(*args, 43008, 784, grid=grid(43008))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/h6/ch66mav6cpspgg34bc2ckyogaizco43c7rwo2n4rkshr5666vpid.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_691
#
# NOTE(review): dtype-cast kernel: copies a 9408-element (336, 28, 1, 1)
# tensor from fp16 to fp32 elementwise (same pattern as kernel 159).
# String compiled verbatim by async_compile.triton(); kept byte-identical.
triton_poi_fused__to_copy_162 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[16384], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_162(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 9408
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((336, 28, 1, 1), (28, 1, 1, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((336, 28, 1, 1), (28, 1, 1, 1), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_162.run(*args, 9408, grid=grid(9408), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_162.benchmark_all_configs(*args, 9408, grid=grid(9408))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/fi/cfibawhd6jznqxl7q2atcugp7zx43kt5jtkntbghy42ljr2ytlun.py
# Original ATen: aten.add, aten.clone, aten.fill, aten.mul, aten.sigmoid, aten.sub
# aten.add => add_361
# aten.clone => clone_11
# aten.fill => full_like_36
# aten.mul => mul_972, mul_973, mul_974
# aten.sigmoid => sigmoid_100
# aten.sub => sub_259
#
# NOTE(review): in-place elementwise SiLU backward on a (128, 28, 1, 1)
# fp16 buffer: multiplies the incoming gradient by the silu derivative
# sigmoid(x) * (1 + x * (1 - sigmoid(x))) with x taken from in_ptr0.
# String compiled verbatim by async_compile.triton(); kept byte-identical.
triton_poi_fused_add_clone_fill_mul_sigmoid_sub_163 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4096], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_add_clone_fill_mul_sigmoid_sub_163(in_out_ptr0, in_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 3584
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_out_ptr0 + (x0), xmask).to(tl.float32)
tmp1 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
tmp2 = tl.sigmoid(tmp1)
tmp3 = 1.0
tmp4 = tmp3 - tmp2
tmp5 = tmp1 * tmp4
tmp6 = tmp5 + tmp3
tmp7 = tmp2 * tmp6
tmp8 = tmp0 * tmp7
tl.store(in_out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp8, xmask)
def get_args():
arg_0 = rand_strided((128, 28, 1, 1), (28, 1, 1, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 28, 1, 1), (28, 1, 1, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_add_clone_fill_mul_sigmoid_sub_163.run(*args, 3584, grid=grid(3584), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_add_clone_fill_mul_sigmoid_sub_163.benchmark_all_configs(*args, 3584, grid=grid(3584))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/yv/cyvint5v6fejjcxz54v5axwxdzytfpjmq7eoqsppyu45zsqim3l2.py
# Original ATen: aten._to_copy, aten.convolution_backward
# aten._to_copy => convert_element_type_694
# aten.convolution_backward => sum_116
#
# NOTE(review): bias-gradient reduction for convolution backward: sums a
# (128, 28, 1, 1) fp16 gradient over the 128-batch axis, casting the
# result to fp32 and writing a (28,) vector.
# String compiled verbatim by async_compile.triton(); kept byte-identical.
triton_per_fused__to_copy_convolution_backward_164 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import persistent_reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@persistent_reduction(
size_hints=[32, 128],
reduction_hint=ReductionHint.INNER,
filename=__file__,
meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32', 3: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 3), equal_to_1=())]}
)
@triton.jit
def triton_per_fused__to_copy_convolution_backward_164(in_ptr0, out_ptr1, xnumel, rnumel, XBLOCK : tl.constexpr):
xnumel = 28
rnumel = 128
RBLOCK: tl.constexpr = 128
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
xmask = xindex < xnumel
rindex = tl.arange(0, RBLOCK)[None, :]
rmask = rindex < rnumel
r1 = rindex
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0 + (28*r1)), rmask & xmask, other=0).to(tl.float32)
tmp2 = tl.where(rmask & xmask, tmp0, 0)
tmp3 = tl.sum(tmp2, 1)[:, None]
tmp4 = tmp3.to(tl.float32)
tl.store(out_ptr1 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp4, xmask)
def get_args():
arg_0 = rand_strided((128, 28, 1, 1), (28, 1, 1, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((28,), (1,), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_per_fused__to_copy_convolution_backward_164.run(*args, 28, 128, grid=grid(28), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_per_fused__to_copy_convolution_backward_164.benchmark_all_configs(*args, 28, 128, grid=grid(28))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/pk/cpkfpj7moawggcqkv5ok3cwxbz2nv6myxthuduj53xnw6qw3dwig.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_693
#
# NOTE(review): dtype-cast kernel: copies a 9408-element (28, 336, 1, 1)
# tensor from fp16 to fp32 elementwise (transposed-shape counterpart of
# kernel 162).  String compiled verbatim by async_compile.triton();
# kept byte-identical.
triton_poi_fused__to_copy_165 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[16384], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_165(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 9408
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((28, 336, 1, 1), (336, 1, 1, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((28, 336, 1, 1), (336, 1, 1, 1), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_165.run(*args, 9408, grid=grid(9408), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_165.benchmark_all_configs(*args, 9408, grid=grid(9408))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/q7/cq7zl54kvzo64w2eq2dfuf5wrsv6grvd6fo3lgq77a5kcc7cs6ek.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.add, aten.clone, aten.div, aten.fill, aten.mul, aten.native_batch_norm_backward, aten.sigmoid, aten.sub
# aten._native_batch_norm_legit_functional => convert_element_type_108
# aten.add => add_362, add_363
# aten.clone => clone_10
# aten.div => div_13
# aten.fill => full_like_37
# aten.mul => mul_969, mul_975, mul_976, mul_977
# aten.native_batch_norm_backward => convert_element_type_695, mul_978, mul_986, sub_261, sum_117, sum_118
# aten.sigmoid => sigmoid_101, sigmoid_15
# aten.sub => sub_260
#
# NOTE(review): fused SE + SiLU + batch-norm backward reduction.  Per
# channel x0 (336 channels), reducing over 100352 = 128 batches * 784
# spatial positions, the loop body:
#   1. recombines the upstream grad:  grad*sigmoid(gate) + pooled_grad/784
#      (the /784.0 redistributes the avg-pool gradient over the spatial dim);
#   2. multiplies by the silu derivative sigmoid(x)*(1 + x*(1-sigmoid(x)));
#   3. accumulates that grad into _tmp17 (-> out_ptr0, grad-sum / bias term)
#      and grad*(x - mean[in_ptr5]) into _tmp23 (-> out_ptr1);
#   4. after the loop, out_ptr2 = out_ptr1 * in_ptr6 (per-channel scale,
#      presumably inv_std — confirm against the consuming kernel).
# String compiled verbatim by async_compile.triton(); kept byte-identical.
triton_red_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_166 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@reduction(
size_hints=[512, 131072],
reduction_hint=ReductionHint.INNER,
filename=__file__,
meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp16', 4: '*fp16', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: '*fp32', 9: '*fp32', 10: 'i32', 11: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_166(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, in_ptr6, out_ptr0, out_ptr1, out_ptr2, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
xnumel = 336
rnumel = 100352
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
xmask = xindex < xnumel
rbase = tl.arange(0, RBLOCK)[None, :]
x0 = xindex
_tmp17 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
tmp20 = tl.load(in_ptr5 + (x0), xmask)
_tmp23 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
for roffset in range(0, rnumel, RBLOCK):
rindex = roffset + rbase
rmask = rindex < rnumel
r1 = rindex % 784
r2 = (rindex // 784)
tmp0 = tl.load(in_ptr0 + (r1 + (784*x0) + (263424*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp1 = tl.load(in_ptr1 + (x0 + (336*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp4 = tl.load(in_ptr2 + (x0 + (336*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp8 = tl.load(in_ptr3 + (r1 + (784*x0) + (263424*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp18 = tl.load(in_ptr4 + (r1 + (784*x0) + (263424*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp2 = tl.sigmoid(tmp1)
tmp3 = tmp0 * tmp2
tmp5 = 784.0
tmp6 = tmp4 / tmp5
tmp7 = tmp3 + tmp6
tmp9 = tl.sigmoid(tmp8)
tmp10 = 1.0
tmp11 = tmp10 - tmp9
tmp12 = tmp8 * tmp11
tmp13 = tmp12 + tmp10
tmp14 = tmp9 * tmp13
tmp15 = tmp7 * tmp14
tmp16 = tmp15.to(tl.float32)
_tmp17 = tl.where(rmask & xmask, _tmp17 + tmp16, _tmp17)
tmp19 = tmp18.to(tl.float32)
tmp21 = tmp19 - tmp20
tmp22 = tmp16 * tmp21
_tmp23 = tl.where(rmask & xmask, _tmp23 + tmp22, _tmp23)
tmp17 = tl.sum(_tmp17, 1)[:, None]
tl.store(out_ptr0 + x0, tmp17, xmask)
tmp23 = tl.sum(_tmp23, 1)[:, None]
tl.store(out_ptr1 + x0, tmp23, xmask)
tmp24 = tl.load(in_ptr6 + (x0), xmask)
tmp25 = tmp23 * tmp24
tl.store(out_ptr2 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp25, xmask)
def get_args():
arg_0 = rand_strided((128, 336, 28, 28), (263424, 784, 28, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 336, 1, 1), (336, 1, 1, 1), device='cuda:0', dtype=torch.float16)
arg_2 = rand_strided((128, 336, 1, 1), (336, 1, 1, 1), device='cuda:0', dtype=torch.float16)
arg_3 = rand_strided((128, 336, 28, 28), (263424, 784, 28, 1), device='cuda:0', dtype=torch.float16)
arg_4 = rand_strided((128, 336, 28, 28), (263424, 784, 28, 1), device='cuda:0', dtype=torch.float16)
arg_5 = rand_strided((1, 336, 1, 1), (336, 1, 1, 1), device='cuda:0', dtype=torch.float32)
arg_6 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32)
arg_7 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32)
arg_8 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32)
arg_9 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7, arg_8, arg_9,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_red_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_166.run(*args, 336, 100352, grid=grid(336), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_red_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_166.benchmark_all_configs(*args, 336, 100352, grid=grid(336))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/rc/crcc3il2qglpofbslbuwlevrp5dihafxcr4wxlvjfxaasqpi56hu.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.add, aten.clone, aten.div, aten.fill, aten.mul, aten.native_batch_norm_backward, aten.sigmoid, aten.sub
# aten._native_batch_norm_legit_functional => convert_element_type_108
# aten.add => add_362, add_363
# aten.clone => clone_10
# aten.div => div_13
# aten.fill => full_like_37
# aten.mul => mul_969, mul_975, mul_976, mul_977
# aten.native_batch_norm_backward => convert_element_type_695, mul_984, sub_261, sub_263, sub_264
# aten.sigmoid => sigmoid_101, sigmoid_15
# aten.sub => sub_260
triton_poi_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_167 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[67108864], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp16', 4: '*fp16', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: '*fp32', 9: '*fp32', 10: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10), equal_to_1=())]})
@triton.jit
def triton_poi_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_167(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, in_ptr6, in_ptr7, in_ptr8, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 33718272
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x3 = xindex
x4 = (xindex // 784)
x1 = (xindex // 784) % 336
tmp0 = tl.load(in_ptr0 + (x3), None).to(tl.float32)
tmp1 = tl.load(in_ptr1 + (x4), None).to(tl.float32)
tmp4 = tl.load(in_ptr2 + (x4), None).to(tl.float32)
tmp8 = tl.load(in_ptr3 + (x3), None).to(tl.float32)
tmp17 = tl.load(in_ptr4 + (x3), None).to(tl.float32)
tmp19 = tl.load(in_ptr5 + (x1), None)
tmp21 = tl.load(in_ptr6 + (x1), None)
tmp24 = tl.load(in_ptr7 + (x1), None)
tmp29 = tl.load(in_ptr8 + (x1), None)
tmp2 = tl.sigmoid(tmp1)
tmp3 = tmp0 * tmp2
tmp5 = 784.0
tmp6 = tmp4 / tmp5
tmp7 = tmp3 + tmp6
tmp9 = tl.sigmoid(tmp8)
tmp10 = 1.0
tmp11 = tmp10 - tmp9
tmp12 = tmp8 * tmp11
tmp13 = tmp12 + tmp10
tmp14 = tmp9 * tmp13
tmp15 = tmp7 * tmp14
tmp16 = tmp15.to(tl.float32)
tmp18 = tmp17.to(tl.float32)
tmp20 = tmp18 - tmp19
tmp22 = 9.964923469387754e-06
tmp23 = tmp21 * tmp22
tmp25 = tmp24 * tmp24
tmp26 = tmp23 * tmp25
tmp27 = tmp20 * tmp26
tmp28 = tmp16 - tmp27
tmp30 = tmp29 * tmp22
tmp31 = tmp28 - tmp30
tl.store(out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp31, None)
def get_args():
arg_0 = rand_strided((128, 336, 28, 28), (263424, 784, 28, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 336, 1, 1), (336, 1, 1, 1), device='cuda:0', dtype=torch.float16)
arg_2 = rand_strided((128, 336, 1, 1), (336, 1, 1, 1), device='cuda:0', dtype=torch.float16)
arg_3 = rand_strided((128, 336, 28, 28), (263424, 784, 28, 1), device='cuda:0', dtype=torch.float16)
arg_4 = rand_strided((128, 336, 28, 28), (263424, 784, 28, 1), device='cuda:0', dtype=torch.float16)
arg_5 = rand_strided((1, 336, 1, 1), (336, 1, 1, 1), device='cuda:0', dtype=torch.float32)
arg_6 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32)
arg_7 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32)
arg_8 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32)
arg_9 = rand_strided((128, 336, 28, 28), (263424, 784, 28, 1), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7, arg_8, arg_9,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_167.run(*args, 33718272, grid=grid(33718272), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__native_batch_norm_legit_functional_add_clone_div_fill_mul_native_batch_norm_backward_sigmoid_sub_167.benchmark_all_configs(*args, 33718272, grid=grid(33718272))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/7y/c7y3vo54x6g4hqnvyjz4k3ngokc66wmd3u7zsbjrmhorsy53o4sb.py
# Original ATen: aten.convolution_backward
# aten.convolution_backward => convolution_backward_112
triton_poi_fused_convolution_backward_168 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[33554432], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp32', 2: '*fp32', 3: '*fp16', 4: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4), equal_to_1=())]})
@triton.jit
def triton_poi_fused_convolution_backward_168(in_ptr0, in_ptr1, in_ptr2, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 16859136
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x2 = (xindex // 131712)
x3 = xindex % 131712
x1 = (xindex // 784) % 168
x4 = xindex
tmp0 = tl.load(in_ptr0 + (131712 + x3 + (263424*x2)), None)
tmp1 = tl.load(in_ptr1 + (168 + x1), None)
tmp2 = tl.load(in_ptr2 + (168 + x1), None)
tmp3 = tmp1 * tmp2
tmp4 = tmp0 * tmp3
tmp5 = tmp4.to(tl.float32)
tl.store(out_ptr0 + (x4 + tl.zeros([XBLOCK], tl.int32)), tmp5, None)
def get_args():
arg_0 = rand_strided((128, 336, 28, 28), (263424, 784, 28, 1), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32)
arg_2 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32)
arg_3 = rand_strided((128, 168, 28, 28), (131712, 784, 28, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1, arg_2, arg_3,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_convolution_backward_168.run(*args, 16859136, grid=grid(16859136), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_convolution_backward_168.benchmark_all_configs(*args, 16859136, grid=grid(16859136))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/6n/c6nredjp54hkor7tbo72xhpqqat3vpewekmkr4kkyypw6cgqm6ai.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_698
triton_poi_fused__to_copy_169 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[8192], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_169(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 4200
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((168, 1, 5, 5), (25, 25, 5, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((168, 1, 5, 5), (25, 25, 5, 1), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_169.run(*args, 4200, grid=grid(4200), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_169.benchmark_all_configs(*args, 4200, grid=grid(4200))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/ia/ciabi2inw7dvbjd5nathz3xfqaodfqjylvjwgsdeimxatolwwqks.py
# Original ATen: aten.convolution_backward
# aten.convolution_backward => convolution_backward_113
triton_poi_fused_convolution_backward_170 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[33554432], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp32', 2: '*fp32', 3: '*fp16', 4: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4), equal_to_1=())]})
@triton.jit
def triton_poi_fused_convolution_backward_170(in_ptr0, in_ptr1, in_ptr2, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 16859136
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x2 = (xindex // 131712)
x3 = xindex % 131712
x1 = (xindex // 784) % 168
x4 = xindex
tmp0 = tl.load(in_ptr0 + (x3 + (263424*x2)), None)
tmp1 = tl.load(in_ptr1 + (x1), None)
tmp2 = tl.load(in_ptr2 + (x1), None)
tmp3 = tmp1 * tmp2
tmp4 = tmp0 * tmp3
tmp5 = tmp4.to(tl.float32)
tl.store(out_ptr0 + (x4 + tl.zeros([XBLOCK], tl.int32)), tmp5, None)
def get_args():
arg_0 = rand_strided((128, 336, 28, 28), (263424, 784, 28, 1), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32)
arg_2 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32)
arg_3 = rand_strided((128, 168, 28, 28), (131712, 784, 28, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1, arg_2, arg_3,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_convolution_backward_170.run(*args, 16859136, grid=grid(16859136), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_convolution_backward_170.benchmark_all_configs(*args, 16859136, grid=grid(16859136))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/c4/cc4n2v6wqwhmmrx6e37v63wrmsqntez34qcciaujxy6fxsa2gk3o.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_699
triton_poi_fused__to_copy_171 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[2048], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_171(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 1512
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((168, 1, 3, 3), (9, 9, 3, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((168, 1, 3, 3), (9, 9, 3, 1), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_171.run(*args, 1512, grid=grid(1512), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_171.benchmark_all_configs(*args, 1512, grid=grid(1512))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/ak/cakbaw2uk6hhpcroelbhhfo57dmqlxwcgkogm2m6ds3sv57t2vqz.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_703
triton_poi_fused__to_copy_172 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[8192], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_172(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 4704
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((168, 28, 1, 1), (28, 1, 1, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((168, 28, 1, 1), (28, 1, 1, 1), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_172.run(*args, 4704, grid=grid(4704), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_172.benchmark_all_configs(*args, 4704, grid=grid(4704))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/ow/cowu2kdmjh4mnuy3o2jt5hyxyjhq6tnzxq4b3gah2vdhgpih4okk.py
# Original ATen: aten.cat
# aten.cat => cat_69
triton_poi_fused_cat_173 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_cat_173(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 2809856
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x2 = xindex
x0 = xindex % 21952
x1 = (xindex // 21952)
tmp0 = tl.load(in_ptr0 + (x2), None).to(tl.float32)
tl.store(out_ptr0 + (x0 + (43904*x1) + tl.zeros([XBLOCK], tl.int32)), tmp0, None)
def get_args():
arg_0 = rand_strided((128, 28, 28, 28), (21952, 784, 28, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 28, 28, 28), (43904, 784, 28, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_cat_173.run(*args, 2809856, grid=grid(2809856), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_cat_173.benchmark_all_configs(*args, 2809856, grid=grid(2809856))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/tu/ctuwkh7mmjcoxpqhlozf53j6kx7nu6pcnpqqmwor4d57vrqdsosn.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.add, aten.native_batch_norm_backward
# aten._native_batch_norm_legit_functional => convert_element_type_98
# aten.add => add_365
# aten.native_batch_norm_backward => convert_element_type_705, mul_999, sub_270, sum_121, sum_122
triton_red_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_174 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@reduction(
size_hints=[1024, 8192],
reduction_hint=ReductionHint.INNER,
filename=__file__,
meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp32', 4: '*fp32', 5: '*fp32', 6: 'i32', 7: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_174(in_ptr0, in_ptr1, in_ptr2, in_ptr3, out_ptr0, out_ptr1, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
xnumel = 728
rnumel = 7720
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
xmask = xindex < xnumel
rbase = tl.arange(0, RBLOCK)[None, :]
x1 = (xindex // 56)
x0 = xindex % 56
_tmp8 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
x3 = xindex
_tmp15 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
for roffset in range(0, rnumel, RBLOCK):
rindex = roffset + rbase
rmask = rindex < rnumel
r2 = rindex
tmp0 = r2 + (7720*x1)
tmp1 = 100352
tmp2 = tmp0 < tmp1
tmp3 = tl.load(in_ptr0 + ((784*x0) + (43904*(((r2 + (7720*x1)) // 784) % 128)) + ((r2 + (7720*x1)) % 784) + tl.zeros([XBLOCK, RBLOCK], tl.int32)), rmask & tmp2 & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp4 = tl.load(in_ptr1 + ((784*x0) + (43904*(((r2 + (7720*x1)) // 784) % 128)) + ((r2 + (7720*x1)) % 784) + tl.zeros([XBLOCK, RBLOCK], tl.int32)), rmask & tmp2 & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp5 = tmp3 + tmp4
tmp6 = tmp5.to(tl.float32)
tmp7 = tl.where(tmp2, tmp6, 0)
_tmp8 = tl.where(rmask & xmask, _tmp8 + tmp7, _tmp8)
tmp9 = tl.load(in_ptr2 + ((784*x0) + (43904*(((r2 + (7720*x1)) // 784) % 128)) + ((r2 + (7720*x1)) % 784) + tl.zeros([XBLOCK, RBLOCK], tl.int32)), rmask & tmp2 & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp10 = tmp9.to(tl.float32)
tmp11 = tl.load(in_ptr3 + (x0 + tl.zeros([XBLOCK, RBLOCK], tl.int32)), rmask & tmp2 & xmask, eviction_policy='evict_last', other=0)
tmp12 = tmp10 - tmp11
tmp13 = tmp6 * tmp12
tmp14 = tl.where(tmp2, tmp13, 0)
_tmp15 = tl.where(rmask & xmask, _tmp15 + tmp14, _tmp15)
tmp8 = tl.sum(_tmp8, 1)[:, None]
tl.store(out_ptr0 + x3, tmp8, xmask)
tmp15 = tl.sum(_tmp15, 1)[:, None]
tl.store(out_ptr1 + x3, tmp15, xmask)
def get_args():
arg_0 = rand_strided((128, 56, 28, 28), (43904, 784, 28, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 56, 28, 28), (43904, 784, 28, 1), device='cuda:0', dtype=torch.float16)
arg_2 = rand_strided((128, 56, 28, 28), (43904, 784, 28, 1), device='cuda:0', dtype=torch.float16)
arg_3 = rand_strided((1, 56, 1, 1), (56, 1, 1, 1), device='cuda:0', dtype=torch.float32)
arg_4 = rand_strided((56, 13), (1, 56), device='cuda:0', dtype=torch.float32)
arg_5 = rand_strided((56, 13), (1, 56), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_red_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_174.run(*args, 728, 7720, grid=grid(728), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_red_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_174.benchmark_all_configs(*args, 728, 7720, grid=grid(728))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/3q/c3qu2jno53qez6kspx5ta57fq4vavwnpqfdqbctsz2h5gfllev2o.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.add, aten.native_batch_norm_backward
# aten._native_batch_norm_legit_functional => convert_element_type_98
# aten.add => add_365
# aten.native_batch_norm_backward => convert_element_type_705, convert_element_type_707, mul_1005, mul_1006, sub_270, sub_272, sub_273
triton_poi_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_175 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[8388608], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp32', 4: '*fp32', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: '*fp16', 9: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9), equal_to_1=())]})
@triton.jit
def triton_poi_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_175(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, in_ptr6, in_ptr7, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 5619712
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x3 = xindex
x1 = (xindex // 784) % 56
tmp0 = tl.load(in_ptr0 + (x3), None).to(tl.float32)
tmp1 = tl.load(in_ptr1 + (x3), None).to(tl.float32)
tmp4 = tl.load(in_ptr2 + (x3), None).to(tl.float32)
tmp6 = tl.load(in_ptr3 + (x1), None)
tmp8 = tl.load(in_ptr4 + (x1), None)
tmp11 = tl.load(in_ptr5 + (x1), None)
tmp16 = tl.load(in_ptr6 + (x1), None)
tmp19 = tl.load(in_ptr7 + (x1), None)
tmp2 = tmp0 + tmp1
tmp3 = tmp2.to(tl.float32)
tmp5 = tmp4.to(tl.float32)
tmp7 = tmp5 - tmp6
tmp9 = 9.964923469387754e-06
tmp10 = tmp8 * tmp9
tmp12 = tmp11 * tmp11
tmp13 = tmp10 * tmp12
tmp14 = tmp7 * tmp13
tmp15 = tmp3 - tmp14
tmp17 = tmp16 * tmp9
tmp18 = tmp15 - tmp17
tmp20 = tmp11 * tmp19
tmp21 = tmp18 * tmp20
tmp22 = tmp21.to(tl.float32)
tl.store(out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp22, None)
def get_args():
arg_0 = rand_strided((128, 56, 28, 28), (43904, 784, 28, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 56, 28, 28), (43904, 784, 28, 1), device='cuda:0', dtype=torch.float16)
arg_2 = rand_strided((128, 56, 28, 28), (43904, 784, 28, 1), device='cuda:0', dtype=torch.float16)
arg_3 = rand_strided((1, 56, 1, 1), (56, 1, 1, 1), device='cuda:0', dtype=torch.float32)
arg_4 = rand_strided((56,), (1,), device='cuda:0', dtype=torch.float32)
arg_5 = rand_strided((56,), (1,), device='cuda:0', dtype=torch.float32)
arg_6 = rand_strided((56,), (1,), device='cuda:0', dtype=torch.float32)
arg_7 = rand_strided((56,), (1,), device='cuda:0', dtype=torch.float32)
arg_8 = rand_strided((128, 56, 28, 28), (43904, 784, 28, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7, arg_8,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_175.run(*args, 5619712, grid=grid(5619712), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_175.benchmark_all_configs(*args, 5619712, grid=grid(5619712))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/he/cheuhrmgcxe3wl2kq52bt7crzek5t4ji6b56jrybsdffzrqao7el.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.add, aten.native_batch_norm_backward
# aten._native_batch_norm_legit_functional => convert_element_type_76
# aten.add => add_365, add_370
# aten.native_batch_norm_backward => convert_element_type_727, mul_1039, sub_286, sum_130, sum_131
triton_red_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_176 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@reduction(
size_hints=[1024, 8192],
reduction_hint=ReductionHint.INNER,
filename=__file__,
meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp16', 4: '*fp32', 5: '*fp32', 6: '*fp32', 7: 'i32', 8: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_176(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, out_ptr0, out_ptr1, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
xnumel = 728
rnumel = 7720
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
xmask = xindex < xnumel
rbase = tl.arange(0, RBLOCK)[None, :]
x1 = (xindex // 56)
x0 = xindex % 56
_tmp10 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
x3 = xindex
_tmp17 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
for roffset in range(0, rnumel, RBLOCK):
rindex = roffset + rbase
rmask = rindex < rnumel
r2 = rindex
tmp0 = r2 + (7720*x1)
tmp1 = 100352
tmp2 = tmp0 < tmp1
tmp3 = tl.load(in_ptr0 + ((784*x0) + (43904*(((r2 + (7720*x1)) // 784) % 128)) + ((r2 + (7720*x1)) % 784) + tl.zeros([XBLOCK, RBLOCK], tl.int32)), rmask & tmp2 & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp4 = tl.load(in_ptr1 + ((784*x0) + (43904*(((r2 + (7720*x1)) // 784) % 128)) + ((r2 + (7720*x1)) % 784) + tl.zeros([XBLOCK, RBLOCK], tl.int32)), rmask & tmp2 & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp5 = tmp3 + tmp4
tmp6 = tl.load(in_ptr2 + ((784*x0) + (43904*(((r2 + (7720*x1)) // 784) % 128)) + ((r2 + (7720*x1)) % 784) + tl.zeros([XBLOCK, RBLOCK], tl.int32)), rmask & tmp2 & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp7 = tmp5 + tmp6
tmp8 = tmp7.to(tl.float32)
tmp9 = tl.where(tmp2, tmp8, 0)
_tmp10 = tl.where(rmask & xmask, _tmp10 + tmp9, _tmp10)
tmp11 = tl.load(in_ptr3 + ((784*x0) + (43904*(((r2 + (7720*x1)) // 784) % 128)) + ((r2 + (7720*x1)) % 784) + tl.zeros([XBLOCK, RBLOCK], tl.int32)), rmask & tmp2 & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp12 = tmp11.to(tl.float32)
tmp13 = tl.load(in_ptr4 + (x0 + tl.zeros([XBLOCK, RBLOCK], tl.int32)), rmask & tmp2 & xmask, eviction_policy='evict_last', other=0)
tmp14 = tmp12 - tmp13
tmp15 = tmp8 * tmp14
tmp16 = tl.where(tmp2, tmp15, 0)
_tmp17 = tl.where(rmask & xmask, _tmp17 + tmp16, _tmp17)
tmp10 = tl.sum(_tmp10, 1)[:, None]
tl.store(out_ptr0 + x3, tmp10, xmask)
tmp17 = tl.sum(_tmp17, 1)[:, None]
tl.store(out_ptr1 + x3, tmp17, xmask)
def get_args():
arg_0 = rand_strided((128, 56, 28, 28), (43904, 784, 28, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 56, 28, 28), (43904, 784, 28, 1), device='cuda:0', dtype=torch.float16)
arg_2 = rand_strided((128, 56, 28, 28), (43904, 784, 28, 1), device='cuda:0', dtype=torch.float16)
arg_3 = rand_strided((128, 56, 28, 28), (43904, 784, 28, 1), device='cuda:0', dtype=torch.float16)
arg_4 = rand_strided((1, 56, 1, 1), (56, 1, 1, 1), device='cuda:0', dtype=torch.float32)
arg_5 = rand_strided((56, 13), (1, 56), device='cuda:0', dtype=torch.float32)
arg_6 = rand_strided((56, 13), (1, 56), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_red_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_176.run(*args, 728, 7720, grid=grid(728), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_red_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_176.benchmark_all_configs(*args, 728, 7720, grid=grid(728))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/6w/c6wk3ggkhqc4dpm4svd2nw2x7bsbl6xzqqaoct6gtlyubuviyxj6.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.add, aten.native_batch_norm_backward
# aten._native_batch_norm_legit_functional => convert_element_type_76
# aten.add => add_365, add_370
# aten.native_batch_norm_backward => convert_element_type_727, mul_1045, mul_1046, sub_286, sub_288, sub_289
triton_poi_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_177 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[8388608], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: '*fp16', 4: '*fp32', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: '*fp32', 9: '*fp32', 10: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10), equal_to_1=())]})
@triton.jit
def triton_poi_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_177(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, in_ptr6, in_ptr7, in_ptr8, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 5619712
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x3 = xindex
x1 = (xindex // 784) % 56
tmp0 = tl.load(in_ptr0 + (x3), None).to(tl.float32)
tmp1 = tl.load(in_ptr1 + (x3), None).to(tl.float32)
tmp3 = tl.load(in_ptr2 + (x3), None).to(tl.float32)
tmp6 = tl.load(in_ptr3 + (x3), None).to(tl.float32)
tmp8 = tl.load(in_ptr4 + (x1), None)
tmp10 = tl.load(in_ptr5 + (x1), None)
tmp13 = tl.load(in_ptr6 + (x1), None)
tmp18 = tl.load(in_ptr7 + (x1), None)
tmp21 = tl.load(in_ptr8 + (x1), None)
tmp2 = tmp0 + tmp1
tmp4 = tmp2 + tmp3
tmp5 = tmp4.to(tl.float32)
tmp7 = tmp6.to(tl.float32)
tmp9 = tmp7 - tmp8
tmp11 = 9.964923469387754e-06
tmp12 = tmp10 * tmp11
tmp14 = tmp13 * tmp13
tmp15 = tmp12 * tmp14
tmp16 = tmp9 * tmp15
tmp17 = tmp5 - tmp16
tmp19 = tmp18 * tmp11
tmp20 = tmp17 - tmp19
tmp22 = tmp13 * tmp21
tmp23 = tmp20 * tmp22
tl.store(out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp23, None)
def get_args():
arg_0 = rand_strided((128, 56, 28, 28), (43904, 784, 28, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 56, 28, 28), (43904, 784, 28, 1), device='cuda:0', dtype=torch.float16)
arg_2 = rand_strided((128, 56, 28, 28), (43904, 784, 28, 1), device='cuda:0', dtype=torch.float16)
arg_3 = rand_strided((128, 56, 28, 28), (43904, 784, 28, 1), device='cuda:0', dtype=torch.float16)
arg_4 = rand_strided((1, 56, 1, 1), (56, 1, 1, 1), device='cuda:0', dtype=torch.float32)
arg_5 = rand_strided((56,), (1,), device='cuda:0', dtype=torch.float32)
arg_6 = rand_strided((56,), (1,), device='cuda:0', dtype=torch.float32)
arg_7 = rand_strided((56,), (1,), device='cuda:0', dtype=torch.float32)
arg_8 = rand_strided((56,), (1,), device='cuda:0', dtype=torch.float32)
arg_9 = rand_strided((128, 56, 28, 28), (43904, 784, 28, 1), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7, arg_8, arg_9,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_177.run(*args, 5619712, grid=grid(5619712), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_177.benchmark_all_configs(*args, 5619712, grid=grid(5619712))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/za/czavvoeeqv63efdv32prg5yimwonjxhjdak5qwnkzrdkukuwv7ls.py
# Original ATen: aten.convolution_backward
# aten.convolution_backward => convolution_backward_124
triton_poi_fused_convolution_backward_178 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_convolution_backward_178(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 2809856
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex % 21952
x1 = (xindex // 21952)
x2 = xindex
tmp0 = tl.load(in_ptr0 + (21952 + x0 + (43904*x1)), None)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp1, None)
def get_args():
arg_0 = rand_strided((128, 56, 28, 28), (43904, 784, 28, 1), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((128, 28, 28, 28), (21952, 784, 28, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_convolution_backward_178.run(*args, 2809856, grid=grid(2809856), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_convolution_backward_178.benchmark_all_configs(*args, 2809856, grid=grid(2809856))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/4j/c4jmqpkqr4fnrmxwsxtkllpzeruqpmftklalesh626aid62ltfs5.py
# Original ATen: aten.convolution_backward
# aten.convolution_backward => convolution_backward_125
triton_poi_fused_convolution_backward_179 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_convolution_backward_179(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 2809856
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex % 21952
x1 = (xindex // 21952)
x2 = xindex
tmp0 = tl.load(in_ptr0 + (x0 + (43904*x1)), None)
tmp1 = tmp0.to(tl.float32)