from ctypes import c_void_p, c_long
import torch
import math
import random
import os
import tempfile
from torch._inductor.utils import maybe_profile
from torch import empty_strided, as_strided, device
from torch._inductor.codecache import AsyncCompile
from torch._inductor.select_algorithm import extern_kernels
aten = torch.ops.aten
assert_size_stride = torch._C._dynamo.guards.assert_size_stride
async_compile = AsyncCompile()
import triton
import triton.language as tl
from torch._inductor.triton_heuristics import grid, start_graph, end_graph
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
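# Note on structure: everything below is TorchInductor-generated output. Each
# kernel is compiled from a source string via async_compile.triton(...), and
# alongside the @triton.jit kernel each string defines three helpers:
# get_args() allocates representative inputs with rand_strided, call(args)
# launches the kernel on CUDA stream 0, and benchmark_all_configs(args) times
# every autotuning config. Each string is also a standalone script: run as
# __main__, it benchmarks the kernel with triton.testing.do_bench and prints
# latency, bytes moved, and achieved bandwidth. A minimal sketch of exercising
# one kernel by hand (assuming a CUDA device is available):
#
#     args = get_args()                # allocate inputs/outputs on cuda:0
#     call(args)                       # single launch on stream 0
#     benchmark_all_configs(args)      # timing sweep over autotune configs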
# kernel path: /tmp/torchinductor_shunting/cd/ccd7accawcy3jndrehd4geosuzoh6rxkzjlqnojxb5msgrcv7udi.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type
triton_poi_fused__to_copy_0 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[1024], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_0(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 864
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)

def get_args():
    arg_0 = rand_strided((32, 3, 3, 3), (27, 9, 3, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((32, 3, 3, 3), (27, 9, 3, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,

def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_0.run(*args, 864, grid=grid(864), stream=stream0)

def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_0.benchmark_all_configs(*args, 864, grid=grid(864))

if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
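# Note: the _to_copy kernels above and below implement fp32 -> fp16 casts
# (aten._to_copy with a dtype change). The output pointer is declared '*fp16'
# in the signature, so the tl.store appears to narrow the value to half
# precision implicitly; the intermediate tmp0.to(tl.float32) is a no-op that
# the code generator emits regardless.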
# kernel path: /tmp/torchinductor_shunting/4z/c4zfgpaaduweifmtujob2ufcjjkrkqa7mb5j2qaaysmstief35xb.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_1
triton_poi_fused__to_copy_1 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[33554432], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_1(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 19267584
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), None)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, None)

def get_args():
    arg_0 = rand_strided((128, 3, 224, 224), (150528, 50176, 224, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((128, 3, 224, 224), (150528, 50176, 224, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,

def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_1.run(*args, 19267584, grid=grid(19267584), stream=stream0)

def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_1.benchmark_all_configs(*args, 19267584, grid=grid(19267584))

if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/4v/c4vmg7ssybcn2qvzgu3tqbkt62uopduyelvkcwxaqv2fc4glkb57.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => convert_element_type_2, var_mean
triton_red_fused__native_batch_norm_legit_functional_2 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@reduction(
    size_hints=[512, 131072],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32', 3: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_2(in_ptr0, out_ptr0, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
    xnumel = 448
    rnumel = 114688
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rbase = tl.arange(0, RBLOCK)[None, :]
    x0 = xindex % 32
    x1 = (xindex // 32)
    _tmp2 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
    x3 = xindex
    for roffset in range(0, rnumel, RBLOCK):
        rindex = roffset + rbase
        rmask = rindex < rnumel
        r2 = rindex
        tmp0 = tl.load(in_ptr0 + ((112*(((r2 + (114688*x1)) // 112) % 112)) + (12544*x0) + (401408*((r2 + (114688*x1)) // 12544)) + (r2 % 112)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp1 = tmp0.to(tl.float32)
        _tmp2 = tl.where(rmask & xmask, _tmp2 + tmp1, _tmp2)
    tmp2 = tl.sum(_tmp2, 1)[:, None]
    tl.store(out_ptr0 + x3, tmp2, xmask)

def get_args():
    arg_0 = rand_strided((128, 32, 112, 112), (401408, 12544, 112, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((1, 32, 1, 1, 14), (448, 1, 448, 448, 32), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1,

def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_red_fused__native_batch_norm_legit_functional_2.run(*args, 448, 114688, grid=grid(448), stream=stream0)

def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_red_fused__native_batch_norm_legit_functional_2.benchmark_all_configs(*args, 448, 114688, grid=grid(448))

if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/ax/caxxuenolffrdcs4fes5eiayfgrbnuyzkml277uyriuhejqsq7k4.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => add_2, convert_element_type_2, mul_1, mul_2, var_mean
triton_per_fused__native_batch_norm_legit_functional_3 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import persistent_reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@persistent_reduction(
    size_hints=[32, 16],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*fp32', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: 'i32', 5: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4), equal_to_1=())]}
)
@triton.jit
def triton_per_fused__native_batch_norm_legit_functional_3(in_out_ptr0, in_ptr0, in_ptr1, out_ptr0, xnumel, rnumel, XBLOCK : tl.constexpr):
    xnumel = 32
    rnumel = 14
    RBLOCK: tl.constexpr = 16
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rindex = tl.arange(0, RBLOCK)[None, :]
    rmask = rindex < rnumel
    r1 = rindex
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0 + (32*r1)), rmask & xmask, other=0)
    tmp8 = tl.load(in_ptr1 + (x0), xmask)
    tmp2 = tl.where(rmask & xmask, tmp0, 0)
    tmp3 = tl.sum(tmp2, 1)[:, None]
    tmp4 = 1605632.0
    tmp5 = tmp3 / tmp4
    tmp6 = 0.1
    tmp7 = tmp5 * tmp6
    tmp9 = 0.9
    tmp10 = tmp8 * tmp9
    tmp11 = tmp7 + tmp10
    tl.store(in_out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp5, xmask)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp11, xmask)

def get_args():
    arg_0 = rand_strided((1, 32, 1, 1), (32, 1, 32, 32), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((1, 32, 1, 1, 14), (448, 1, 448, 448, 32), device='cuda:0', dtype=torch.float32)
    arg_2 = rand_strided((32,), (1,), device='cuda:0', dtype=torch.float32)
    arg_3 = rand_strided((32,), (1,), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1, arg_2, arg_3,

def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_per_fused__native_batch_norm_legit_functional_3.run(*args, 32, 14, grid=grid(32), stream=stream0)

def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_per_fused__native_batch_norm_legit_functional_3.benchmark_all_configs(*args, 32, 14, grid=grid(32))

if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
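# Note: the previous two kernels compute the per-channel mean as a split
# reduction. Each of the 32 channels covers 128 * 112 * 112 = 1,605,632
# elements; kernel 2 splits them into 14 chunks of 114,688 and writes partial
# sums into a (1, 32, 1, 1, 14) buffer, then kernel 3 sums the 14 partials,
# divides by 1,605,632 to get the batch mean, and updates the running mean as
# 0.1 * batch_mean + 0.9 * running_mean (momentum 0.1).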
# kernel path: /tmp/torchinductor_shunting/4s/c4sugoluux54xogsh3dgakggor26tpyllwcenrcu3hdxyyol5rhg.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => convert_element_type_2, var_mean
triton_red_fused__native_batch_norm_legit_functional_4 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@reduction(
    size_hints=[512, 131072],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*fp16', 1: '*fp32', 2: '*fp32', 3: 'i32', 4: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_4(in_ptr0, in_ptr1, out_ptr0, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
    xnumel = 448
    rnumel = 114688
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rbase = tl.arange(0, RBLOCK)[None, :]
    x0 = xindex % 32
    x1 = (xindex // 32)
    tmp2 = tl.load(in_ptr1 + (x0), xmask)
    _tmp5 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
    x3 = xindex
    for roffset in range(0, rnumel, RBLOCK):
        rindex = roffset + rbase
        rmask = rindex < rnumel
        r2 = rindex
        tmp0 = tl.load(in_ptr0 + ((112*(((r2 + (114688*x1)) // 112) % 112)) + (12544*x0) + (401408*((r2 + (114688*x1)) // 12544)) + (r2 % 112)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp1 = tmp0.to(tl.float32)
        tmp3 = tmp1 - tmp2
        tmp4 = tmp3 * tmp3
        _tmp5 = tl.where(rmask & xmask, _tmp5 + tmp4, _tmp5)
    tmp5 = tl.sum(_tmp5, 1)[:, None]
    tl.store(out_ptr0 + x3, tmp5, xmask)

def get_args():
    arg_0 = rand_strided((128, 32, 112, 112), (401408, 12544, 112, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((1, 32, 1, 1), (32, 1, 32, 32), device='cuda:0', dtype=torch.float32)
    arg_2 = rand_strided((1, 32, 1, 1, 14), (448, 1, 448, 448, 32), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1, arg_2,

def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_red_fused__native_batch_norm_legit_functional_4.run(*args, 448, 114688, grid=grid(448), stream=stream0)

def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_red_fused__native_batch_norm_legit_functional_4.benchmark_all_configs(*args, 448, 114688, grid=grid(448))

if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/yz/cyzfzxq5m7mzh26evqlnxbs2f4624mxxzlbzx5v335w7yujg6hp5.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => add_1, add_3, convert_element_type_2, mul_3, mul_4, mul_5, rsqrt, squeeze_1, var_mean
triton_per_fused__native_batch_norm_legit_functional_5 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import persistent_reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@persistent_reduction(
    size_hints=[32, 16],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*fp32', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: 'i32', 6: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5), equal_to_1=())]}
)
@triton.jit
def triton_per_fused__native_batch_norm_legit_functional_5(in_ptr0, in_ptr1, out_ptr0, out_ptr1, out_ptr2, xnumel, rnumel, XBLOCK : tl.constexpr):
    xnumel = 32
    rnumel = 14
    RBLOCK: tl.constexpr = 16
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rindex = tl.arange(0, RBLOCK)[None, :]
    rmask = rindex < rnumel
    r1 = rindex
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0 + (32*r1)), rmask & xmask, other=0)
    tmp13 = tl.load(in_ptr1 + (x0), xmask)
    tmp2 = tl.where(rmask & xmask, tmp0, 0)
    tmp3 = tl.sum(tmp2, 1)[:, None]
    tmp4 = 1605632.0
    tmp5 = tmp3 / tmp4
    tmp6 = 1e-05
    tmp7 = tmp5 + tmp6
    tmp8 = tl.math.rsqrt(tmp7)
    tmp9 = 1.0000006228081046
    tmp10 = tmp5 * tmp9
    tmp11 = 0.1
    tmp12 = tmp10 * tmp11
    tmp14 = 0.9
    tmp15 = tmp13 * tmp14
    tmp16 = tmp12 + tmp15
    tl.store(out_ptr1 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp8, xmask)
    tl.store(out_ptr2 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp16, xmask)
    tl.store(out_ptr0 + x0, tmp3, xmask)

def get_args():
    arg_0 = rand_strided((1, 32, 1, 1, 14), (448, 1, 448, 448, 32), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((32,), (1,), device='cuda:0', dtype=torch.float32)
    arg_2 = rand_strided((1, 32, 1, 1), (32, 1, 32, 32), device='cuda:0', dtype=torch.float32)
    arg_3 = rand_strided((32,), (1,), device='cuda:0', dtype=torch.float32)
    arg_4 = rand_strided((32,), (1,), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1, arg_2, arg_3, arg_4,

def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_per_fused__native_batch_norm_legit_functional_5.run(*args, 32, 14, grid=grid(32), stream=stream0)

def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_per_fused__native_batch_norm_legit_functional_5.benchmark_all_configs(*args, 32, 14, grid=grid(32))

if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
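# Note: kernels 4 and 5 finish the batch-norm statistics. Kernel 4 accumulates
# per-chunk sums of squared deviations from the mean; kernel 5 reduces them,
# divides by 1,605,632 to get the biased variance, computes
# rsqrt(var + 1e-05) for normalization, and scales the variance by
# 1.0000006228081046 = N / (N - 1) = 1605632 / 1605631 (Bessel's correction)
# before folding it into the running variance with momentum 0.1.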
# kernel path: /tmp/torchinductor_shunting/tp/ctp3rh47jmesldsykk2bkq62s2n65nizu7jrip6s664llvntz6jk.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.relu
# aten._native_batch_norm_legit_functional => add_1, add_4, convert_element_type_2, convert_element_type_3, mul, mul_6, rsqrt, sub, var_mean
# aten.relu => relu
triton_poi_fused__native_batch_norm_legit_functional_relu_6 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[67108864], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp16', 6: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6), equal_to_1=())]})
@triton.jit
def triton_poi_fused__native_batch_norm_legit_functional_relu_6(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 51380224
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x3 = xindex
    x1 = (xindex // 12544) % 32
    tmp0 = tl.load(in_ptr0 + (x3), None).to(tl.float32)
    tmp2 = tl.load(in_ptr1 + (x1), None)
    tmp4 = tl.load(in_ptr2 + (x1), None)
    tmp11 = tl.load(in_ptr3 + (x1), None)
    tmp13 = tl.load(in_ptr4 + (x1), None)
    tmp1 = tmp0.to(tl.float32)
    tmp3 = tmp1 - tmp2
    tmp5 = 1605632.0
    tmp6 = tmp4 / tmp5
    tmp7 = 1e-05
    tmp8 = tmp6 + tmp7
    tmp9 = tl.math.rsqrt(tmp8)
    tmp10 = tmp3 * tmp9
    tmp12 = tmp10 * tmp11
    tmp14 = tmp12 + tmp13
    tmp15 = tmp14.to(tl.float32)
    tmp16 = tl.where(0 != 0, 0, tl.where(0 > tmp15, 0, tmp15))
    tl.store(out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp16, None)

def get_args():
    arg_0 = rand_strided((128, 32, 112, 112), (401408, 12544, 112, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((1, 32, 1, 1), (32, 1, 32, 32), device='cuda:0', dtype=torch.float32)
    arg_2 = rand_strided((1, 32, 1, 1), (32, 1, 32, 32), device='cuda:0', dtype=torch.float32)
    arg_3 = rand_strided((32,), (1,), device='cuda:0', dtype=torch.float32)
    arg_4 = rand_strided((32,), (1,), device='cuda:0', dtype=torch.float32)
    arg_5 = rand_strided((128, 32, 112, 112), (401408, 12544, 112, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5,

def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__native_batch_norm_legit_functional_relu_6.run(*args, 51380224, grid=grid(51380224), stream=stream0)

def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__native_batch_norm_legit_functional_relu_6.benchmark_all_configs(*args, 51380224, grid=grid(51380224))

if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
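# Note: the kernel above fuses the batch-norm apply step with the ReLU: it
# normalizes with the freshly computed mean and rsqrt(var + eps), applies the
# affine weight/bias, casts back to fp16, and clamps at zero. The generated
# tl.where(0 != 0, 0, tl.where(0 > tmp15, 0, tmp15)) is just relu(tmp15)
# spelled out by the code generator.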
# kernel path: /tmp/torchinductor_shunting/bt/cbt6gouhluogn74n5bu4s6rx7uqiiw6lwhnqaggedwz4abkhco2b.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_4
triton_poi_fused__to_copy_7 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[512], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_7(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 288
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)

def get_args():
    arg_0 = rand_strided((32, 1, 3, 3), (9, 9, 3, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((32, 1, 3, 3), (9, 9, 3, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,

def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_7.run(*args, 288, grid=grid(288), stream=stream0)

def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_7.benchmark_all_configs(*args, 288, grid=grid(288))

if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/tq/ctqzpnsuriwiat75pazkds4p5xe54pkhlftlkhx4o3jngunwgzub.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_7
triton_poi_fused__to_copy_8 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[1024], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_8(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 1024
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)

def get_args():
    arg_0 = rand_strided((32, 32, 1, 1), (32, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((32, 32, 1, 1), (32, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,

def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_8.run(*args, 1024, grid=grid(1024), stream=stream0)

def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_8.benchmark_all_configs(*args, 1024, grid=grid(1024))

if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/ma/cma4u56mzuidifjz56xk54ie6sgj6png7qekm2tx4cs7sle553zr.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.add
# aten._native_batch_norm_legit_functional => add_11, add_14, convert_element_type_8, convert_element_type_9, mul_14, mul_20, rsqrt_2, sub_2, var_mean_2
# aten.add => add_15
triton_poi_fused__native_batch_norm_legit_functional_add_9 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[67108864], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp16', 6: '*fp16', 7: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7), equal_to_1=())]})
@triton.jit
def triton_poi_fused__native_batch_norm_legit_functional_add_9(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 51380224
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x3 = xindex
    x1 = (xindex // 12544) % 32
    tmp0 = tl.load(in_ptr0 + (x3), None).to(tl.float32)
    tmp2 = tl.load(in_ptr1 + (x1), None)
    tmp4 = tl.load(in_ptr2 + (x1), None)
    tmp11 = tl.load(in_ptr3 + (x1), None)
    tmp13 = tl.load(in_ptr4 + (x1), None)
    tmp16 = tl.load(in_ptr5 + (x3), None).to(tl.float32)
    tmp1 = tmp0.to(tl.float32)
    tmp3 = tmp1 - tmp2
    tmp5 = 1605632.0
    tmp6 = tmp4 / tmp5
    tmp7 = 1e-05
    tmp8 = tmp6 + tmp7
    tmp9 = tl.math.rsqrt(tmp8)
    tmp10 = tmp3 * tmp9
    tmp12 = tmp10 * tmp11
    tmp14 = tmp12 + tmp13
    tmp15 = tmp14.to(tl.float32)
    tmp17 = tmp15 + tmp16
    tl.store(out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp17, None)

def get_args():
    arg_0 = rand_strided((128, 32, 112, 112), (401408, 12544, 112, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((1, 32, 1, 1), (32, 1, 32, 32), device='cuda:0', dtype=torch.float32)
    arg_2 = rand_strided((1, 32, 1, 1), (32, 1, 32, 32), device='cuda:0', dtype=torch.float32)
    arg_3 = rand_strided((32,), (1,), device='cuda:0', dtype=torch.float32)
    arg_4 = rand_strided((32,), (1,), device='cuda:0', dtype=torch.float32)
    arg_5 = rand_strided((128, 32, 112, 112), (401408, 12544, 112, 1), device='cuda:0', dtype=torch.float16)
    arg_6 = rand_strided((128, 32, 112, 112), (401408, 12544, 112, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6,

def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__native_batch_norm_legit_functional_add_9.run(*args, 51380224, grid=grid(51380224), stream=stream0)

def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__native_batch_norm_legit_functional_add_9.benchmark_all_configs(*args, 51380224, grid=grid(51380224))

if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/mf/cmfq2hg7iduys7x4z6irkzuujylw3e7gj4ysdrcbo5cemukrg3t5.py
# Original ATen: aten.split_with_sizes
# aten.split_with_sizes => getitem_6
triton_poi_fused_split_with_sizes_10 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[33554432], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_split_with_sizes_10(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 25690112
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex % 200704
    x1 = (xindex // 200704)
    x2 = xindex
    tmp0 = tl.load(in_ptr0 + (x0 + (401408*x1)), None).to(tl.float32)
    tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp0, None)

def get_args():
    arg_0 = rand_strided((128, 32, 112, 112), (401408, 12544, 112, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 16, 112, 112), (200704, 12544, 112, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,

def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused_split_with_sizes_10.run(*args, 25690112, grid=grid(25690112), stream=stream0)

def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused_split_with_sizes_10.benchmark_all_configs(*args, 25690112, grid=grid(25690112))

if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/lr/clrdqz4vueql2no5zrrsw4bgmmsgoov4oautycbtlc4q5ypfrpxe.py
# Original ATen: aten.split_with_sizes
# aten.split_with_sizes => getitem_7
triton_poi_fused_split_with_sizes_11 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[33554432], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_split_with_sizes_11(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 25690112
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex % 200704
    x1 = (xindex // 200704)
    x2 = xindex
    tmp0 = tl.load(in_ptr0 + (200704 + x0 + (401408*x1)), None).to(tl.float32)
    tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp0, None)

def get_args():
    arg_0 = rand_strided((128, 32, 112, 112), (401408, 12544, 112, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 16, 112, 112), (200704, 12544, 112, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,

def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused_split_with_sizes_11.run(*args, 25690112, grid=grid(25690112), stream=stream0)

def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused_split_with_sizes_11.benchmark_all_configs(*args, 25690112, grid=grid(25690112))

if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
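# Note: the two split_with_sizes kernels above each copy one 16-channel half
# of a (128, 32, 112, 112) fp16 tensor into a contiguous (128, 16, 112, 112)
# output; the second half starts at element offset 200704 = 16 * 112 * 112
# within each batch item.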
# kernel path: /tmp/torchinductor_shunting/wk/cwkoecpvylkd2zd4p2qzrsxvdjpmlhjy2vqltsena3lm7tbpxnqv.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_10
triton_poi_fused__to_copy_12 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[2048], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_12(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 1536
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)

def get_args():
    arg_0 = rand_strided((96, 16, 1, 1), (16, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((96, 16, 1, 1), (16, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,

def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_12.run(*args, 1536, grid=grid(1536), stream=stream0)

def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_12.benchmark_all_configs(*args, 1536, grid=grid(1536))

if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/yj/cyjbfwxbzmq4snbyqnu5ehgb3xenlxtbixoczmyjqmdjbuu22kqa.py
# Original ATen: aten.cat
# aten.cat => cat
triton_poi_fused_cat_13 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[268435456], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_cat_13(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 154140672
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x2 = xindex
    x0 = xindex % 1204224
    x1 = (xindex // 1204224)
    tmp0 = tl.load(in_ptr0 + (x2), None).to(tl.float32)
    tl.store(out_ptr0 + (x0 + (2408448*x1) + tl.zeros([XBLOCK], tl.int32)), tmp0, None)

def get_args():
    arg_0 = rand_strided((128, 96, 112, 112), (1204224, 12544, 112, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 96, 112, 112), (2408448, 12544, 112, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,

def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused_cat_13.run(*args, 154140672, grid=grid(154140672), stream=stream0)

def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused_cat_13.benchmark_all_configs(*args, 154140672, grid=grid(154140672))

if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
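# Note: the cat kernel above writes a contiguous (128, 96, 112, 112) tensor
# into the first 96 channels of a (128, 192, 112, 112) output (per-batch
# stride 2408448 = 2 * 1204224); the remaining channels are presumably filled
# by another branch of the concatenation outside this excerpt.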
# kernel path: /tmp/torchinductor_shunting/nh/cnh3nbcj2c5nraypyukh7gkg7ocvsbjrgj7ok7wc472nt34wy65s.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => convert_element_type_12, var_mean_3
triton_red_fused__native_batch_norm_legit_functional_14 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@reduction(
    size_hints=[4096, 131072],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32', 3: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_14(in_ptr0, out_ptr0, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
    xnumel = 2496
    rnumel = 123511
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rbase = tl.arange(0, RBLOCK)[None, :]
    x1 = (xindex // 192)
    x0 = xindex % 192
    _tmp6 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
    x3 = xindex
    for roffset in range(0, rnumel, RBLOCK):
        rindex = roffset + rbase
        rmask = rindex < rnumel
        r2 = rindex
        tmp0 = r2 + (123511*x1)
        tmp1 = 1605632
        tmp2 = tmp0 < tmp1
        tmp3 = tl.load(in_ptr0 + ((12544*x0) + (2408448*(((r2 + (123511*x1)) // 12544) % 128)) + ((r2 + (123511*x1)) % 12544) + tl.zeros([XBLOCK, RBLOCK], tl.int32)), rmask & tmp2 & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp4 = tmp3.to(tl.float32)
        tmp5 = tl.where(tmp2, tmp4, 0)
        _tmp6 = tl.where(rmask & xmask, _tmp6 + tmp5, _tmp6)
    tmp6 = tl.sum(_tmp6, 1)[:, None]
    tl.store(out_ptr0 + x3, tmp6, xmask)

def get_args():
    arg_0 = rand_strided((128, 192, 112, 112), (2408448, 12544, 112, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((1, 192, 1, 1, 13), (2496, 1, 2496, 2496, 192), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1,

def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_red_fused__native_batch_norm_legit_functional_14.run(*args, 2496, 123511, grid=grid(2496), stream=stream0)

def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_red_fused__native_batch_norm_legit_functional_14.benchmark_all_configs(*args, 2496, 123511, grid=grid(2496))

if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
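# Note: kernel 14 reduces over 128 * 112 * 112 = 1,605,632 elements per
# channel using 13 chunks of 123,511. Since 13 * 123,511 = 1,605,643 slightly
# overshoots, each iteration carries the extra predicate
# tmp2 = (r2 + 123511*x1) < 1605632 to mask out the 11 out-of-range slots.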
# kernel path: /tmp/torchinductor_shunting/ed/cedbp3dhcc6xvctcve537pwhmiwi2ebf5oi5f5bppm5whslbp34w.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => add_18, convert_element_type_12, mul_22, mul_23, var_mean_3
triton_per_fused__native_batch_norm_legit_functional_15 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import persistent_reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@persistent_reduction(
    size_hints=[256, 16],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*fp32', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: 'i32', 5: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4), equal_to_1=())]}
)
@triton.jit
def triton_per_fused__native_batch_norm_legit_functional_15(in_out_ptr0, in_ptr0, in_ptr1, out_ptr0, xnumel, rnumel, XBLOCK : tl.constexpr):
    xnumel = 192
    rnumel = 13
    RBLOCK: tl.constexpr = 16
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rindex = tl.arange(0, RBLOCK)[None, :]
    rmask = rindex < rnumel
    r1 = rindex
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0 + (192*r1)), rmask & xmask, other=0)
    tmp8 = tl.load(in_ptr1 + (x0), xmask)
    tmp2 = tl.where(rmask & xmask, tmp0, 0)
    tmp3 = tl.sum(tmp2, 1)[:, None]
    tmp4 = 1605632.0
    tmp5 = tmp3 / tmp4
    tmp6 = 0.1
    tmp7 = tmp5 * tmp6
    tmp9 = 0.9
    tmp10 = tmp8 * tmp9
    tmp11 = tmp7 + tmp10
    tl.store(in_out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp5, xmask)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp11, xmask)

def get_args():
    arg_0 = rand_strided((1, 192, 1, 1), (192, 1, 192, 192), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((1, 192, 1, 1, 13), (2496, 1, 2496, 2496, 192), device='cuda:0', dtype=torch.float32)
    arg_2 = rand_strided((192,), (1,), device='cuda:0', dtype=torch.float32)
    arg_3 = rand_strided((192,), (1,), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1, arg_2, arg_3,

def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_per_fused__native_batch_norm_legit_functional_15.run(*args, 192, 13, grid=grid(192), stream=stream0)

def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_per_fused__native_batch_norm_legit_functional_15.benchmark_all_configs(*args, 192, 13, grid=grid(192))

if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/jk/cjk2vm3446xrk7rth7hr6pun7xxo3dnzubwcn6ydrpifal4eykrz.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => convert_element_type_12, var_mean_3
triton_red_fused__native_batch_norm_legit_functional_16 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@reduction(
    size_hints=[4096, 131072],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*fp16', 1: '*fp32', 2: '*fp32', 3: 'i32', 4: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_16(in_ptr0, in_ptr1, out_ptr0, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
    xnumel = 2496
    rnumel = 123511
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rbase = tl.arange(0, RBLOCK)[None, :]
    x1 = (xindex // 192)
    x0 = xindex % 192
    _tmp9 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
    x3 = xindex
    for roffset in range(0, rnumel, RBLOCK):
        rindex = roffset + rbase
        rmask = rindex < rnumel
        r2 = rindex
        tmp0 = r2 + (123511*x1)
        tmp1 = 1605632
        tmp2 = tmp0 < tmp1
        tmp3 = tl.load(in_ptr0 + ((12544*x0) + (2408448*(((r2 + (123511*x1)) // 12544) % 128)) + ((r2 + (123511*x1)) % 12544) + tl.zeros([XBLOCK, RBLOCK], tl.int32)), rmask & tmp2 & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp4 = tmp3.to(tl.float32)
        tmp5 = tl.load(in_ptr1 + (x0 + tl.zeros([XBLOCK, RBLOCK], tl.int32)), rmask & tmp2 & xmask, eviction_policy='evict_last', other=0)
        tmp6 = tmp4 - tmp5
        tmp7 = tmp6 * tmp6
        tmp8 = tl.where(tmp2, tmp7, 0)
        _tmp9 = tl.where(rmask & xmask, _tmp9 + tmp8, _tmp9)
    tmp9 = tl.sum(_tmp9, 1)[:, None]
    tl.store(out_ptr0 + x3, tmp9, xmask)

def get_args():
    arg_0 = rand_strided((128, 192, 112, 112), (2408448, 12544, 112, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((1, 192, 1, 1), (192, 1, 192, 192), device='cuda:0', dtype=torch.float32)
    arg_2 = rand_strided((1, 192, 1, 1, 13), (2496, 1, 2496, 2496, 192), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1, arg_2,

def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_red_fused__native_batch_norm_legit_functional_16.run(*args, 2496, 123511, grid=grid(2496), stream=stream0)

def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_red_fused__native_batch_norm_legit_functional_16.benchmark_all_configs(*args, 2496, 123511, grid=grid(2496))

if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/r2/cr2l4iahxzbm4xda53lhc7nels7ttppa4wfso67ohv6lwu4ut2ei.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => add_17, add_19, convert_element_type_12, mul_24, mul_25, mul_26, rsqrt_3, squeeze_10, var_mean_3
triton_per_fused__native_batch_norm_legit_functional_17 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import persistent_reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@persistent_reduction(
    size_hints=[256, 16],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*fp32', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: 'i32', 6: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5), equal_to_1=())]}
)
@triton.jit
def triton_per_fused__native_batch_norm_legit_functional_17(in_ptr0, in_ptr1, out_ptr0, out_ptr1, out_ptr2, xnumel, rnumel, XBLOCK : tl.constexpr):
    xnumel = 192
    rnumel = 13
    RBLOCK: tl.constexpr = 16
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rindex = tl.arange(0, RBLOCK)[None, :]
    rmask = rindex < rnumel
    r1 = rindex
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0 + (192*r1)), rmask & xmask, other=0)
    tmp13 = tl.load(in_ptr1 + (x0), xmask)
    tmp2 = tl.where(rmask & xmask, tmp0, 0)
    tmp3 = tl.sum(tmp2, 1)[:, None]
    tmp4 = 1605632.0
    tmp5 = tmp3 / tmp4
    tmp6 = 1e-05
    tmp7 = tmp5 + tmp6
    tmp8 = tl.math.rsqrt(tmp7)
    tmp9 = 1.0000006228081046
    tmp10 = tmp5 * tmp9
    tmp11 = 0.1
    tmp12 = tmp10 * tmp11
    tmp14 = 0.9
    tmp15 = tmp13 * tmp14
    tmp16 = tmp12 + tmp15
    tl.store(out_ptr1 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp8, xmask)
    tl.store(out_ptr2 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp16, xmask)
    tl.store(out_ptr0 + x0, tmp3, xmask)

def get_args():
    arg_0 = rand_strided((1, 192, 1, 1, 13), (2496, 1, 2496, 2496, 192), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((192,), (1,), device='cuda:0', dtype=torch.float32)
    arg_2 = rand_strided((1, 192, 1, 1), (192, 1, 192, 192), device='cuda:0', dtype=torch.float32)
    arg_3 = rand_strided((192,), (1,), device='cuda:0', dtype=torch.float32)
    arg_4 = rand_strided((192,), (1,), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1, arg_2, arg_3, arg_4,

def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_per_fused__native_batch_norm_legit_functional_17.run(*args, 192, 13, grid=grid(192), stream=stream0)

def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_per_fused__native_batch_norm_legit_functional_17.benchmark_all_configs(*args, 192, 13, grid=grid(192))

if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/7k/c7k35ubu25j3ryiaxnuo2fjojvhqvy67tox3wtaa4knc5l4vzqby.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.relu, aten.threshold_backward
# aten._native_batch_norm_legit_functional => add_17, add_20, convert_element_type_12, convert_element_type_13, mul_21, mul_27, rsqrt_3, sub_3, var_mean_3
# aten.relu => relu_2
# aten.threshold_backward => le_4
triton_poi_fused__native_batch_norm_legit_functional_relu_threshold_backward_18 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[536870912], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp16', 6: '*i1', 7: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7), equal_to_1=())]})
@triton.jit
def triton_poi_fused__native_batch_norm_legit_functional_relu_threshold_backward_18(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, out_ptr0, out_ptr1, xnumel, XBLOCK : tl.constexpr):
    xnumel = 308281344
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x3 = xindex
    x1 = (xindex // 12544) % 192
    tmp0 = tl.load(in_ptr0 + (x3), None).to(tl.float32)
    tmp2 = tl.load(in_ptr1 + (x1), None)
    tmp4 = tl.load(in_ptr2 + (x1), None)
    tmp11 = tl.load(in_ptr3 + (x1), None)
    tmp13 = tl.load(in_ptr4 + (x1), None)
    tmp1 = tmp0.to(tl.float32)
    tmp3 = tmp1 - tmp2
    tmp5 = 1605632.0
    tmp6 = tmp4 / tmp5
    tmp7 = 1e-05
    tmp8 = tmp6 + tmp7
    tmp9 = tl.math.rsqrt(tmp8)
    tmp10 = tmp3 * tmp9
    tmp12 = tmp10 * tmp11
    tmp14 = tmp12 + tmp13
    tmp15 = tmp14.to(tl.float32)
    tmp16 = tl.where(0 != 0, 0, tl.where(0 > tmp15, 0, tmp15))
    tmp17 = 0.0
    tmp18 = tmp16 <= tmp17
    tl.store(out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp16, None)
    tl.store(out_ptr1 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp18, None)

def get_args():
    arg_0 = rand_strided((128, 192, 112, 112), (2408448, 12544, 112, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((1, 192, 1, 1), (192, 1, 192, 192), device='cuda:0', dtype=torch.float32)
    arg_2 = rand_strided((1, 192, 1, 1), (192, 1, 192, 192), device='cuda:0', dtype=torch.float32)
    arg_3 = rand_strided((192,), (1,), device='cuda:0', dtype=torch.float32)
    arg_4 = rand_strided((192,), (1,), device='cuda:0', dtype=torch.float32)
    arg_5 = rand_strided((128, 192, 112, 112), (2408448, 12544, 112, 1), device='cuda:0', dtype=torch.float16)
    arg_6 = rand_strided((128, 192, 112, 112), (2408448, 12544, 112, 1), device='cuda:0', dtype=torch.bool)
    return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6,

def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__native_batch_norm_legit_functional_relu_threshold_backward_18.run(*args, 308281344, grid=grid(308281344), stream=stream0)

def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__native_batch_norm_legit_functional_relu_threshold_backward_18.benchmark_all_configs(*args, 308281344, grid=grid(308281344))

if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
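# Note: the kernel above additionally materializes the ReLU backward mask
# (aten.threshold_backward => le_4): alongside the fp16 activation it stores a
# boolean tensor marking where the output is <= 0, which the backward pass
# uses to zero the incoming gradients.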
# kernel path: /tmp/torchinductor_shunting/db/cdbk64vwqs4pzrtrb762fxgfklb7q573zm3neisxuhpcneczshvk.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_14
triton_poi_fused__to_copy_19 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[1024], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_19(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 576
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)

def get_args():
    arg_0 = rand_strided((64, 1, 3, 3), (9, 9, 3, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((64, 1, 3, 3), (9, 9, 3, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,

def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_19.run(*args, 576, grid=grid(576), stream=stream0)

def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_19.benchmark_all_configs(*args, 576, grid=grid(576))

if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/fq/cfq5oxsb4jr7dbzocvxxr77nzkg2jk4k3rsmhwdpzkoisxiavav5.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_15
triton_poi_fused__to_copy_20 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[2048], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_20(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 1600
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((64, 1, 5, 5), (25, 25, 5, 1), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((64, 1, 5, 5), (25, 25, 5, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_20.run(*args, 1600, grid=grid(1600), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_20.benchmark_all_configs(*args, 1600, grid=grid(1600))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/fo/cfoj5fwz53k7xkfcu6z676snb5mum4wtda3diyiuxkkhrnlissnf.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_16
triton_poi_fused__to_copy_21 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4096], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_21(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 3136
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((64, 1, 7, 7), (49, 49, 7, 1), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((64, 1, 7, 7), (49, 49, 7, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_21.run(*args, 3136, grid=grid(3136), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_21.benchmark_all_configs(*args, 3136, grid=grid(3136))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/ye/cyesscl4aqq7rzyjhll5ppohr4h424gcitducugsmauai5wnvgy5.py
# Original ATen: aten.cat
# aten.cat => cat_1
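# One branch of a channel concat: the (128, 64, 56, 56) fp16 input is copied into its
# 64-channel slice of a preallocated (128, 192, 56, 56) output, which is why the
# destination advances by 602112 (= 192*3136) per image while the source stays
# contiguous. Roughly, with a hypothetical channel offset c0 baked into out_ptr0:
#   out[:, c0:c0+64] = inp
# The later cat kernels (cat_2, cat_3) follow the same pattern at other widths.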
triton_poi_fused_cat_22 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[33554432], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_cat_22(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 25690112
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x2 = xindex
x0 = xindex % 200704
x1 = (xindex // 200704)
tmp0 = tl.load(in_ptr0 + (x2), None).to(tl.float32)
tl.store(out_ptr0 + (x0 + (602112*x1) + tl.zeros([XBLOCK], tl.int32)), tmp0, None)
def get_args():
arg_0 = rand_strided((128, 64, 56, 56), (200704, 3136, 56, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 64, 56, 56), (602112, 3136, 56, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_cat_22.run(*args, 25690112, grid=grid(25690112), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_cat_22.benchmark_all_configs(*args, 25690112, grid=grid(25690112))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/zm/czmjlzsknic276jj2ulhzekdx7suavkcesp75xqpnfbntqprzsrx.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => convert_element_type_17, var_mean_4
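# Stage 1 of a two-stage batch-norm mean reduction over N*H*W = 128*56*56 = 401408
# elements per channel. The reduction is split 4 ways (xnumel = 768 = 192 channels *
# 4 splits, rnumel = 100352 = 401408/4); each program accumulates a partial sum into
# the (1, 192, 1, 1, 4) buffer, and the next kernel combines the 4 partials.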
triton_red_fused__native_batch_norm_legit_functional_23 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@reduction(
size_hints=[1024, 131072],
reduction_hint=ReductionHint.INNER,
filename=__file__,
meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32', 3: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_23(in_ptr0, out_ptr0, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
xnumel = 768
rnumel = 100352
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
xmask = xindex < xnumel
rbase = tl.arange(0, RBLOCK)[None, :]
x0 = xindex % 192
x1 = (xindex // 192)
_tmp2 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
x3 = xindex
for roffset in range(0, rnumel, RBLOCK):
rindex = roffset + rbase
rmask = rindex < rnumel
r2 = rindex
tmp0 = tl.load(in_ptr0 + ((3136*x0) + (602112*(r2 // 3136)) + (19267584*x1) + (r2 % 3136)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
_tmp2 = tl.where(rmask & xmask, _tmp2 + tmp1, _tmp2)
tmp2 = tl.sum(_tmp2, 1)[:, None]
tl.store(out_ptr0 + x3, tmp2, xmask)
def get_args():
arg_0 = rand_strided((128, 192, 56, 56), (602112, 3136, 56, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((1, 192, 1, 1, 4), (768, 1, 768, 768, 192), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_red_fused__native_batch_norm_legit_functional_23.run(*args, 768, 100352, grid=grid(768), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_red_fused__native_batch_norm_legit_functional_23.benchmark_all_configs(*args, 768, 100352, grid=grid(768))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/zp/czpkjhwvqvlhuxsldhiwsf77gmyfxqbtysyyhp7lberdnzehm42l.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => add_23, convert_element_type_17, mul_29, mul_30, var_mean_4
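# Stage 2: combine the 4 partial sums per channel, divide by 401408 to get the batch
# mean (written in place through in_out_ptr0), and fold it into the running mean with
# momentum 0.1, matching BatchNorm's update rule:
#   running_mean = 0.1 * batch_mean + 0.9 * running_mean
# The same combine/update pair repeats below for the 40- and 120-channel layers.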
triton_per_fused__native_batch_norm_legit_functional_24 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import persistent_reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@persistent_reduction(
size_hints=[256, 4],
reduction_hint=ReductionHint.INNER,
filename=__file__,
meta={'signature': {0: '*fp32', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: 'i32', 5: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4), equal_to_1=())]}
)
@triton.jit
def triton_per_fused__native_batch_norm_legit_functional_24(in_out_ptr0, in_ptr0, in_ptr1, out_ptr0, xnumel, rnumel, XBLOCK : tl.constexpr):
xnumel = 192
rnumel = 4
RBLOCK: tl.constexpr = 4
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
xmask = xindex < xnumel
rindex = tl.arange(0, RBLOCK)[None, :]
rmask = rindex < rnumel
r1 = rindex
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0 + (192*r1)), rmask & xmask, other=0)
tmp8 = tl.load(in_ptr1 + (x0), xmask)
tmp2 = tl.where(rmask & xmask, tmp0, 0)
tmp3 = tl.sum(tmp2, 1)[:, None]
tmp4 = 401408.0
tmp5 = tmp3 / tmp4
tmp6 = 0.1
tmp7 = tmp5 * tmp6
tmp9 = 0.9
tmp10 = tmp8 * tmp9
tmp11 = tmp7 + tmp10
tl.store(in_out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp5, xmask)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp11, xmask)
def get_args():
arg_0 = rand_strided((1, 192, 1, 1), (192, 1, 192, 192), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((1, 192, 1, 1, 4), (768, 1, 768, 768, 192), device='cuda:0', dtype=torch.float32)
arg_2 = rand_strided((192,), (1,), device='cuda:0', dtype=torch.float32)
arg_3 = rand_strided((192,), (1,), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1, arg_2, arg_3,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_per_fused__native_batch_norm_legit_functional_24.run(*args, 192, 4, grid=grid(192), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_per_fused__native_batch_norm_legit_functional_24.benchmark_all_configs(*args, 192, 4, grid=grid(192))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/4o/c4ohubk3q7bkigrpxxijrzzlhs5bqi6qcn3fxomltgipjmowqgd2.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => convert_element_type_17, var_mean_4
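# Variance counterpart of the stage-1 reduction above: with the batch mean now
# available in in_ptr1, each program accumulates a partial sum of (x - mean)^2 per
# (channel, split) pair into the same (1, 192, 1, 1, 4) layout.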
triton_red_fused__native_batch_norm_legit_functional_25 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@reduction(
size_hints=[1024, 131072],
reduction_hint=ReductionHint.INNER,
filename=__file__,
meta={'signature': {0: '*fp16', 1: '*fp32', 2: '*fp32', 3: 'i32', 4: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_25(in_ptr0, in_ptr1, out_ptr0, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
xnumel = 768
rnumel = 100352
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
xmask = xindex < xnumel
rbase = tl.arange(0, RBLOCK)[None, :]
x0 = xindex % 192
x1 = (xindex // 192)
tmp2 = tl.load(in_ptr1 + (x0), xmask)
_tmp5 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
x3 = xindex
for roffset in range(0, rnumel, RBLOCK):
rindex = roffset + rbase
rmask = rindex < rnumel
r2 = rindex
tmp0 = tl.load(in_ptr0 + ((3136*x0) + (602112*(r2 // 3136)) + (19267584*x1) + (r2 % 3136)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
tmp3 = tmp1 - tmp2
tmp4 = tmp3 * tmp3
_tmp5 = tl.where(rmask & xmask, _tmp5 + tmp4, _tmp5)
tmp5 = tl.sum(_tmp5, 1)[:, None]
tl.store(out_ptr0 + x3, tmp5, xmask)
def get_args():
arg_0 = rand_strided((128, 192, 56, 56), (602112, 3136, 56, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((1, 192, 1, 1), (192, 1, 192, 192), device='cuda:0', dtype=torch.float32)
arg_2 = rand_strided((1, 192, 1, 1, 4), (768, 1, 768, 768, 192), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1, arg_2,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_red_fused__native_batch_norm_legit_functional_25.run(*args, 768, 100352, grid=grid(768), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_red_fused__native_batch_norm_legit_functional_25.benchmark_all_configs(*args, 768, 100352, grid=grid(768))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/kw/ckworjll2g6m5sbalty5hh27jhwuiyj3iqglt67oatdxz54d3igy.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => add_22, add_24, convert_element_type_17, mul_31, mul_32, mul_33, rsqrt_4, squeeze_13, var_mean_4
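# Stage 2 for the variance: tmp3 is the total sum of squared deviations, so
# tmp5 = tmp3 / 401408 is the biased batch variance and out_ptr1 receives
# rsqrt(var + 1e-05) for normalization. The running variance tracks the unbiased
# estimate, hence the Bessel correction 1.0000024912370735 = n/(n-1) with n = 401408:
#   running_var = 0.1 * (var * n/(n-1)) + 0.9 * running_var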
triton_per_fused__native_batch_norm_legit_functional_26 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import persistent_reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@persistent_reduction(
size_hints=[256, 4],
reduction_hint=ReductionHint.INNER,
filename=__file__,
meta={'signature': {0: '*fp32', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: 'i32', 6: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5), equal_to_1=())]}
)
@triton.jit
def triton_per_fused__native_batch_norm_legit_functional_26(in_ptr0, in_ptr1, out_ptr0, out_ptr1, out_ptr2, xnumel, rnumel, XBLOCK : tl.constexpr):
xnumel = 192
rnumel = 4
RBLOCK: tl.constexpr = 4
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
xmask = xindex < xnumel
rindex = tl.arange(0, RBLOCK)[None, :]
rmask = rindex < rnumel
r1 = rindex
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0 + (192*r1)), rmask & xmask, other=0)
tmp13 = tl.load(in_ptr1 + (x0), xmask)
tmp2 = tl.where(rmask & xmask, tmp0, 0)
tmp3 = tl.sum(tmp2, 1)[:, None]
tmp4 = 401408.0
tmp5 = tmp3 / tmp4
tmp6 = 1e-05
tmp7 = tmp5 + tmp6
tmp8 = tl.math.rsqrt(tmp7)
tmp9 = 1.0000024912370735
tmp10 = tmp5 * tmp9
tmp11 = 0.1
tmp12 = tmp10 * tmp11
tmp14 = 0.9
tmp15 = tmp13 * tmp14
tmp16 = tmp12 + tmp15
tl.store(out_ptr1 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp8, xmask)
tl.store(out_ptr2 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp16, xmask)
tl.store(out_ptr0 + x0, tmp3, xmask)
def get_args():
arg_0 = rand_strided((1, 192, 1, 1, 4), (768, 1, 768, 768, 192), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((192,), (1,), device='cuda:0', dtype=torch.float32)
arg_2 = rand_strided((1, 192, 1, 1), (192, 1, 192, 192), device='cuda:0', dtype=torch.float32)
arg_3 = rand_strided((192,), (1,), device='cuda:0', dtype=torch.float32)
arg_4 = rand_strided((192,), (1,), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1, arg_2, arg_3, arg_4,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_per_fused__native_batch_norm_legit_functional_26.run(*args, 192, 4, grid=grid(192), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_per_fused__native_batch_norm_legit_functional_26.benchmark_all_configs(*args, 192, 4, grid=grid(192))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/rb/crbx6irdjcatfhmyhdhxdmxoun65x4lirw3usotlxxknzb3rnzo3.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.relu, aten.threshold_backward
# aten._native_batch_norm_legit_functional => add_22, add_25, convert_element_type_17, convert_element_type_18, mul_28, mul_34, rsqrt_4, sub_4, var_mean_4
# aten.relu => relu_3
# aten.threshold_backward => le_3
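# The fused elementwise tail of the batch norm: normalize with the per-channel mean
# and rsqrt(sum_sq/401408 + eps), apply the affine weight/bias, take the ReLU, and
# also store the (output <= 0) bool mask that threshold_backward later uses to gate
# gradients. The tl.where(0 != 0, 0, tl.where(0 > x, 0, x)) idiom is simply the
# generated spelling of max(x, 0).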
triton_poi_fused__native_batch_norm_legit_functional_relu_threshold_backward_27 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[134217728], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp16', 6: '*i1', 7: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7), equal_to_1=())]})
@triton.jit
def triton_poi_fused__native_batch_norm_legit_functional_relu_threshold_backward_27(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, out_ptr0, out_ptr1, xnumel, XBLOCK : tl.constexpr):
xnumel = 77070336
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x3 = xindex
x1 = (xindex // 3136) % 192
tmp0 = tl.load(in_ptr0 + (x3), None).to(tl.float32)
tmp2 = tl.load(in_ptr1 + (x1), None)
tmp4 = tl.load(in_ptr2 + (x1), None)
tmp11 = tl.load(in_ptr3 + (x1), None)
tmp13 = tl.load(in_ptr4 + (x1), None)
tmp1 = tmp0.to(tl.float32)
tmp3 = tmp1 - tmp2
tmp5 = 401408.0
tmp6 = tmp4 / tmp5
tmp7 = 1e-05
tmp8 = tmp6 + tmp7
tmp9 = tl.math.rsqrt(tmp8)
tmp10 = tmp3 * tmp9
tmp12 = tmp10 * tmp11
tmp14 = tmp12 + tmp13
tmp15 = tmp14.to(tl.float32)
tmp16 = tl.where(0 != 0, 0, tl.where(0 > tmp15, 0, tmp15))
tmp17 = 0.0
tmp18 = tmp16 <= tmp17
tl.store(out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp16, None)
tl.store(out_ptr1 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp18, None)
def get_args():
arg_0 = rand_strided((128, 192, 56, 56), (602112, 3136, 56, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((1, 192, 1, 1), (192, 1, 192, 192), device='cuda:0', dtype=torch.float32)
arg_2 = rand_strided((1, 192, 1, 1), (192, 1, 192, 192), device='cuda:0', dtype=torch.float32)
arg_3 = rand_strided((192,), (1,), device='cuda:0', dtype=torch.float32)
arg_4 = rand_strided((192,), (1,), device='cuda:0', dtype=torch.float32)
arg_5 = rand_strided((128, 192, 56, 56), (602112, 3136, 56, 1), device='cuda:0', dtype=torch.float16)
arg_6 = rand_strided((128, 192, 56, 56), (602112, 3136, 56, 1), device='cuda:0', dtype=torch.bool)
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__native_batch_norm_legit_functional_relu_threshold_backward_27.run(*args, 77070336, grid=grid(77070336), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__native_batch_norm_legit_functional_relu_threshold_backward_27.benchmark_all_configs(*args, 77070336, grid=grid(77070336))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/yr/cyrrsvkikbwrkovp7p3kp54xrx3kv5k3cuej452rtgigaetgnkzf.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_19
triton_poi_fused__to_copy_28 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[2048], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_28(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 1920
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((20, 96, 1, 1), (96, 1, 1, 1), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((20, 96, 1, 1), (96, 1, 1, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_28.run(*args, 1920, grid=grid(1920), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_28.benchmark_all_configs(*args, 1920, grid=grid(1920))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/lx/clxgqhjnos6ucboracnh3kmrmn5mwuc3sgkffihrftfhuwl64wg2.py
# Original ATen: aten.cat
# aten.cat => cat_2
triton_poi_fused_cat_29 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[8388608], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_cat_29(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 8028160
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x2 = xindex
x0 = xindex % 62720
x1 = (xindex // 62720)
tmp0 = tl.load(in_ptr0 + (x2), None).to(tl.float32)
tl.store(out_ptr0 + (x0 + (125440*x1) + tl.zeros([XBLOCK], tl.int32)), tmp0, None)
def get_args():
arg_0 = rand_strided((128, 20, 56, 56), (62720, 3136, 56, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 20, 56, 56), (125440, 3136, 56, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_cat_29.run(*args, 8028160, grid=grid(8028160), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_cat_29.benchmark_all_configs(*args, 8028160, grid=grid(8028160))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/e3/ce3edr4giy7ygyubb445dsh6wavenlh7yhdxr5odchwzb5blm45m.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => convert_element_type_21, var_mean_5
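# Same stage-1 mean reduction, but here the 401408 elements per channel are split 13
# ways over 40 channels (xnumel = 520 = 40*13, rnumel = 30878). Since 13*30878 =
# 401414 slightly overshoots, each element index is bounds-checked against 401408
# (tmp2) before being accumulated.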
triton_red_fused__native_batch_norm_legit_functional_30 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@reduction(
size_hints=[1024, 32768],
reduction_hint=ReductionHint.INNER,
filename=__file__,
meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32', 3: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_30(in_ptr0, out_ptr0, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
xnumel = 520
rnumel = 30878
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
xmask = xindex < xnumel
rbase = tl.arange(0, RBLOCK)[None, :]
x1 = (xindex // 40)
x0 = xindex % 40
_tmp6 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
x3 = xindex
for roffset in range(0, rnumel, RBLOCK):
rindex = roffset + rbase
rmask = rindex < rnumel
r2 = rindex
tmp0 = r2 + (30878*x1)
tmp1 = 401408
tmp2 = tmp0 < tmp1
tmp3 = tl.load(in_ptr0 + ((3136*x0) + (125440*(((r2 + (30878*x1)) // 3136) % 128)) + ((r2 + (30878*x1)) % 3136) + tl.zeros([XBLOCK, RBLOCK], tl.int32)), rmask & tmp2 & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp4 = tmp3.to(tl.float32)
tmp5 = tl.where(tmp2, tmp4, 0)
_tmp6 = tl.where(rmask & xmask, _tmp6 + tmp5, _tmp6)
tmp6 = tl.sum(_tmp6, 1)[:, None]
tl.store(out_ptr0 + x3, tmp6, xmask)
def get_args():
arg_0 = rand_strided((128, 40, 56, 56), (125440, 3136, 56, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((1, 40, 1, 1, 13), (520, 1, 520, 520, 40), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_red_fused__native_batch_norm_legit_functional_30.run(*args, 520, 30878, grid=grid(520), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_red_fused__native_batch_norm_legit_functional_30.benchmark_all_configs(*args, 520, 30878, grid=grid(520))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/5l/c5ldagjb25qlt4w6hnm2s5f7x7ddcw2evdosdg3pvhtpoduxelhs.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => add_28, convert_element_type_21, mul_36, mul_37, var_mean_5
triton_per_fused__native_batch_norm_legit_functional_31 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import persistent_reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@persistent_reduction(
size_hints=[64, 16],
reduction_hint=ReductionHint.INNER,
filename=__file__,
meta={'signature': {0: '*fp32', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: 'i32', 5: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3), equal_to_1=())]}
)
@triton.jit
def triton_per_fused__native_batch_norm_legit_functional_31(in_out_ptr0, in_ptr0, in_ptr1, out_ptr0, xnumel, rnumel, XBLOCK : tl.constexpr):
xnumel = 40
rnumel = 13
RBLOCK: tl.constexpr = 16
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
xmask = xindex < xnumel
rindex = tl.arange(0, RBLOCK)[None, :]
rmask = rindex < rnumel
r1 = rindex
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0 + (40*r1)), rmask & xmask, other=0)
tmp8 = tl.load(in_ptr1 + (x0), xmask)
tmp2 = tl.where(rmask & xmask, tmp0, 0)
tmp3 = tl.sum(tmp2, 1)[:, None]
tmp4 = 401408.0
tmp5 = tmp3 / tmp4
tmp6 = 0.1
tmp7 = tmp5 * tmp6
tmp9 = 0.9
tmp10 = tmp8 * tmp9
tmp11 = tmp7 + tmp10
tl.store(in_out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp5, xmask)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp11, xmask)
def get_args():
arg_0 = rand_strided((1, 40, 1, 1), (40, 1, 40, 40), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((1, 40, 1, 1, 13), (520, 1, 520, 520, 40), device='cuda:0', dtype=torch.float32)
arg_2 = rand_strided((40,), (1,), device='cuda:0', dtype=torch.float32)
arg_3 = rand_strided((40,), (1,), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1, arg_2, arg_3,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_per_fused__native_batch_norm_legit_functional_31.run(*args, 40, 13, grid=grid(40), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_per_fused__native_batch_norm_legit_functional_31.benchmark_all_configs(*args, 40, 13, grid=grid(40))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/ko/ckomyuegfrpkii72qwrquhfy6eh53az7jeqam3ghl4ajcmcw2h3l.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => convert_element_type_21, var_mean_5
triton_red_fused__native_batch_norm_legit_functional_32 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@reduction(
size_hints=[1024, 32768],
reduction_hint=ReductionHint.INNER,
filename=__file__,
meta={'signature': {0: '*fp16', 1: '*fp32', 2: '*fp32', 3: 'i32', 4: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_32(in_ptr0, in_ptr1, out_ptr0, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
xnumel = 520
rnumel = 30878
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
xmask = xindex < xnumel
rbase = tl.arange(0, RBLOCK)[None, :]
x1 = (xindex // 40)
x0 = xindex % 40
_tmp9 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
x3 = xindex
for roffset in range(0, rnumel, RBLOCK):
rindex = roffset + rbase
rmask = rindex < rnumel
r2 = rindex
tmp0 = r2 + (30878*x1)
tmp1 = 401408
tmp2 = tmp0 < tmp1
tmp3 = tl.load(in_ptr0 + ((3136*x0) + (125440*(((r2 + (30878*x1)) // 3136) % 128)) + ((r2 + (30878*x1)) % 3136) + tl.zeros([XBLOCK, RBLOCK], tl.int32)), rmask & tmp2 & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp4 = tmp3.to(tl.float32)
tmp5 = tl.load(in_ptr1 + (x0 + tl.zeros([XBLOCK, RBLOCK], tl.int32)), rmask & tmp2 & xmask, eviction_policy='evict_last', other=0)
tmp6 = tmp4 - tmp5
tmp7 = tmp6 * tmp6
tmp8 = tl.where(tmp2, tmp7, 0)
_tmp9 = tl.where(rmask & xmask, _tmp9 + tmp8, _tmp9)
tmp9 = tl.sum(_tmp9, 1)[:, None]
tl.store(out_ptr0 + x3, tmp9, xmask)
def get_args():
arg_0 = rand_strided((128, 40, 56, 56), (125440, 3136, 56, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((1, 40, 1, 1), (40, 1, 40, 40), device='cuda:0', dtype=torch.float32)
arg_2 = rand_strided((1, 40, 1, 1, 13), (520, 1, 520, 520, 40), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1, arg_2,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_red_fused__native_batch_norm_legit_functional_32.run(*args, 520, 30878, grid=grid(520), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_red_fused__native_batch_norm_legit_functional_32.benchmark_all_configs(*args, 520, 30878, grid=grid(520))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/5p/c5pcmnlklk6a6ee2vm3ta2iounjyiftuynwfzbts7sayeuto4jvn.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => add_27, add_29, convert_element_type_21, mul_38, mul_39, mul_40, rsqrt_5, squeeze_16, var_mean_5
triton_per_fused__native_batch_norm_legit_functional_33 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import persistent_reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@persistent_reduction(
size_hints=[64, 16],
reduction_hint=ReductionHint.INNER,
filename=__file__,
meta={'signature': {0: '*fp32', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: 'i32', 6: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4), equal_to_1=())]}
)
@triton.jit
def triton_per_fused__native_batch_norm_legit_functional_33(in_ptr0, in_ptr1, out_ptr0, out_ptr1, out_ptr2, xnumel, rnumel, XBLOCK : tl.constexpr):
xnumel = 40
rnumel = 13
RBLOCK: tl.constexpr = 16
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
xmask = xindex < xnumel
rindex = tl.arange(0, RBLOCK)[None, :]
rmask = rindex < rnumel
r1 = rindex
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0 + (40*r1)), rmask & xmask, other=0)
tmp13 = tl.load(in_ptr1 + (x0), xmask)
tmp2 = tl.where(rmask & xmask, tmp0, 0)
tmp3 = tl.sum(tmp2, 1)[:, None]
tmp4 = 401408.0
tmp5 = tmp3 / tmp4
tmp6 = 1e-05
tmp7 = tmp5 + tmp6
tmp8 = tl.math.rsqrt(tmp7)
tmp9 = 1.0000024912370735
tmp10 = tmp5 * tmp9
tmp11 = 0.1
tmp12 = tmp10 * tmp11
tmp14 = 0.9
tmp15 = tmp13 * tmp14
tmp16 = tmp12 + tmp15
tl.store(out_ptr1 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp8, xmask)
tl.store(out_ptr2 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp16, xmask)
tl.store(out_ptr0 + x0, tmp3, xmask)
def get_args():
arg_0 = rand_strided((1, 40, 1, 1, 13), (520, 1, 520, 520, 40), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((40,), (1,), device='cuda:0', dtype=torch.float32)
arg_2 = rand_strided((1, 40, 1, 1), (40, 1, 40, 40), device='cuda:0', dtype=torch.float32)
arg_3 = rand_strided((40,), (1,), device='cuda:0', dtype=torch.float32)
arg_4 = rand_strided((40,), (1,), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1, arg_2, arg_3, arg_4,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_per_fused__native_batch_norm_legit_functional_33.run(*args, 40, 13, grid=grid(40), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_per_fused__native_batch_norm_legit_functional_33.benchmark_all_configs(*args, 40, 13, grid=grid(40))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/pf/cpfbdly4ax5zztymk42pnezqf7ys2im4sb437kjrqjpf4x2jbotn.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => add_27, add_30, convert_element_type_21, convert_element_type_22, mul_35, mul_41, rsqrt_5, sub_5, var_mean_5
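# Same normalize-and-affine tail as before, but with no fused ReLU or backward mask:
# this batch norm's fp16 output is left un-activated.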
triton_poi_fused__native_batch_norm_legit_functional_34 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[16777216], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp16', 6: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6), equal_to_1=())]})
@triton.jit
def triton_poi_fused__native_batch_norm_legit_functional_34(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 16056320
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x3 = xindex
x1 = (xindex // 3136) % 40
tmp0 = tl.load(in_ptr0 + (x3), None).to(tl.float32)
tmp2 = tl.load(in_ptr1 + (x1), None)
tmp4 = tl.load(in_ptr2 + (x1), None)
tmp11 = tl.load(in_ptr3 + (x1), None)
tmp13 = tl.load(in_ptr4 + (x1), None)
tmp1 = tmp0.to(tl.float32)
tmp3 = tmp1 - tmp2
tmp5 = 401408.0
tmp6 = tmp4 / tmp5
tmp7 = 1e-05
tmp8 = tmp6 + tmp7
tmp9 = tl.math.rsqrt(tmp8)
tmp10 = tmp3 * tmp9
tmp12 = tmp10 * tmp11
tmp14 = tmp12 + tmp13
tmp15 = tmp14.to(tl.float32)
tl.store(out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp15, None)
def get_args():
arg_0 = rand_strided((128, 40, 56, 56), (125440, 3136, 56, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((1, 40, 1, 1), (40, 1, 40, 40), device='cuda:0', dtype=torch.float32)
arg_2 = rand_strided((1, 40, 1, 1), (40, 1, 40, 40), device='cuda:0', dtype=torch.float32)
arg_3 = rand_strided((40,), (1,), device='cuda:0', dtype=torch.float32)
arg_4 = rand_strided((40,), (1,), device='cuda:0', dtype=torch.float32)
arg_5 = rand_strided((128, 40, 56, 56), (125440, 3136, 56, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__native_batch_norm_legit_functional_34.run(*args, 16056320, grid=grid(16056320), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__native_batch_norm_legit_functional_34.benchmark_all_configs(*args, 16056320, grid=grid(16056320))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/jt/cjtdu5nq2b37l5fyi5c7vdk3m4aqhqeqbsw3e5ifo4u74lo5bgaq.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_23
triton_poi_fused__to_copy_35 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[2048], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_35(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 1200
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((60, 20, 1, 1), (20, 1, 1, 1), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((60, 20, 1, 1), (20, 1, 1, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_35.run(*args, 1200, grid=grid(1200), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_35.benchmark_all_configs(*args, 1200, grid=grid(1200))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/wj/cwjg3r37ehsowju7qerrsqlothtrltxyyuq5mhnulbp7zor3ocw5.py
# Original ATen: aten.cat
# aten.cat => cat_3
triton_poi_fused_cat_36 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[33554432], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_cat_36(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 24084480
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x2 = xindex
x0 = xindex % 188160
x1 = (xindex // 188160)
tmp0 = tl.load(in_ptr0 + (x2), None).to(tl.float32)
tl.store(out_ptr0 + (x0 + (376320*x1) + tl.zeros([XBLOCK], tl.int32)), tmp0, None)
def get_args():
arg_0 = rand_strided((128, 60, 56, 56), (188160, 3136, 56, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 60, 56, 56), (376320, 3136, 56, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_cat_36.run(*args, 24084480, grid=grid(24084480), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_cat_36.benchmark_all_configs(*args, 24084480, grid=grid(24084480))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/2c/c2cmgfhoarotsw2klaplncsqdxdxrbc3je4bm4idg4672cm2hbzh.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => convert_element_type_25, var_mean_6
triton_red_fused__native_batch_norm_legit_functional_37 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@reduction(
size_hints=[512, 131072],
reduction_hint=ReductionHint.INNER,
filename=__file__,
meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32', 3: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_37(in_ptr0, out_ptr0, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
xnumel = 480
rnumel = 100352
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
xmask = xindex < xnumel
rbase = tl.arange(0, RBLOCK)[None, :]
x0 = xindex % 120
x1 = (xindex // 120)
_tmp2 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
x3 = xindex
for roffset in range(0, rnumel, RBLOCK):
rindex = roffset + rbase
rmask = rindex < rnumel
r2 = rindex
tmp0 = tl.load(in_ptr0 + ((3136*x0) + (376320*(r2 // 3136)) + (12042240*x1) + (r2 % 3136)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
_tmp2 = tl.where(rmask & xmask, _tmp2 + tmp1, _tmp2)
tmp2 = tl.sum(_tmp2, 1)[:, None]
tl.store(out_ptr0 + x3, tmp2, xmask)
def get_args():
arg_0 = rand_strided((128, 120, 56, 56), (376320, 3136, 56, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((1, 120, 1, 1, 4), (480, 1, 480, 480, 120), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_red_fused__native_batch_norm_legit_functional_37.run(*args, 480, 100352, grid=grid(480), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_red_fused__native_batch_norm_legit_functional_37.benchmark_all_configs(*args, 480, 100352, grid=grid(480))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/xq/cxq23fscob3jbjdcsao7sicyvado5nsvykscbw7syq3dh73swjgu.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => add_33, convert_element_type_25, mul_43, mul_44, var_mean_6
triton_per_fused__native_batch_norm_legit_functional_38 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import persistent_reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@persistent_reduction(
size_hints=[128, 4],
reduction_hint=ReductionHint.INNER,
filename=__file__,
meta={'signature': {0: '*fp32', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: 'i32', 5: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3), equal_to_1=())]}
)
@triton.jit
def triton_per_fused__native_batch_norm_legit_functional_38(in_out_ptr0, in_ptr0, in_ptr1, out_ptr0, xnumel, rnumel, XBLOCK : tl.constexpr):
xnumel = 120
rnumel = 4
RBLOCK: tl.constexpr = 4
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
xmask = xindex < xnumel
rindex = tl.arange(0, RBLOCK)[None, :]
rmask = rindex < rnumel
r1 = rindex
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0 + (120*r1)), rmask & xmask, other=0)
tmp8 = tl.load(in_ptr1 + (x0), xmask)
tmp2 = tl.where(rmask & xmask, tmp0, 0)
tmp3 = tl.sum(tmp2, 1)[:, None]
tmp4 = 401408.0
tmp5 = tmp3 / tmp4
tmp6 = 0.1
tmp7 = tmp5 * tmp6
tmp9 = 0.9
tmp10 = tmp8 * tmp9
tmp11 = tmp7 + tmp10
tl.store(in_out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp5, xmask)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp11, xmask)
def get_args():
arg_0 = rand_strided((1, 120, 1, 1), (120, 1, 120, 120), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((1, 120, 1, 1, 4), (480, 1, 480, 480, 120), device='cuda:0', dtype=torch.float32)
arg_2 = rand_strided((120,), (1,), device='cuda:0', dtype=torch.float32)
arg_3 = rand_strided((120,), (1,), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1, arg_2, arg_3,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_per_fused__native_batch_norm_legit_functional_38.run(*args, 120, 4, grid=grid(120), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_per_fused__native_batch_norm_legit_functional_38.benchmark_all_configs(*args, 120, 4, grid=grid(120))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/wg/cwgir4s67j2t35f66rtqguw4skzrnzaazh3rvukzvjln6bwfquwz.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => convert_element_type_25, var_mean_6
triton_red_fused__native_batch_norm_legit_functional_39 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@reduction(
size_hints=[512, 131072],
reduction_hint=ReductionHint.INNER,
filename=__file__,
meta={'signature': {0: '*fp16', 1: '*fp32', 2: '*fp32', 3: 'i32', 4: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_39(in_ptr0, in_ptr1, out_ptr0, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
xnumel = 480
rnumel = 100352
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
xmask = xindex < xnumel
rbase = tl.arange(0, RBLOCK)[None, :]
x0 = xindex % 120
x1 = (xindex // 120)
tmp2 = tl.load(in_ptr1 + (x0), xmask)
_tmp5 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
x3 = xindex
for roffset in range(0, rnumel, RBLOCK):
rindex = roffset + rbase
rmask = rindex < rnumel
r2 = rindex
tmp0 = tl.load(in_ptr0 + ((3136*x0) + (376320*(r2 // 3136)) + (12042240*x1) + (r2 % 3136)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
tmp3 = tmp1 - tmp2
tmp4 = tmp3 * tmp3
_tmp5 = tl.where(rmask & xmask, _tmp5 + tmp4, _tmp5)
tmp5 = tl.sum(_tmp5, 1)[:, None]
tl.store(out_ptr0 + x3, tmp5, xmask)
def get_args():
arg_0 = rand_strided((128, 120, 56, 56), (376320, 3136, 56, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((1, 120, 1, 1), (120, 1, 120, 120), device='cuda:0', dtype=torch.float32)
arg_2 = rand_strided((1, 120, 1, 1, 4), (480, 1, 480, 480, 120), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1, arg_2,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_red_fused__native_batch_norm_legit_functional_39.run(*args, 480, 100352, grid=grid(480), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_red_fused__native_batch_norm_legit_functional_39.benchmark_all_configs(*args, 480, 100352, grid=grid(480))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/bd/cbdjjrt2eltvde4ay5q7ftrif7dzdrcgxsonoef3wv6a6n446fg5.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => add_32, add_34, convert_element_type_25, mul_45, mul_46, mul_47, rsqrt_6, squeeze_19, var_mean_6
triton_per_fused__native_batch_norm_legit_functional_40 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import persistent_reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@persistent_reduction(
size_hints=[128, 4],
reduction_hint=ReductionHint.INNER,
filename=__file__,
meta={'signature': {0: '*fp32', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: 'i32', 6: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4), equal_to_1=())]}
)
@triton.jit
def triton_per_fused__native_batch_norm_legit_functional_40(in_ptr0, in_ptr1, out_ptr0, out_ptr1, out_ptr2, xnumel, rnumel, XBLOCK : tl.constexpr):
xnumel = 120
rnumel = 4
RBLOCK: tl.constexpr = 4
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
xmask = xindex < xnumel
rindex = tl.arange(0, RBLOCK)[None, :]
rmask = rindex < rnumel
r1 = rindex
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0 + (120*r1)), rmask & xmask, other=0)
tmp13 = tl.load(in_ptr1 + (x0), xmask)
tmp2 = tl.where(rmask & xmask, tmp0, 0)
tmp3 = tl.sum(tmp2, 1)[:, None]
tmp4 = 401408.0  # reduction count: 128 images * 56 * 56 spatial positions
tmp5 = tmp3 / tmp4
tmp6 = 1e-05
tmp7 = tmp5 + tmp6
tmp8 = tl.math.rsqrt(tmp7)
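# 401408 / 401407: Bessel correction, so the running variance tracks the
# unbiased estimate while normalization uses the biased one.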
tmp9 = 1.0000024912370735
tmp10 = tmp5 * tmp9
tmp11 = 0.1
tmp12 = tmp10 * tmp11
tmp14 = 0.9
tmp15 = tmp13 * tmp14
tmp16 = tmp12 + tmp15
tl.store(out_ptr1 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp8, xmask)
tl.store(out_ptr2 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp16, xmask)
tl.store(out_ptr0 + x0, tmp3, xmask)
def get_args():
arg_0 = rand_strided((1, 120, 1, 1, 4), (480, 1, 480, 480, 120), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((120,), (1,), device='cuda:0', dtype=torch.float32)
arg_2 = rand_strided((1, 120, 1, 1), (120, 1, 120, 120), device='cuda:0', dtype=torch.float32)
arg_3 = rand_strided((120,), (1,), device='cuda:0', dtype=torch.float32)
arg_4 = rand_strided((120,), (1,), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1, arg_2, arg_3, arg_4,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_per_fused__native_batch_norm_legit_functional_40.run(*args, 120, 4, grid=grid(120), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_per_fused__native_batch_norm_legit_functional_40.benchmark_all_configs(*args, 120, 4, grid=grid(120))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/ff/cffbiktpj5b36houxciovjfe3g7ji5w5ibbvgjpjdr6m6dxnqiod.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.relu
# aten._native_batch_norm_legit_functional => add_32, add_35, convert_element_type_25, convert_element_type_26, mul_42, mul_48, rsqrt_6, sub_6, var_mean_6
# aten.relu => relu_4
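# Pointwise BN epilogue: normalize with the mean and summed squared deviations
# produced above, apply the affine scale/shift, cast back to fp16, and fuse the
# ReLU into the same pass.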
triton_poi_fused__native_batch_norm_legit_functional_relu_41 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[67108864], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp16', 6: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6), equal_to_1=())]})
@triton.jit
def triton_poi_fused__native_batch_norm_legit_functional_relu_41(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 48168960
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x3 = xindex
x1 = (xindex // 3136) % 120
tmp0 = tl.load(in_ptr0 + (x3), None).to(tl.float32)
tmp2 = tl.load(in_ptr1 + (x1), None)
tmp4 = tl.load(in_ptr2 + (x1), None)
tmp11 = tl.load(in_ptr3 + (x1), None)
tmp13 = tl.load(in_ptr4 + (x1), None)
tmp1 = tmp0.to(tl.float32)
tmp3 = tmp1 - tmp2
tmp5 = 401408.0
tmp6 = tmp4 / tmp5
tmp7 = 1e-05
tmp8 = tmp6 + tmp7
tmp9 = tl.math.rsqrt(tmp8)
tmp10 = tmp3 * tmp9
tmp12 = tmp10 * tmp11
tmp14 = tmp12 + tmp13
tmp15 = tmp14.to(tl.float32)
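# Generated ReLU: the outer branch is dead (0 != 0 is always false), so this
# reduces to max(tmp15, 0).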
tmp16 = tl.where(0 != 0, 0, tl.where(0 > tmp15, 0, tmp15))
tl.store(out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp16, None)
def get_args():
arg_0 = rand_strided((128, 120, 56, 56), (376320, 3136, 56, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((1, 120, 1, 1), (120, 1, 120, 120), device='cuda:0', dtype=torch.float32)
arg_2 = rand_strided((1, 120, 1, 1), (120, 1, 120, 120), device='cuda:0', dtype=torch.float32)
arg_3 = rand_strided((120,), (1,), device='cuda:0', dtype=torch.float32)
arg_4 = rand_strided((120,), (1,), device='cuda:0', dtype=torch.float32)
arg_5 = rand_strided((128, 120, 56, 56), (376320, 3136, 56, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__native_batch_norm_legit_functional_relu_41.run(*args, 48168960, grid=grid(48168960), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__native_batch_norm_legit_functional_relu_41.benchmark_all_configs(*args, 48168960, grid=grid(48168960))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/6y/c6ynio5pu6llppeohanybeuevixt5rgpdgxw24ylgggmbgzjacrt.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_27
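# fp32 -> fp16 cast of the (120, 1, 3, 3) depthwise-conv weight so the conv can
# run in half precision. The later _to_copy kernels in this section repeat the
# same pattern for other weight shapes.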
triton_poi_fused__to_copy_42 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[2048], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_42(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 1080
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((120, 1, 3, 3), (9, 9, 3, 1), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((120, 1, 3, 3), (9, 9, 3, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_42.run(*args, 1080, grid=grid(1080), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_42.benchmark_all_configs(*args, 1080, grid=grid(1080))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/p3/cp3ysrijswutvs72wkv4ypuaradvydgclkx6swvsdiolmqtydjck.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.relu, aten.threshold_backward
# aten._native_batch_norm_legit_functional => add_37, add_40, convert_element_type_28, convert_element_type_29, mul_49, mul_55, rsqrt_7, sub_7, var_mean_7
# aten.relu => relu_5
# aten.threshold_backward => le_1
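# Same BN + ReLU epilogue as kernel 41, additionally writing the (output <= 0)
# mask (out_ptr1) that threshold_backward consumes in the backward graph.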
triton_poi_fused__native_batch_norm_legit_functional_relu_threshold_backward_43 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[67108864], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp16', 6: '*i1', 7: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7), equal_to_1=())]})
@triton.jit
def triton_poi_fused__native_batch_norm_legit_functional_relu_threshold_backward_43(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, out_ptr0, out_ptr1, xnumel, XBLOCK : tl.constexpr):
xnumel = 48168960
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x3 = xindex
x1 = (xindex // 3136) % 120
tmp0 = tl.load(in_ptr0 + (x3), None).to(tl.float32)
tmp2 = tl.load(in_ptr1 + (x1), None)
tmp4 = tl.load(in_ptr2 + (x1), None)
tmp11 = tl.load(in_ptr3 + (x1), None)
tmp13 = tl.load(in_ptr4 + (x1), None)
tmp1 = tmp0.to(tl.float32)
tmp3 = tmp1 - tmp2
tmp5 = 401408.0
tmp6 = tmp4 / tmp5
tmp7 = 1e-05
tmp8 = tmp6 + tmp7
tmp9 = tl.math.rsqrt(tmp8)
tmp10 = tmp3 * tmp9
tmp12 = tmp10 * tmp11
tmp14 = tmp12 + tmp13
tmp15 = tmp14.to(tl.float32)
tmp16 = tl.where(0 != 0, 0, tl.where(0 > tmp15, 0, tmp15))
tmp17 = 0.0
tmp18 = tmp16 <= tmp17
tl.store(out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp16, None)
tl.store(out_ptr1 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp18, None)
def get_args():
arg_0 = rand_strided((128, 120, 56, 56), (376320, 3136, 56, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((1, 120, 1, 1), (120, 1, 120, 120), device='cuda:0', dtype=torch.float32)
arg_2 = rand_strided((1, 120, 1, 1), (120, 1, 120, 120), device='cuda:0', dtype=torch.float32)
arg_3 = rand_strided((120,), (1,), device='cuda:0', dtype=torch.float32)
arg_4 = rand_strided((120,), (1,), device='cuda:0', dtype=torch.float32)
arg_5 = rand_strided((128, 120, 56, 56), (376320, 3136, 56, 1), device='cuda:0', dtype=torch.float16)
arg_6 = rand_strided((128, 120, 56, 56), (376320, 3136, 56, 1), device='cuda:0', dtype=torch.bool)
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__native_batch_norm_legit_functional_relu_threshold_backward_43.run(*args, 48168960, grid=grid(48168960), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__native_batch_norm_legit_functional_relu_threshold_backward_43.benchmark_all_configs(*args, 48168960, grid=grid(48168960))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/pe/cpea2gtzlc2kbhqh3fj4j3lqcnuxq36vpr46nhhxpvtecppidlkn.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_30
triton_poi_fused__to_copy_44 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[2048], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_44(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 1200
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((20, 60, 1, 1), (60, 1, 1, 1), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((20, 60, 1, 1), (60, 1, 1, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_44.run(*args, 1200, grid=grid(1200), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_44.benchmark_all_configs(*args, 1200, grid=grid(1200))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/nm/cnmhyqb22hbicwkw553bytawf6yat3hnc5u573hgyyuw5vi2ohc2.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.add
# aten._native_batch_norm_legit_functional => add_42, add_45, convert_element_type_32, convert_element_type_33, mul_56, mul_62, rsqrt_8, sub_8, var_mean_8
# aten.add => add_46
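# BN epilogue without activation, fused with the skip-connection add (in_ptr5)
# that closes this block.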
triton_poi_fused__native_batch_norm_legit_functional_add_45 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[16777216], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp16', 6: '*fp16', 7: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7), equal_to_1=())]})
@triton.jit
def triton_poi_fused__native_batch_norm_legit_functional_add_45(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 16056320
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x3 = xindex
x1 = (xindex // 3136) % 40
tmp0 = tl.load(in_ptr0 + (x3), None).to(tl.float32)
tmp2 = tl.load(in_ptr1 + (x1), None)
tmp4 = tl.load(in_ptr2 + (x1), None)
tmp11 = tl.load(in_ptr3 + (x1), None)
tmp13 = tl.load(in_ptr4 + (x1), None)
tmp16 = tl.load(in_ptr5 + (x3), None).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
tmp3 = tmp1 - tmp2
tmp5 = 401408.0
tmp6 = tmp4 / tmp5
tmp7 = 1e-05
tmp8 = tmp6 + tmp7
tmp9 = tl.math.rsqrt(tmp8)
tmp10 = tmp3 * tmp9
tmp12 = tmp10 * tmp11
tmp14 = tmp12 + tmp13
tmp15 = tmp14.to(tl.float32)
tmp17 = tmp15 + tmp16
tl.store(out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp17, None)
def get_args():
arg_0 = rand_strided((128, 40, 56, 56), (125440, 3136, 56, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((1, 40, 1, 1), (40, 1, 40, 40), device='cuda:0', dtype=torch.float32)
arg_2 = rand_strided((1, 40, 1, 1), (40, 1, 40, 40), device='cuda:0', dtype=torch.float32)
arg_3 = rand_strided((40,), (1,), device='cuda:0', dtype=torch.float32)
arg_4 = rand_strided((40,), (1,), device='cuda:0', dtype=torch.float32)
arg_5 = rand_strided((128, 40, 56, 56), (125440, 3136, 56, 1), device='cuda:0', dtype=torch.float16)
arg_6 = rand_strided((128, 40, 56, 56), (125440, 3136, 56, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__native_batch_norm_legit_functional_add_45.run(*args, 16056320, grid=grid(16056320), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__native_batch_norm_legit_functional_add_45.benchmark_all_configs(*args, 16056320, grid=grid(16056320))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/d6/cd6uk6iyi5u7h7amewzycxkyxriny4cknptmukvtlpxi4jwlerzf.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_34
triton_poi_fused__to_copy_46 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[16384], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_46(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 9600
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((240, 40, 1, 1), (40, 1, 1, 1), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((240, 40, 1, 1), (40, 1, 1, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_46.run(*args, 9600, grid=grid(9600), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_46.benchmark_all_configs(*args, 9600, grid=grid(9600))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/p3/cp3dwfqdvlpfdvfvxrfm2lik34mpytuijuqsswm7efzikdczro7d.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => add_48, add_49, add_50, convert_element_type_35, mul_64, mul_65, mul_66, mul_67, mul_68, rsqrt_9, squeeze_28, var_mean_9
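# Single-kernel BN statistics over 128*3136 = 401408 elements per channel: the
# first reduction loop produces the mean and running-mean update, the second the
# variance, rsqrt(var + eps), and running-variance update. A rough eager-mode
# sketch of the same computation (assumed equivalent, momentum 0.1, N = 401408):
#   mean = x.float().mean(dim=(0, 2, 3))
#   var = x.float().var(dim=(0, 2, 3), unbiased=False)
#   rstd = (var + 1e-05).rsqrt()
#   running_mean = 0.1 * mean + 0.9 * running_mean
#   running_var = 0.1 * var * (N / (N - 1)) + 0.9 * running_var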
triton_red_fused__native_batch_norm_legit_functional_47 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@reduction(
size_hints=[256, 524288],
reduction_hint=ReductionHint.INNER,
filename=__file__,
meta={'signature': {0: '*fp32', 1: '*fp16', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: 'i32', 9: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_47(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, out_ptr0, out_ptr1, out_ptr2, out_ptr3, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
xnumel = 240
rnumel = 401408
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
xmask = xindex < xnumel
rbase = tl.arange(0, RBLOCK)[None, :]
x0 = xindex
_tmp2 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
for roffset in range(0, rnumel, RBLOCK):
rindex = roffset + rbase
rmask = rindex < rnumel
r1 = rindex % 3136
r2 = (rindex // 3136)
tmp0 = tl.load(in_ptr0 + (r1 + (3136*x0) + (752640*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
_tmp2 = tl.where(rmask & xmask, _tmp2 + tmp1, _tmp2)
tmp2 = tl.sum(_tmp2, 1)[:, None]
tmp7 = tl.load(in_ptr1 + (x0), xmask)
tmp3 = 401408.0
tmp4 = tmp2 / tmp3
tmp5 = 0.1
tmp6 = tmp4 * tmp5
tmp8 = 0.9
tmp9 = tmp7 * tmp8
tmp10 = tmp6 + tmp9
tl.store(in_out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp4, xmask)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp10, xmask)
_tmp15 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
for roffset in range(0, rnumel, RBLOCK):
rindex = roffset + rbase
rmask = rindex < rnumel
r1 = rindex % 3136
r2 = (rindex // 3136)
tmp11 = tl.load(in_ptr0 + (r1 + (3136*x0) + (752640*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp12 = tmp11.to(tl.float32)
tmp13 = tmp12 - tmp4
tmp14 = tmp13 * tmp13
_tmp15 = tl.where(rmask & xmask, _tmp15 + tmp14, _tmp15)
tmp15 = tl.sum(_tmp15, 1)[:, None]
tl.store(out_ptr1 + x0, tmp15, xmask)
tmp25 = tl.load(in_ptr2 + (x0), xmask)
tmp16 = 401408.0
tmp17 = tmp15 / tmp16
tmp18 = 1e-05
tmp19 = tmp17 + tmp18
tmp20 = tl.math.rsqrt(tmp19)
tmp21 = 1.0000024912370735
tmp22 = tmp17 * tmp21
tmp23 = 0.1
tmp24 = tmp22 * tmp23
tmp26 = 0.9
tmp27 = tmp25 * tmp26
tmp28 = tmp24 + tmp27
tl.store(out_ptr2 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp20, xmask)
tl.store(out_ptr3 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp28, xmask)
def get_args():
arg_0 = rand_strided((1, 240, 1, 1), (240, 1, 240, 240), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((128, 240, 56, 56), (752640, 3136, 56, 1), device='cuda:0', dtype=torch.float16)
arg_2 = rand_strided((240,), (1,), device='cuda:0', dtype=torch.float32)
arg_3 = rand_strided((240,), (1,), device='cuda:0', dtype=torch.float32)
arg_4 = rand_strided((240,), (1,), device='cuda:0', dtype=torch.float32)
arg_5 = rand_strided((1, 240, 1, 1), (240, 1, 240, 240), device='cuda:0', dtype=torch.float32)
arg_6 = rand_strided((240,), (1,), device='cuda:0', dtype=torch.float32)
arg_7 = rand_strided((240,), (1,), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_red_fused__native_batch_norm_legit_functional_47.run(*args, 240, 401408, grid=grid(240), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_red_fused__native_batch_norm_legit_functional_47.benchmark_all_configs(*args, 240, 401408, grid=grid(240))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/bn/cbnwrcxmnryz6qd2vzr5qnxb2q3vepfvhupqz2awsu4zkworteie.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.add, aten.clone, aten.fill, aten.mul, aten.sigmoid, aten.sub
# aten._native_batch_norm_legit_functional => add_48, add_51, convert_element_type_35, convert_element_type_36, mul_63, mul_69, rsqrt_9, sub_9, var_mean_9
# aten.add => add_379
# aten.clone => clone
# aten.fill => full_like_47
# aten.mul => mul_1107, mul_1108
# aten.sigmoid => sigmoid_111
# aten.sub => sub_313
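# BN forward plus backward prep: out_ptr0 receives the fp16 BN output, and
# out_ptr1 caches sigmoid(y) * (1 + y * (1 - sigmoid(y))), the derivative of
# SiLU(y) = y * sigmoid(y), for reuse in the backward graph (hence the
# mul/sub/fill ops listed above).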
triton_poi_fused__native_batch_norm_legit_functional_add_clone_fill_mul_sigmoid_sub_48 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[134217728], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp16', 6: '*fp16', 7: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7), equal_to_1=())]})
@triton.jit
def triton_poi_fused__native_batch_norm_legit_functional_add_clone_fill_mul_sigmoid_sub_48(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, out_ptr0, out_ptr1, xnumel, XBLOCK : tl.constexpr):
xnumel = 96337920
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x3 = xindex
x1 = (xindex // 3136) % 240
tmp0 = tl.load(in_ptr0 + (x3), None).to(tl.float32)
tmp2 = tl.load(in_ptr1 + (x1), None)
tmp4 = tl.load(in_ptr2 + (x1), None)
tmp11 = tl.load(in_ptr3 + (x1), None)
tmp13 = tl.load(in_ptr4 + (x1), None)
tmp1 = tmp0.to(tl.float32)
tmp3 = tmp1 - tmp2
tmp5 = 401408.0
tmp6 = tmp4 / tmp5
tmp7 = 1e-05
tmp8 = tmp6 + tmp7
tmp9 = tl.math.rsqrt(tmp8)
tmp10 = tmp3 * tmp9
tmp12 = tmp10 * tmp11
tmp14 = tmp12 + tmp13
tmp15 = tmp14.to(tl.float32)
tmp16 = tl.sigmoid(tmp15)
tmp17 = 1.0
tmp18 = tmp17 - tmp16
tmp19 = tmp15 * tmp18
tmp20 = tmp19 + tmp17
tmp21 = tmp16 * tmp20
tl.store(out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp15, None)
tl.store(out_ptr1 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp21, None)
def get_args():
arg_0 = rand_strided((128, 240, 56, 56), (752640, 3136, 56, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((1, 240, 1, 1), (240, 1, 240, 240), device='cuda:0', dtype=torch.float32)
arg_2 = rand_strided((1, 240, 1, 1), (240, 1, 240, 240), device='cuda:0', dtype=torch.float32)
arg_3 = rand_strided((240,), (1,), device='cuda:0', dtype=torch.float32)
arg_4 = rand_strided((240,), (1,), device='cuda:0', dtype=torch.float32)
arg_5 = rand_strided((128, 240, 56, 56), (752640, 3136, 56, 1), device='cuda:0', dtype=torch.float16)
arg_6 = rand_strided((128, 240, 56, 56), (752640, 3136, 56, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__native_batch_norm_legit_functional_add_clone_fill_mul_sigmoid_sub_48.run(*args, 96337920, grid=grid(96337920), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__native_batch_norm_legit_functional_add_clone_fill_mul_sigmoid_sub_48.benchmark_all_configs(*args, 96337920, grid=grid(96337920))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/tr/ctrlm7wxwgyqu6igydzh2qy3dgfyvkbxyf6ed5dmhs5ac43nkywg.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_39
triton_poi_fused__to_copy_49 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[1024], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_49(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 540
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((60, 1, 3, 3), (9, 9, 3, 1), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((60, 1, 3, 3), (9, 9, 3, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_49.run(*args, 540, grid=grid(540), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_49.benchmark_all_configs(*args, 540, grid=grid(540))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/62/c62vp5bgz7fqdu2x3oxuvrnwwvzztezlji4zzzxqegy77wuqckqo.py
# Original ATen: aten.split_with_sizes
# aten.split_with_sizes => getitem_52
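# Despite the split_with_sizes name, this fuses the channel split with SiLU:
# it reads channels [0, 60) of the (128, 240, 56, 56) activation (188160 =
# 60*3136 elements per image) and writes x * sigmoid(x) for that slice.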
triton_poi_fused_split_with_sizes_50 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[33554432], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_split_with_sizes_50(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 24084480
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex % 188160
x1 = (xindex // 188160)
x2 = xindex
tmp0 = tl.load(in_ptr0 + (x0 + (752640*x1)), None).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
tmp2 = tl.sigmoid(tmp1)
tmp3 = tmp1 * tmp2
tmp4 = tmp3.to(tl.float32)
tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp4, None)
def get_args():
arg_0 = rand_strided((128, 240, 56, 56), (752640, 3136, 56, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 60, 56, 56), (188160, 3136, 56, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_split_with_sizes_50.run(*args, 24084480, grid=grid(24084480), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_split_with_sizes_50.benchmark_all_configs(*args, 24084480, grid=grid(24084480))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/5k/c5k2puyq6twbirobmys3yfadgomf23ufkkjsshzn37upfh5g7erl.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_40
triton_poi_fused__to_copy_51 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[2048], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_51(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 1500
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((60, 1, 5, 5), (25, 25, 5, 1), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((60, 1, 5, 5), (25, 25, 5, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_51.run(*args, 1500, grid=grid(1500), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_51.benchmark_all_configs(*args, 1500, grid=grid(1500))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/qq/cqqmootvvozbchj4wyxl2cmg4iebq6hs2v2baytcez4tdix7pywu.py
# Original ATen: aten.split_with_sizes
# aten.split_with_sizes => getitem_57
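# Same split + SiLU pattern as kernel 50, reading channels [60, 120) (offset 188160).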
triton_poi_fused_split_with_sizes_52 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[33554432], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_split_with_sizes_52(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 24084480
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex % 188160
x1 = (xindex // 188160)
x2 = xindex
tmp0 = tl.load(in_ptr0 + (188160 + x0 + (752640*x1)), None).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
tmp2 = tl.sigmoid(tmp1)
tmp3 = tmp1 * tmp2
tmp4 = tmp3.to(tl.float32)
tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp4, None)
def get_args():
arg_0 = rand_strided((128, 240, 56, 56), (752640, 3136, 56, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 60, 56, 56), (188160, 3136, 56, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_split_with_sizes_52.run(*args, 24084480, grid=grid(24084480), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_split_with_sizes_52.benchmark_all_configs(*args, 24084480, grid=grid(24084480))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/hn/chn7fi5d6pb7lt74jnnjikqswlxeu3nbv3mpu3aidnxkwdaw4fmq.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_41
triton_poi_fused__to_copy_53 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4096], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_53(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 2940
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((60, 1, 7, 7), (49, 49, 7, 1), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((60, 1, 7, 7), (49, 49, 7, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_53.run(*args, 2940, grid=grid(2940), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_53.benchmark_all_configs(*args, 2940, grid=grid(2940))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/7d/c7d6lruapdg7jujcavuay77hawcbzn2rk3uqz6opu54tj6rxd3dc.py
# Original ATen: aten.split_with_sizes
# aten.split_with_sizes => getitem_62
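# Same split + SiLU pattern, reading channels [120, 180) (offset 376320).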
triton_poi_fused_split_with_sizes_54 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[33554432], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_split_with_sizes_54(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 24084480
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex % 188160
x1 = (xindex // 188160)
x2 = xindex
tmp0 = tl.load(in_ptr0 + (376320 + x0 + (752640*x1)), None).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
tmp2 = tl.sigmoid(tmp1)
tmp3 = tmp1 * tmp2
tmp4 = tmp3.to(tl.float32)
tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp4, None)
def get_args():
arg_0 = rand_strided((128, 240, 56, 56), (752640, 3136, 56, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 60, 56, 56), (188160, 3136, 56, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_split_with_sizes_54.run(*args, 24084480, grid=grid(24084480), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_split_with_sizes_54.benchmark_all_configs(*args, 24084480, grid=grid(24084480))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/ih/cihdtphelycdnabls3bvxcjqdwyigw5444zlobgvmbwdwzxq7zdy.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_42
triton_poi_fused__to_copy_55 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[8192], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_55(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 4860
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((60, 1, 9, 9), (81, 81, 9, 1), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((60, 1, 9, 9), (81, 81, 9, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_55.run(*args, 4860, grid=grid(4860), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_55.benchmark_all_configs(*args, 4860, grid=grid(4860))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/oq/coqx7tfdhgzsd3kzw3ueczpktm7gxo5z67ciilgh24ffay7herao.py
# Original ATen: aten.split_with_sizes
# aten.split_with_sizes => getitem_67
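# Same split + SiLU pattern, reading channels [180, 240) (offset 564480).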
triton_poi_fused_split_with_sizes_56 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[33554432], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_split_with_sizes_56(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 24084480
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex % 188160
x1 = (xindex // 188160)
x2 = xindex
tmp0 = tl.load(in_ptr0 + (564480 + x0 + (752640*x1)), None).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
tmp2 = tl.sigmoid(tmp1)
tmp3 = tmp1 * tmp2
tmp4 = tmp3.to(tl.float32)
tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp4, None)
def get_args():
arg_0 = rand_strided((128, 240, 56, 56), (752640, 3136, 56, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 60, 56, 56), (188160, 3136, 56, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_split_with_sizes_56.run(*args, 24084480, grid=grid(24084480), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_split_with_sizes_56.benchmark_all_configs(*args, 24084480, grid=grid(24084480))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/n6/cn6lwm2ninzuud2fuu3omrklbeewaxqtkv3v6gdaxmikjvyoi525.py
# Original ATen: aten.cat
# aten.cat => cat_5
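# One branch of the channel concat: copies a contiguous (128, 60, 28, 28) tensor
# into the first 60-channel slice of the (128, 240, 28, 28) output buffer;
# presumably the other concat inputs are written by matching kernels.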
triton_poi_fused_cat_57 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[8388608], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_cat_57(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 6021120
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x2 = xindex
x0 = xindex % 47040
x1 = (xindex // 47040)
tmp0 = tl.load(in_ptr0 + (x2), None).to(tl.float32)
tl.store(out_ptr0 + (x0 + (188160*x1) + tl.zeros([XBLOCK], tl.int32)), tmp0, None)
def get_args():
arg_0 = rand_strided((128, 60, 28, 28), (47040, 784, 28, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 60, 28, 28), (188160, 784, 28, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_cat_57.run(*args, 6021120, grid=grid(6021120), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_cat_57.benchmark_all_configs(*args, 6021120, grid=grid(6021120))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/tm/ctmfxwnpwpks3pe3wok54jftmfykafzupq5buoissf32gfcrrq6d.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => add_53, add_54, add_55, convert_element_type_43, mul_72, mul_73, mul_74, mul_75, mul_76, rsqrt_10, squeeze_31, var_mean_10
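# Same single-kernel BN statistics as kernel 47, here at 28x28 resolution
# (100352 = 128*784 elements per channel).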
triton_red_fused__native_batch_norm_legit_functional_58 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@reduction(
size_hints=[256, 131072],
reduction_hint=ReductionHint.INNER,
filename=__file__,
meta={'signature': {0: '*fp32', 1: '*fp16', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: 'i32', 9: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_58(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, out_ptr0, out_ptr1, out_ptr2, out_ptr3, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
xnumel = 240
rnumel = 100352
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
xmask = xindex < xnumel
rbase = tl.arange(0, RBLOCK)[None, :]
x0 = xindex
_tmp2 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
for roffset in range(0, rnumel, RBLOCK):
rindex = roffset + rbase
rmask = rindex < rnumel
r1 = rindex % 784
r2 = (rindex // 784)
tmp0 = tl.load(in_ptr0 + (r1 + (784*x0) + (188160*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
_tmp2 = tl.where(rmask & xmask, _tmp2 + tmp1, _tmp2)
tmp2 = tl.sum(_tmp2, 1)[:, None]
tmp7 = tl.load(in_ptr1 + (x0), xmask)
tmp3 = 100352.0
tmp4 = tmp2 / tmp3
tmp5 = 0.1
tmp6 = tmp4 * tmp5
tmp8 = 0.9
tmp9 = tmp7 * tmp8
tmp10 = tmp6 + tmp9
tl.store(in_out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp4, xmask)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp10, xmask)
_tmp15 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
for roffset in range(0, rnumel, RBLOCK):
rindex = roffset + rbase
rmask = rindex < rnumel
r1 = rindex % 784
r2 = (rindex // 784)
tmp11 = tl.load(in_ptr0 + (r1 + (784*x0) + (188160*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp12 = tmp11.to(tl.float32)
tmp13 = tmp12 - tmp4
tmp14 = tmp13 * tmp13
_tmp15 = tl.where(rmask & xmask, _tmp15 + tmp14, _tmp15)
tmp15 = tl.sum(_tmp15, 1)[:, None]
tl.store(out_ptr1 + x0, tmp15, xmask)
tmp25 = tl.load(in_ptr2 + (x0), xmask)
tmp16 = 100352.0
tmp17 = tmp15 / tmp16
tmp18 = 1e-05
tmp19 = tmp17 + tmp18
tmp20 = tl.math.rsqrt(tmp19)
tmp21 = 1.00000996502277  # 100352 / 100351, unbiased-variance correction
tmp22 = tmp17 * tmp21
tmp23 = 0.1
tmp24 = tmp22 * tmp23
tmp26 = 0.9
tmp27 = tmp25 * tmp26
tmp28 = tmp24 + tmp27
tl.store(out_ptr2 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp20, xmask)
tl.store(out_ptr3 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp28, xmask)
def get_args():
arg_0 = rand_strided((1, 240, 1, 1), (240, 1, 240, 240), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((128, 240, 28, 28), (188160, 784, 28, 1), device='cuda:0', dtype=torch.float16)
arg_2 = rand_strided((240,), (1,), device='cuda:0', dtype=torch.float32)
arg_3 = rand_strided((240,), (1,), device='cuda:0', dtype=torch.float32)
arg_4 = rand_strided((240,), (1,), device='cuda:0', dtype=torch.float32)
arg_5 = rand_strided((1, 240, 1, 1), (240, 1, 240, 240), device='cuda:0', dtype=torch.float32)
arg_6 = rand_strided((240,), (1,), device='cuda:0', dtype=torch.float32)
arg_7 = rand_strided((240,), (1,), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_red_fused__native_batch_norm_legit_functional_58.run(*args, 240, 100352, grid=grid(240), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_red_fused__native_batch_norm_legit_functional_58.benchmark_all_configs(*args, 240, 100352, grid=grid(240))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/tf/ctf52h6qjyfnbwd7tzysnt6o3dvwaedcis63siipfnbkcfkukqa7.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.mean, aten.silu
# aten._native_batch_norm_legit_functional => add_53, add_56, convert_element_type_43, convert_element_type_44, mul_71, mul_77, rsqrt_10, sub_10, var_mean_10
# aten.mean => mean
# aten.silu => convert_element_type_45, convert_element_type_46, mul_78, sigmoid_1
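# Persistent reduction fusing three steps per (image, channel) pair: the BN
# epilogue (fp16 result stored to out_ptr0), SiLU, and a mean over the 784
# spatial positions, yielding a (128, 240, 1, 1) pooled tensor (presumably the
# squeeze input of a squeeze-and-excitation block). Roughly, assuming eager
# equivalence:
#   se_in = F.silu(bn_out).mean(dim=(2, 3), keepdim=True)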
triton_per_fused__native_batch_norm_legit_functional_mean_silu_59 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import persistent_reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@persistent_reduction(
size_hints=[32768, 1024],
reduction_hint=ReductionHint.INNER,
filename=__file__,
meta={'signature': {0: '*fp16', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp16', 6: '*fp16', 7: 'i32', 8: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8), equal_to_1=())]}
)
@triton.jit
def triton_per_fused__native_batch_norm_legit_functional_mean_silu_59(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, out_ptr0, out_ptr2, xnumel, rnumel, XBLOCK : tl.constexpr):
xnumel = 30720
rnumel = 784
RBLOCK: tl.constexpr = 1024
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
xmask = xindex < xnumel
rindex = tl.arange(0, RBLOCK)[None, :]
rmask = rindex < rnumel
r2 = rindex
x3 = xindex
x0 = xindex % 240
tmp0 = tl.load(in_ptr0 + (r2 + (784*x3)), rmask, other=0).to(tl.float32)
tmp2 = tl.load(in_ptr1 + (x0), None)
tmp4 = tl.load(in_ptr2 + (x0), None)
tmp11 = tl.load(in_ptr3 + (x0), None)
tmp13 = tl.load(in_ptr4 + (x0), None)
tmp1 = tmp0.to(tl.float32)
tmp3 = tmp1 - tmp2
tmp5 = 100352.0
tmp6 = tmp4 / tmp5
tmp7 = 1e-05
tmp8 = tmp6 + tmp7
tmp9 = tl.math.rsqrt(tmp8)
tmp10 = tmp3 * tmp9
tmp12 = tmp10 * tmp11
tmp14 = tmp12 + tmp13
tmp15 = tmp14.to(tl.float32)
tmp16 = tmp15.to(tl.float32)
tmp17 = tl.sigmoid(tmp16)
tmp18 = tmp16 * tmp17
tmp19 = tmp18.to(tl.float32)
tmp20 = tmp19.to(tl.float32)
tmp22 = tl.where(rmask, tmp20, 0)
tmp23 = tl.sum(tmp22, 1)[:, None]
tmp24 = 784.0
tmp25 = tmp23 / tmp24
tmp26 = tmp25.to(tl.float32)
tl.store(out_ptr0 + (r2 + (784*x3) + tl.zeros([XBLOCK, RBLOCK], tl.int32)), tmp15, rmask)
tl.store(out_ptr2 + (x3 + tl.zeros([XBLOCK, 1], tl.int32)), tmp26, None)
def get_args():
arg_0 = rand_strided((128, 240, 28, 28), (188160, 784, 28, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((1, 240, 1, 1), (240, 1, 240, 240), device='cuda:0', dtype=torch.float32)
arg_2 = rand_strided((1, 240, 1, 1), (240, 1, 240, 240), device='cuda:0', dtype=torch.float32)
arg_3 = rand_strided((240,), (1,), device='cuda:0', dtype=torch.float32)
arg_4 = rand_strided((240,), (1,), device='cuda:0', dtype=torch.float32)
arg_5 = rand_strided((128, 240, 28, 28), (188160, 784, 28, 1), device='cuda:0', dtype=torch.float16)
arg_6 = rand_strided((128, 240, 1, 1), (240, 1, 1, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_per_fused__native_batch_norm_legit_functional_mean_silu_59.run(*args, 30720, 784, grid=grid(30720), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_per_fused__native_batch_norm_legit_functional_mean_silu_59.benchmark_all_configs(*args, 30720, 784, grid=grid(30720))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
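# The kernel above fuses batch-norm normalization, SiLU, and the per-(N, C)
# spatial mean that feeds the squeeze-and-excite block; the paired
# `.to(tl.float32)` casts mirror the convert_element_type fp16 <-> fp32 nodes
# listed in the fused-op comment. A minimal eager-mode sketch (helper name
# and signature are ours, not part of the generated module), equivalent up
# to fp16 rounding:
def _ref_bn_silu_mean_59(x, mean, var_sum, weight, bias, eps=1e-05):
    # x: (N, C, H, W) fp16; mean / var_sum broadcast as (1, C, 1, 1) fp32;
    # weight / bias: (C,) fp32.
    n, c, h, w = x.shape
    inv_std = torch.rsqrt(var_sum / (n * h * w) + eps)   # rsqrt(var + eps)
    y = ((x.float() - mean) * inv_std * weight.view(1, c, 1, 1)
         + bias.view(1, c, 1, 1)).half()                 # normalized output
    yf = y.float()
    pooled = (yf * torch.sigmoid(yf)).mean(dim=(2, 3), keepdim=True)
    return y, pooled.half()                              # (N, C, 1, 1) SE input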
# kernel path: /tmp/torchinductor_shunting/l4/cl43m5yr7mr6rofdxkde5hoinowoes7zmspib6uw4wfhtoztyd2j.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_48
triton_poi_fused__to_copy_60 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[8192], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_60(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 4800
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((20, 240, 1, 1), (240, 1, 1, 1), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((20, 240, 1, 1), (240, 1, 1, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_60.run(*args, 4800, grid=grid(4800), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_60.benchmark_all_configs(*args, 4800, grid=grid(4800))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
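# Kernels 60, 61, 63, 64, 67, 73, 77 and 79 below all share this shape: an
# elementwise fp32 -> fp16 cast of a convolution weight or bias
# (aten._to_copy), roughly `dst.copy_(src.to(torch.float16))`; only the
# element counts, size_hints and strides differ.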
# kernel path: /tmp/torchinductor_shunting/n3/cn3yrrvjz2rkdvv5cbgdyj5mk5wgjpljfxyga75evi5tqahwagdd.py
# Original ATen: aten._to_copy, aten.convolution
# aten._to_copy => convert_element_type_47
# aten.convolution => convolution_20
triton_poi_fused__to_copy_convolution_61 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[32], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_convolution_61(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 20
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((20,), (1,), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((20,), (1,), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_convolution_61.run(*args, 20, grid=grid(20), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_convolution_61.benchmark_all_configs(*args, 20, grid=grid(20))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/nz/cnz5cjjblhni4cibgggtjl3kn4ksihmqz6znyn5belgi4q43cncw.py
# Original ATen: aten._to_copy, aten.convolution, aten.silu
# aten._to_copy => convert_element_type_47
# aten.convolution => convolution_20
# aten.silu => convert_element_type_49, convert_element_type_50, mul_79, sigmoid_2
triton_poi_fused__to_copy_convolution_silu_62 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4096], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_convolution_silu_62(in_out_ptr0, in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 2560
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x2 = xindex
x0 = xindex % 20
tmp0 = tl.load(in_out_ptr0 + (x2), xmask).to(tl.float32)
tmp1 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
tmp2 = tmp0 + tmp1
tmp3 = tmp2.to(tl.float32)
tmp4 = tl.sigmoid(tmp3)
tmp5 = tmp3 * tmp4
tmp6 = tmp5.to(tl.float32)
tl.store(in_out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp2, xmask)
tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp6, xmask)
def get_args():
arg_0 = rand_strided((128, 20, 1, 1), (20, 1, 1, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((20,), (1,), device='cuda:0', dtype=torch.float16)
arg_2 = rand_strided((128, 20, 1, 1), (20, 1, 1, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1, arg_2,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_convolution_silu_62.run(*args, 2560, grid=grid(2560), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_convolution_silu_62.benchmark_all_configs(*args, 2560, grid=grid(2560))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
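# In-place bias add plus SiLU for the squeeze-and-excite reduce convolution;
# an illustrative eager equivalent (names are ours), with the sigmoid
# evaluated in fp32 as in the kernel:
def _ref_bias_silu_62(conv_out, bias):
    # conv_out: (N, 20, 1, 1) fp16, mutated like in_out_ptr0; bias: (20,) fp16.
    conv_out += bias.view(1, -1, 1, 1)
    z = conv_out.float()
    return (z * torch.sigmoid(z)).half()     # silu(x) = x * sigmoid(x)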
# kernel path: /tmp/torchinductor_shunting/bl/cblr4zz2jdn5lx7fyq6hto375tcressl73jxrxvozhqbubkoilua.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_52
triton_poi_fused__to_copy_63 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[8192], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_63(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 4800
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((240, 20, 1, 1), (20, 1, 1, 1), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((240, 20, 1, 1), (20, 1, 1, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_63.run(*args, 4800, grid=grid(4800), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_63.benchmark_all_configs(*args, 4800, grid=grid(4800))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/gq/cgqatezdistnqv4yyabd7t76wkyduditccoaqjd3ef4npoxk6vrv.py
# Original ATen: aten._to_copy, aten.convolution
# aten._to_copy => convert_element_type_51
# aten.convolution => convolution_21
triton_poi_fused__to_copy_convolution_64 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[256], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_convolution_64(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 240
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((240,), (1,), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((240,), (1,), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_convolution_64.run(*args, 240, grid=grid(240), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_convolution_64.benchmark_all_configs(*args, 240, grid=grid(240))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/qr/cqrlwq46vnra4itz3o4m3cvwrnyl7srsa7xlwywcqzni2rafclhf.py
# Original ATen: aten._to_copy, aten.convolution
# aten._to_copy => convert_element_type_51
# aten.convolution => convolution_21
triton_poi_fused__to_copy_convolution_65 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[32768], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_convolution_65(in_out_ptr0, in_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 30720
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x2 = xindex
x0 = xindex % 240
tmp0 = tl.load(in_out_ptr0 + (x2), None).to(tl.float32)
tmp1 = tl.load(in_ptr0 + (x0), None).to(tl.float32)
tmp2 = tmp0 + tmp1
tl.store(in_out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp2, None)
def get_args():
arg_0 = rand_strided((128, 240, 1, 1), (240, 1, 1, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((240,), (1,), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_convolution_65.run(*args, 30720, grid=grid(30720), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_convolution_65.benchmark_all_configs(*args, 30720, grid=grid(30720))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
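# Same in-place bias-add pattern as kernel 62, minus the SiLU: the expand
# convolution's 240-element bias is folded into its (128, 240, 1, 1) output,
# roughly `out += bias.view(1, -1, 1, 1)`.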
# kernel path: /tmp/torchinductor_shunting/ru/crubekjujrhilkb2spkvdoiiitif5a5cisikyju6vjbn7j3itjum.py
# Original ATen: aten.mul, aten.sigmoid, aten.silu
# aten.mul => mul_80
# aten.sigmoid => sigmoid_3
# aten.silu => convert_element_type_45, convert_element_type_46, mul_78, sigmoid_1
triton_poi_fused_mul_sigmoid_silu_66 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[33554432], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3), equal_to_1=())]})
@triton.jit
def triton_poi_fused_mul_sigmoid_silu_66(in_ptr0, in_ptr1, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 24084480
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x2 = xindex
x1 = (xindex // 784)
tmp0 = tl.load(in_ptr0 + (x2), None).to(tl.float32)
tmp5 = tl.load(in_ptr1 + (x1), None).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
tmp2 = tl.sigmoid(tmp1)
tmp3 = tmp1 * tmp2
tmp4 = tmp3.to(tl.float32)
tmp6 = tl.sigmoid(tmp5)
tmp7 = tmp4 * tmp6
tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp7, None)
def get_args():
arg_0 = rand_strided((128, 240, 28, 28), (188160, 784, 28, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 240, 1, 1), (240, 1, 1, 1), device='cuda:0', dtype=torch.float16)
arg_2 = rand_strided((128, 240, 28, 28), (188160, 784, 28, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1, arg_2,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_mul_sigmoid_silu_66.run(*args, 24084480, grid=grid(24084480), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_mul_sigmoid_silu_66.benchmark_all_configs(*args, 24084480, grid=grid(24084480))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
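# Squeeze-and-excite gating: SiLU of the main (128, 240, 28, 28) branch is
# scaled by the sigmoid of the per-(N, C) excitation logits, broadcast over
# the 28x28 grid. A rough fp32 sketch (helper name is ours):
def _ref_se_gate_66(x, se_logits):
    xf, sf = x.float(), se_logits.float()
    return (xf * torch.sigmoid(xf) * torch.sigmoid(sf)).half()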
# kernel path: /tmp/torchinductor_shunting/va/cvacncosevkh7a4e4mjytw5gvejhihlmvkwma42dwu2nfhypnbzr.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_53
triton_poi_fused__to_copy_67 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[16384], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_67(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 13440
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((56, 240, 1, 1), (240, 1, 1, 1), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((56, 240, 1, 1), (240, 1, 1, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_67.run(*args, 13440, grid=grid(13440), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_67.benchmark_all_configs(*args, 13440, grid=grid(13440))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/hv/chvkvxc6xaoyzdfyhbbbolrujmg4wuzt7gkh6x4thkny6u2hr4gg.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => convert_element_type_54, var_mean_11
triton_red_fused__native_batch_norm_legit_functional_68 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@reduction(
size_hints=[1024, 8192],
reduction_hint=ReductionHint.INNER,
filename=__file__,
meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32', 3: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_68(in_ptr0, out_ptr0, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
xnumel = 728
rnumel = 7720
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
xmask = xindex < xnumel
rbase = tl.arange(0, RBLOCK)[None, :]
x1 = (xindex // 56)
x0 = xindex % 56
_tmp6 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
x3 = xindex
for roffset in range(0, rnumel, RBLOCK):
rindex = roffset + rbase
rmask = rindex < rnumel
r2 = rindex
tmp0 = r2 + (7720*x1)
tmp1 = 100352
tmp2 = tmp0 < tmp1
tmp3 = tl.load(in_ptr0 + ((784*x0) + (43904*(((r2 + (7720*x1)) // 784) % 128)) + ((r2 + (7720*x1)) % 784) + tl.zeros([XBLOCK, RBLOCK], tl.int32)), rmask & tmp2 & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp4 = tmp3.to(tl.float32)
tmp5 = tl.where(tmp2, tmp4, 0)
_tmp6 = tl.where(rmask & xmask, _tmp6 + tmp5, _tmp6)
tmp6 = tl.sum(_tmp6, 1)[:, None]
tl.store(out_ptr0 + x3, tmp6, xmask)
def get_args():
arg_0 = rand_strided((128, 56, 28, 28), (43904, 784, 28, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((1, 56, 1, 1, 13), (728, 1, 728, 728, 56), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_red_fused__native_batch_norm_legit_functional_68.run(*args, 728, 7720, grid=grid(728), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_red_fused__native_batch_norm_legit_functional_68.benchmark_all_configs(*args, 728, 7720, grid=grid(728))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
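# First stage of a split batch-norm reduction over N*H*W = 128 * 784 = 100352
# elements per channel: the sum is split 13 ways, so each of the 728 = 56
# channels x 13 splits programs accumulates 7720 elements, and the
# `tmp0 < 100352` guard masks the 8-element tail of the last split
# (13 * 7720 = 100360). A sketch of the same partial sums (layout here is
# (C, 13) rather than the kernel's channel-fastest (1, 56, 1, 1, 13)):
def _ref_split_sum_68(x, splits=13, split_len=7720):
    c = x.size(1)
    flat = x.float().transpose(0, 1).reshape(c, -1)      # (C, N*H*W)
    pad = splits * split_len - flat.size(1)              # 8 zeros at the tail
    flat = torch.nn.functional.pad(flat, (0, pad))
    return flat.reshape(c, splits, split_len).sum(-1)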
# kernel path: /tmp/torchinductor_shunting/ih/cihgiw3fxfncw6wmlie2fnq4egrnxl6aq2aj37mt4r5kpoemk2yw.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => add_59, convert_element_type_54, mul_82, mul_83, var_mean_11
triton_per_fused__native_batch_norm_legit_functional_69 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import persistent_reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@persistent_reduction(
size_hints=[64, 16],
reduction_hint=ReductionHint.INNER,
filename=__file__,
meta={'signature': {0: '*fp32', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: 'i32', 5: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3), equal_to_1=())]}
)
@triton.jit
def triton_per_fused__native_batch_norm_legit_functional_69(in_out_ptr0, in_ptr0, in_ptr1, out_ptr0, xnumel, rnumel, XBLOCK : tl.constexpr):
xnumel = 56
rnumel = 13
RBLOCK: tl.constexpr = 16
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
xmask = xindex < xnumel
rindex = tl.arange(0, RBLOCK)[None, :]
rmask = rindex < rnumel
r1 = rindex
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0 + (56*r1)), rmask & xmask, other=0)
tmp8 = tl.load(in_ptr1 + (x0), xmask)
tmp2 = tl.where(rmask & xmask, tmp0, 0)
tmp3 = tl.sum(tmp2, 1)[:, None]
tmp4 = 100352.0
tmp5 = tmp3 / tmp4
tmp6 = 0.1
tmp7 = tmp5 * tmp6
tmp9 = 0.9
tmp10 = tmp8 * tmp9
tmp11 = tmp7 + tmp10
tl.store(in_out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp5, xmask)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp11, xmask)
def get_args():
arg_0 = rand_strided((1, 56, 1, 1), (56, 1, 56, 56), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((1, 56, 1, 1, 13), (728, 1, 728, 728, 56), device='cuda:0', dtype=torch.float32)
arg_2 = rand_strided((56,), (1,), device='cuda:0', dtype=torch.float32)
arg_3 = rand_strided((56,), (1,), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1, arg_2, arg_3,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_per_fused__native_batch_norm_legit_functional_69.run(*args, 56, 13, grid=grid(56), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_per_fused__native_batch_norm_legit_functional_69.benchmark_all_configs(*args, 56, 13, grid=grid(56))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
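# Second stage: the 13 partials per channel collapse to the batch mean
# (sum / 100352), stored back in place, and the running mean is updated with
# momentum 0.1, i.e. running = 0.1 * batch_mean + 0.9 * running. In plain
# PyTorch (names are ours):
def _ref_running_mean_69(partials, running_mean, n=100352.0, momentum=0.1):
    batch_mean = partials.sum(-1) / n
    return batch_mean, momentum * batch_mean + (1.0 - momentum) * running_mean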
# kernel path: /tmp/torchinductor_shunting/n3/cn3m3v5dwzlicmheht7t2sqzrzgtbufmioizlp6yemcka6kewuok.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => convert_element_type_54, var_mean_11
triton_red_fused__native_batch_norm_legit_functional_70 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@reduction(
size_hints=[1024, 8192],
reduction_hint=ReductionHint.INNER,
filename=__file__,
meta={'signature': {0: '*fp16', 1: '*fp32', 2: '*fp32', 3: 'i32', 4: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_70(in_ptr0, in_ptr1, out_ptr0, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
xnumel = 728
rnumel = 7720
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
xmask = xindex < xnumel
rbase = tl.arange(0, RBLOCK)[None, :]
x1 = (xindex // 56)
x0 = xindex % 56
_tmp9 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
x3 = xindex
for roffset in range(0, rnumel, RBLOCK):
rindex = roffset + rbase
rmask = rindex < rnumel
r2 = rindex
tmp0 = r2 + (7720*x1)
tmp1 = 100352
tmp2 = tmp0 < tmp1
tmp3 = tl.load(in_ptr0 + ((784*x0) + (43904*(((r2 + (7720*x1)) // 784) % 128)) + ((r2 + (7720*x1)) % 784) + tl.zeros([XBLOCK, RBLOCK], tl.int32)), rmask & tmp2 & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp4 = tmp3.to(tl.float32)
tmp5 = tl.load(in_ptr1 + (x0 + tl.zeros([XBLOCK, RBLOCK], tl.int32)), rmask & tmp2 & xmask, eviction_policy='evict_last', other=0)
tmp6 = tmp4 - tmp5
tmp7 = tmp6 * tmp6
tmp8 = tl.where(tmp2, tmp7, 0)
_tmp9 = tl.where(rmask & xmask, _tmp9 + tmp8, _tmp9)
tmp9 = tl.sum(_tmp9, 1)[:, None]
tl.store(out_ptr0 + x3, tmp9, xmask)
def get_args():
arg_0 = rand_strided((128, 56, 28, 28), (43904, 784, 28, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((1, 56, 1, 1), (56, 1, 56, 56), device='cuda:0', dtype=torch.float32)
arg_2 = rand_strided((1, 56, 1, 1, 13), (728, 1, 728, 728, 56), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1, arg_2,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_red_fused__native_batch_norm_legit_functional_70.run(*args, 728, 7720, grid=grid(728), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_red_fused__native_batch_norm_legit_functional_70.benchmark_all_configs(*args, 728, 7720, grid=grid(728))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
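# Variance counterpart of kernel 68: the same 13-way split over 100352
# elements per channel, but each program accumulates squared deviations
# (x - batch_mean)^2 against the mean produced by kernel 69.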
# kernel path: /tmp/torchinductor_shunting/xv/cxvq5i34k2bihobv4ebll2eevqva3mot7bow45ecoiyqxocbuesz.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => add_58, add_60, convert_element_type_54, mul_84, mul_85, mul_86, rsqrt_11, squeeze_34, var_mean_11
triton_per_fused__native_batch_norm_legit_functional_71 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import persistent_reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@persistent_reduction(
size_hints=[64, 16],
reduction_hint=ReductionHint.INNER,
filename=__file__,
meta={'signature': {0: '*fp32', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: 'i32', 6: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4), equal_to_1=())]}
)
@triton.jit
def triton_per_fused__native_batch_norm_legit_functional_71(in_ptr0, in_ptr1, out_ptr0, out_ptr1, out_ptr2, xnumel, rnumel, XBLOCK : tl.constexpr):
xnumel = 56
rnumel = 13
RBLOCK: tl.constexpr = 16
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
xmask = xindex < xnumel
rindex = tl.arange(0, RBLOCK)[None, :]
rmask = rindex < rnumel
r1 = rindex
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0 + (56*r1)), rmask & xmask, other=0)
tmp13 = tl.load(in_ptr1 + (x0), xmask)
tmp2 = tl.where(rmask & xmask, tmp0, 0)
tmp3 = tl.sum(tmp2, 1)[:, None]
tmp4 = 100352.0
tmp5 = tmp3 / tmp4
tmp6 = 1e-05
tmp7 = tmp5 + tmp6
tmp8 = tl.math.rsqrt(tmp7)
tmp9 = 1.00000996502277
tmp10 = tmp5 * tmp9
tmp11 = 0.1
tmp12 = tmp10 * tmp11
tmp14 = 0.9
tmp15 = tmp13 * tmp14
tmp16 = tmp12 + tmp15
tl.store(out_ptr1 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp8, xmask)
tl.store(out_ptr2 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp16, xmask)
tl.store(out_ptr0 + x0, tmp3, xmask)
def get_args():
arg_0 = rand_strided((1, 56, 1, 1, 13), (728, 1, 728, 728, 56), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((56,), (1,), device='cuda:0', dtype=torch.float32)
arg_2 = rand_strided((1, 56, 1, 1), (56, 1, 56, 56), device='cuda:0', dtype=torch.float32)
arg_3 = rand_strided((56,), (1,), device='cuda:0', dtype=torch.float32)
arg_4 = rand_strided((56,), (1,), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1, arg_2, arg_3, arg_4,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_per_fused__native_batch_norm_legit_functional_71.run(*args, 56, 13, grid=grid(56), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_per_fused__native_batch_norm_legit_functional_71.benchmark_all_configs(*args, 56, 13, grid=grid(56))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
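# Final stats stage: biased variance = sum / 100352, invstd =
# rsqrt(variance + 1e-05), and the running variance is updated from the
# unbiased estimate, hence the constant 1.00000996502277 = 100352 / 100351
# (Bessel's correction n / (n - 1)):
#   running_var = 0.1 * (var * n / (n - 1)) + 0.9 * running_var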
# kernel path: /tmp/torchinductor_shunting/jw/cjwfycxx2mrgx2fsw6tfls7yyigmo7hmavgpkr2ob23aopbzsnjc.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => add_58, add_61, convert_element_type_54, convert_element_type_55, mul_81, mul_87, rsqrt_11, sub_11, var_mean_11
triton_poi_fused__native_batch_norm_legit_functional_72 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[8388608], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp16', 6: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6), equal_to_1=())]})
@triton.jit
def triton_poi_fused__native_batch_norm_legit_functional_72(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 5619712
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x3 = xindex
x1 = (xindex // 784) % 56
tmp0 = tl.load(in_ptr0 + (x3), None).to(tl.float32)
tmp2 = tl.load(in_ptr1 + (x1), None)
tmp4 = tl.load(in_ptr2 + (x1), None)
tmp11 = tl.load(in_ptr3 + (x1), None)
tmp13 = tl.load(in_ptr4 + (x1), None)
tmp1 = tmp0.to(tl.float32)
tmp3 = tmp1 - tmp2
tmp5 = 100352.0
tmp6 = tmp4 / tmp5
tmp7 = 1e-05
tmp8 = tmp6 + tmp7
tmp9 = tl.math.rsqrt(tmp8)
tmp10 = tmp3 * tmp9
tmp12 = tmp10 * tmp11
tmp14 = tmp12 + tmp13
tmp15 = tmp14.to(tl.float32)
tl.store(out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp15, None)
def get_args():
arg_0 = rand_strided((128, 56, 28, 28), (43904, 784, 28, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((1, 56, 1, 1), (56, 1, 56, 56), device='cuda:0', dtype=torch.float32)
arg_2 = rand_strided((1, 56, 1, 1), (56, 1, 56, 56), device='cuda:0', dtype=torch.float32)
arg_3 = rand_strided((56,), (1,), device='cuda:0', dtype=torch.float32)
arg_4 = rand_strided((56,), (1,), device='cuda:0', dtype=torch.float32)
arg_5 = rand_strided((128, 56, 28, 28), (43904, 784, 28, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__native_batch_norm_legit_functional_72.run(*args, 5619712, grid=grid(5619712), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__native_batch_norm_legit_functional_72.benchmark_all_configs(*args, 5619712, grid=grid(5619712))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
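# Elementwise application of the statistics from kernels 68-71 to the
# (128, 56, 28, 28) activation (no activation function follows this
# projection). An approximate eager form (helper is ours):
def _ref_bn_apply_72(x, mean, var_sum, weight, bias, eps=1e-05, n=100352):
    inv_std = torch.rsqrt(var_sum / n + eps)
    return ((x.float() - mean) * inv_std * weight.view(1, -1, 1, 1)
            + bias.view(1, -1, 1, 1)).half()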
# kernel path: /tmp/torchinductor_shunting/7e/c7eyndtglwpd6eg2g74cubgx2m45rn7sikd7mb4t4ucitntos24j.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_56
triton_poi_fused__to_copy_73 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[8192], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_73(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 4704
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((168, 28, 1, 1), (28, 1, 1, 1), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((168, 28, 1, 1), (28, 1, 1, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_73.run(*args, 4704, grid=grid(4704), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_73.benchmark_all_configs(*args, 4704, grid=grid(4704))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/g7/cg7zj3gxcfnefxp4hvh24f2ahem4qdf4cwmaspxgokny23kr7rdc.py
# Original ATen: aten.cat
# aten.cat => cat_6
triton_poi_fused_cat_74 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[33554432], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_cat_74(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 16859136
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x2 = xindex
x0 = xindex % 131712
x1 = (xindex // 131712)
tmp0 = tl.load(in_ptr0 + (x2), None).to(tl.float32)
tl.store(out_ptr0 + (x0 + (263424*x1) + tl.zeros([XBLOCK], tl.int32)), tmp0, None)
def get_args():
arg_0 = rand_strided((128, 168, 28, 28), (131712, 784, 28, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 168, 28, 28), (263424, 784, 28, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_cat_74.run(*args, 16859136, grid=grid(16859136), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_cat_74.benchmark_all_configs(*args, 16859136, grid=grid(16859136))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
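# Despite the aten.cat label, this is only the copy half of the concat: the
# 168-channel tensor lands in channels [0, 168) of a preallocated 336-channel
# buffer (destination batch stride 263424 = 2 * 131712); a sibling kernel
# fills channels [168, 336).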
# kernel path: /tmp/torchinductor_shunting/ea/cea2p3bterhsxcizlwjuqgqdcdagptpjutjfoisqqtcxx4bo46r2.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => add_63, add_64, add_65, convert_element_type_58, mul_89, mul_90, mul_91, mul_92, mul_93, rsqrt_12, squeeze_37, var_mean_12
triton_red_fused__native_batch_norm_legit_functional_75 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@reduction(
size_hints=[512, 131072],
reduction_hint=ReductionHint.INNER,
filename=__file__,
meta={'signature': {0: '*fp32', 1: '*fp16', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: 'i32', 9: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_75(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, out_ptr0, out_ptr1, out_ptr2, out_ptr3, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
xnumel = 336
rnumel = 100352
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
xmask = xindex < xnumel
rbase = tl.arange(0, RBLOCK)[None, :]
x0 = xindex
_tmp2 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
for roffset in range(0, rnumel, RBLOCK):
rindex = roffset + rbase
rmask = rindex < rnumel
r1 = rindex % 784
r2 = (rindex // 784)
tmp0 = tl.load(in_ptr0 + (r1 + (784*x0) + (263424*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
_tmp2 = tl.where(rmask & xmask, _tmp2 + tmp1, _tmp2)
tmp2 = tl.sum(_tmp2, 1)[:, None]
tmp7 = tl.load(in_ptr1 + (x0), xmask)
tmp3 = 100352.0
tmp4 = tmp2 / tmp3
tmp5 = 0.1
tmp6 = tmp4 * tmp5
tmp8 = 0.9
tmp9 = tmp7 * tmp8
tmp10 = tmp6 + tmp9
tl.store(in_out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp4, xmask)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp10, xmask)
_tmp15 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
for roffset in range(0, rnumel, RBLOCK):
rindex = roffset + rbase
rmask = rindex < rnumel
r1 = rindex % 784
r2 = (rindex // 784)
tmp11 = tl.load(in_ptr0 + (r1 + (784*x0) + (263424*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp12 = tmp11.to(tl.float32)
tmp13 = tmp12 - tmp4
tmp14 = tmp13 * tmp13
_tmp15 = tl.where(rmask & xmask, _tmp15 + tmp14, _tmp15)
tmp15 = tl.sum(_tmp15, 1)[:, None]
tl.store(out_ptr1 + x0, tmp15, xmask)
tmp25 = tl.load(in_ptr2 + (x0), xmask)
tmp16 = 100352.0
tmp17 = tmp15 / tmp16
tmp18 = 1e-05
tmp19 = tmp17 + tmp18
tmp20 = tl.math.rsqrt(tmp19)
tmp21 = 1.00000996502277
tmp22 = tmp17 * tmp21
tmp23 = 0.1
tmp24 = tmp22 * tmp23
tmp26 = 0.9
tmp27 = tmp25 * tmp26
tmp28 = tmp24 + tmp27
tl.store(out_ptr2 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp20, xmask)
tl.store(out_ptr3 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp28, xmask)
def get_args():
arg_0 = rand_strided((1, 336, 1, 1), (336, 1, 336, 336), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((128, 336, 28, 28), (263424, 784, 28, 1), device='cuda:0', dtype=torch.float16)
arg_2 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32)
arg_3 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32)
arg_4 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32)
arg_5 = rand_strided((1, 336, 1, 1), (336, 1, 336, 336), device='cuda:0', dtype=torch.float32)
arg_6 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32)
arg_7 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_red_fused__native_batch_norm_legit_functional_75.run(*args, 336, 100352, grid=grid(336), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_red_fused__native_batch_norm_legit_functional_75.benchmark_all_configs(*args, 336, 100352, grid=grid(336))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
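# Unlike the 68/69/70/71 pipeline, this kernel produces the full batch-norm
# statistics for the 336-channel tensor in a single launch: the first loop
# over all 100352 elements per channel accumulates the sum (mean and
# running-mean update), and the second loop re-reads the input to accumulate
# squared deviations (variance, invstd, and the running-var update with the
# same 100352 / 100351 Bessel factor). One program per channel, so no split
# stage is required.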
# kernel path: /tmp/torchinductor_shunting/3o/c3odnwpkqtnjm6osw5x7eu5soiuoiv27n2m7cqobgwum2f4c5rgk.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.add, aten.clone, aten.fill, aten.mul, aten.sigmoid, aten.sub
# aten._native_batch_norm_legit_functional => add_63, add_66, convert_element_type_58, convert_element_type_59, mul_88, mul_94, rsqrt_12, sub_12, var_mean_12
# aten.add => add_374
# aten.clone => clone_3
# aten.fill => full_like_44
# aten.mul => mul_1067, mul_1068
# aten.sigmoid => sigmoid_108
# aten.sub => sub_297
triton_poi_fused__native_batch_norm_legit_functional_add_clone_fill_mul_sigmoid_sub_76 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[67108864], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp16', 6: '*fp16', 7: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7), equal_to_1=())]})
@triton.jit
def triton_poi_fused__native_batch_norm_legit_functional_add_clone_fill_mul_sigmoid_sub_76(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, out_ptr0, out_ptr1, xnumel, XBLOCK : tl.constexpr):
xnumel = 33718272
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x3 = xindex
x1 = (xindex // 784) % 336
tmp0 = tl.load(in_ptr0 + (x3), None).to(tl.float32)
tmp2 = tl.load(in_ptr1 + (x1), None)
tmp4 = tl.load(in_ptr2 + (x1), None)
tmp11 = tl.load(in_ptr3 + (x1), None)
tmp13 = tl.load(in_ptr4 + (x1), None)
tmp1 = tmp0.to(tl.float32)
tmp3 = tmp1 - tmp2
tmp5 = 100352.0
tmp6 = tmp4 / tmp5
tmp7 = 1e-05
tmp8 = tmp6 + tmp7
tmp9 = tl.math.rsqrt(tmp8)
tmp10 = tmp3 * tmp9
tmp12 = tmp10 * tmp11
tmp14 = tmp12 + tmp13
tmp15 = tmp14.to(tl.float32)
tmp16 = tl.sigmoid(tmp15)
tmp17 = 1.0
tmp18 = tmp17 - tmp16
tmp19 = tmp15 * tmp18
tmp20 = tmp19 + tmp17
tmp21 = tmp16 * tmp20
tl.store(out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp15, None)
tl.store(out_ptr1 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp21, None)
def get_args():
arg_0 = rand_strided((128, 336, 28, 28), (263424, 784, 28, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((1, 336, 1, 1), (336, 1, 336, 336), device='cuda:0', dtype=torch.float32)
arg_2 = rand_strided((1, 336, 1, 1), (336, 1, 336, 336), device='cuda:0', dtype=torch.float32)
arg_3 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32)
arg_4 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32)
arg_5 = rand_strided((128, 336, 28, 28), (263424, 784, 28, 1), device='cuda:0', dtype=torch.float16)
arg_6 = rand_strided((128, 336, 28, 28), (263424, 784, 28, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__native_batch_norm_legit_functional_add_clone_fill_mul_sigmoid_sub_76.run(*args, 33718272, grid=grid(33718272), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__native_batch_norm_legit_functional_add_clone_fill_mul_sigmoid_sub_76.benchmark_all_configs(*args, 33718272, grid=grid(33718272))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
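# Besides the normalized output, this kernel precomputes the SiLU derivative
# needed by the backward pass (the clone/fill/mul/sigmoid/sub ops in the
# fused-node comment): with s = sigmoid(y),
#   d/dy silu(y) = s + y * s * (1 - s) = s * (1 + y * (1 - s)),
# which is exactly the tmp21 = tmp16 * tmp20 chain above.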
# kernel path: /tmp/torchinductor_shunting/nw/cnwbbjysyivhwwej3cvtzavdj6jvinxuti5ndhcg3qzuf77glcrt.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_62
triton_poi_fused__to_copy_77 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[2048], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_77(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 1512
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((168, 1, 3, 3), (9, 9, 3, 1), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((168, 1, 3, 3), (9, 9, 3, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_77.run(*args, 1512, grid=grid(1512), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_77.benchmark_all_configs(*args, 1512, grid=grid(1512))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/fz/cfzmnx3o626lf5s63xg32x6is4tmygw4nxdhdigpofuif52vxrzm.py
# Original ATen: aten.split_with_sizes
# aten.split_with_sizes => getitem_78
triton_poi_fused_split_with_sizes_78 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[33554432], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_split_with_sizes_78(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 16859136
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex % 131712
x1 = (xindex // 131712)
x2 = xindex
tmp0 = tl.load(in_ptr0 + (x0 + (263424*x1)), None).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
tmp2 = tl.sigmoid(tmp1)
tmp3 = tmp1 * tmp2
tmp4 = tmp3.to(tl.float32)
tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp4, None)
def get_args():
arg_0 = rand_strided((128, 336, 28, 28), (263424, 784, 28, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 168, 28, 28), (131712, 784, 28, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_split_with_sizes_78.run(*args, 16859136, grid=grid(16859136), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_split_with_sizes_78.benchmark_all_configs(*args, 16859136, grid=grid(16859136))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
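# Inverse of the copy in kernel 74: reads channels [0, 168) of the
# 336-channel tensor (offset 0 within the 263424 per-sample stride) and
# applies SiLU on the way out; kernel 80 below does the same for channels
# [168, 336), starting at offset 131712.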
# kernel path: /tmp/torchinductor_shunting/2r/c2rv7mzu7my2opxvcm4z3utkh7llovacnk6pwuzns4ehc3sgey2e.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_63
triton_poi_fused__to_copy_79 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[8192], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_79(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 4200
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((168, 1, 5, 5), (25, 25, 5, 1), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((168, 1, 5, 5), (25, 25, 5, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_79.run(*args, 4200, grid=grid(4200), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_79.benchmark_all_configs(*args, 4200, grid=grid(4200))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/pt/cptqmg7k6n2us7i5wywz2bwg6fjwqyaknishdkved5rkdpml5nw5.py
# Original ATen: aten.split_with_sizes
# aten.split_with_sizes => getitem_81
triton_poi_fused_split_with_sizes_80 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[33554432], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_split_with_sizes_80(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 16859136
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex % 131712
x1 = (xindex // 131712)
x2 = xindex
    tmp0 = tl.load(in_ptr0 + (131712 + x0 + (263424*x1)), None).to(tl.float32)  # offset 131712 = 168 * 784: second chunk of the channel split
tmp1 = tmp0.to(tl.float32)
tmp2 = tl.sigmoid(tmp1)
tmp3 = tmp1 * tmp2
tmp4 = tmp3.to(tl.float32)
tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp4, None)
def get_args():
arg_0 = rand_strided((128, 336, 28, 28), (263424, 784, 28, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 168, 28, 28), (131712, 784, 28, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_split_with_sizes_80.run(*args, 16859136, grid=grid(16859136), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_split_with_sizes_80.benchmark_all_configs(*args, 16859136, grid=grid(16859136))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
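# NOTE (editorial): a hedged eager-mode sketch of triton_poi_fused_split_with_sizes_80.
# The load offset 131712 equals 168 * 28 * 28, so the kernel reads the second
# 168-channel chunk of a (128, 336, 28, 28) activation and fuses SiLU into the copy:
def _sketch_split_silu(x):
    import torch.nn.functional as F
    _, hi = x.split([168, 168], dim=1)    # getitem_81 of the split_with_sizes
    return F.silu(hi).contiguous()        # silu(x) = x * sigmoid(x), materialized contiguously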
# kernel path: /tmp/torchinductor_shunting/jj/cjj6wuzjfwt7xigucunb4hdetjt4mr4ldxzq5ab4kxddbetbjdu4.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.mean, aten.silu
# aten._native_batch_norm_legit_functional => add_68, add_71, convert_element_type_64, convert_element_type_65, mul_102, mul_96, rsqrt_13, sub_13, var_mean_13
# aten.mean => mean_1
# aten.silu => convert_element_type_66, convert_element_type_67, mul_103, sigmoid_5
triton_per_fused__native_batch_norm_legit_functional_mean_silu_81 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import persistent_reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@persistent_reduction(
size_hints=[65536, 1024],
reduction_hint=ReductionHint.INNER,
filename=__file__,
meta={'signature': {0: '*fp16', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp16', 6: '*fp16', 7: 'i32', 8: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8), equal_to_1=())]}
)
@triton.jit
def triton_per_fused__native_batch_norm_legit_functional_mean_silu_81(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, out_ptr0, out_ptr2, xnumel, rnumel, XBLOCK : tl.constexpr):
xnumel = 43008
rnumel = 784
RBLOCK: tl.constexpr = 1024
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
xmask = xindex < xnumel
rindex = tl.arange(0, RBLOCK)[None, :]
rmask = rindex < rnumel
r2 = rindex
x3 = xindex
x0 = xindex % 336
tmp0 = tl.load(in_ptr0 + (r2 + (784*x3)), rmask, other=0).to(tl.float32)
tmp2 = tl.load(in_ptr1 + (x0), None)
tmp4 = tl.load(in_ptr2 + (x0), None)
tmp11 = tl.load(in_ptr3 + (x0), None)
tmp13 = tl.load(in_ptr4 + (x0), None)
tmp1 = tmp0.to(tl.float32)
tmp3 = tmp1 - tmp2
    tmp5 = 100352.0  # 128 * 28 * 28: elements per channel in the variance reduction
tmp6 = tmp4 / tmp5
tmp7 = 1e-05
tmp8 = tmp6 + tmp7
tmp9 = tl.math.rsqrt(tmp8)
tmp10 = tmp3 * tmp9
tmp12 = tmp10 * tmp11
tmp14 = tmp12 + tmp13
tmp15 = tmp14.to(tl.float32)
tmp16 = tmp15.to(tl.float32)
tmp17 = tl.sigmoid(tmp16)
tmp18 = tmp16 * tmp17
tmp19 = tmp18.to(tl.float32)
tmp20 = tmp19.to(tl.float32)
tmp22 = tl.where(rmask, tmp20, 0)
tmp23 = tl.sum(tmp22, 1)[:, None]
    tmp24 = 784.0  # 28 * 28: spatial positions in the per-(n, c) mean
tmp25 = tmp23 / tmp24
tmp26 = tmp25.to(tl.float32)
tl.store(out_ptr0 + (r2 + (784*x3) + tl.zeros([XBLOCK, RBLOCK], tl.int32)), tmp15, rmask)
tl.store(out_ptr2 + (x3 + tl.zeros([XBLOCK, 1], tl.int32)), tmp26, None)
def get_args():
arg_0 = rand_strided((128, 336, 28, 28), (263424, 784, 28, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((1, 336, 1, 1), (336, 1, 336, 336), device='cuda:0', dtype=torch.float32)
arg_2 = rand_strided((1, 336, 1, 1), (336, 1, 336, 336), device='cuda:0', dtype=torch.float32)
arg_3 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32)
arg_4 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32)
arg_5 = rand_strided((128, 336, 28, 28), (263424, 784, 28, 1), device='cuda:0', dtype=torch.float16)
arg_6 = rand_strided((128, 336, 1, 1), (336, 1, 1, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_per_fused__native_batch_norm_legit_functional_mean_silu_81.run(*args, 43008, 784, grid=grid(43008), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_per_fused__native_batch_norm_legit_functional_mean_silu_81.benchmark_all_configs(*args, 43008, 784, grid=grid(43008))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
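# NOTE (editorial): triton_per_fused__native_batch_norm_legit_functional_mean_silu_81
# normalizes with batch statistics produced by an earlier reduction kernel, writes the
# fp16 batch-norm output, and in the same pass computes the spatial mean of its SiLU
# (judging by the shapes, the global-average pool feeding a squeeze-and-excitation
# branch). A hedged sketch with names of our choosing:
def _sketch_bn_silu_mean(x, mean, var_sum, weight, bias):
    import torch.nn.functional as F
    var = var_sum / (128 * 28 * 28)                      # summed squared deviations -> variance
    y = (x.float() - mean) * torch.rsqrt(var + 1e-05)    # normalize
    y = (y * weight.view(1, -1, 1, 1) + bias.view(1, -1, 1, 1)).half()   # out_ptr0
    pooled = F.silu(y.float()).mean(dim=(2, 3), keepdim=True).half()     # out_ptr2
    return y, pooled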
# kernel path: /tmp/torchinductor_shunting/qi/cqigsniw4k2557tsg6szy4sxc76yhclqkbczo3sje6z3gkbkxim2.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_69
triton_poi_fused__to_copy_82 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[16384], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_82(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 9408
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((28, 336, 1, 1), (336, 1, 1, 1), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((28, 336, 1, 1), (336, 1, 1, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_82.run(*args, 9408, grid=grid(9408), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_82.benchmark_all_configs(*args, 9408, grid=grid(9408))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/qr/cqrzuzcrjaui22653nognmjx7khvfhrb54plybvtrgjuggwhd7ca.py
# Original ATen: aten._to_copy, aten.convolution
# aten._to_copy => convert_element_type_68
# aten.convolution => convolution_27
triton_poi_fused__to_copy_convolution_83 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[32], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_convolution_83(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 28
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((28,), (1,), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((28,), (1,), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_convolution_83.run(*args, 28, grid=grid(28), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_convolution_83.benchmark_all_configs(*args, 28, grid=grid(28))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/5j/c5jenet4bcjhmp6nmplvispvr7rwhrt4fonprb4atbj2oqpb6yhc.py
# Original ATen: aten._to_copy, aten.convolution, aten.silu
# aten._to_copy => convert_element_type_68
# aten.convolution => convolution_27
# aten.silu => convert_element_type_70, convert_element_type_71, mul_104, sigmoid_6
triton_poi_fused__to_copy_convolution_silu_84 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4096], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_convolution_silu_84(in_out_ptr0, in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 3584
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x2 = xindex
x0 = xindex % 28
tmp0 = tl.load(in_out_ptr0 + (x2), xmask).to(tl.float32)
tmp1 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
tmp2 = tmp0 + tmp1
tmp3 = tmp2.to(tl.float32)
tmp4 = tl.sigmoid(tmp3)
tmp5 = tmp3 * tmp4
tmp6 = tmp5.to(tl.float32)
tl.store(in_out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp2, xmask)
tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp6, xmask)
def get_args():
arg_0 = rand_strided((128, 28, 1, 1), (28, 1, 1, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((28,), (1,), device='cuda:0', dtype=torch.float16)
arg_2 = rand_strided((128, 28, 1, 1), (28, 1, 1, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1, arg_2,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_convolution_silu_84.run(*args, 3584, grid=grid(3584), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_convolution_silu_84.benchmark_all_configs(*args, 3584, grid=grid(3584))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
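# NOTE (editorial): triton_poi_fused__to_copy_convolution_silu_84 is a convolution
# epilogue: it folds the bias add into the buffer in place and also emits SiLU of the
# result (the shapes suggest the 336 -> 28 squeeze step of a squeeze-and-excitation
# block). A hedged sketch (names ours):
def _sketch_bias_silu(conv_out, bias):
    import torch.nn.functional as F
    conv_out += bias.view(1, -1, 1, 1)    # in_out_ptr0 is mutated in place
    return conv_out, F.silu(conv_out)     # (updated buffer, out_ptr0)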
# kernel path: /tmp/torchinductor_shunting/6q/c6qndeyorjgk72ipb7s7xexoyu3w64lakoglh4m45doekpaj4mk5.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_73
triton_poi_fused__to_copy_85 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[16384], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_85(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 9408
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((336, 28, 1, 1), (28, 1, 1, 1), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((336, 28, 1, 1), (28, 1, 1, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_85.run(*args, 9408, grid=grid(9408), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_85.benchmark_all_configs(*args, 9408, grid=grid(9408))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/e4/ce4gppgjp566e4o7bpfsqwx6ehxfdqd5kcqjzpnjkl7lictvy34j.py
# Original ATen: aten._to_copy, aten.convolution
# aten._to_copy => convert_element_type_72
# aten.convolution => convolution_28
triton_poi_fused__to_copy_convolution_86 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[512], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_convolution_86(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 336
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_convolution_86.run(*args, 336, grid=grid(336), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_convolution_86.benchmark_all_configs(*args, 336, grid=grid(336))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/6n/c6nqtrrohcuy7rc3a6jzkibcrswmbwqy32tjb2t44xhm5lv3c7ca.py
# Original ATen: aten._to_copy, aten.convolution
# aten._to_copy => convert_element_type_72
# aten.convolution => convolution_28
triton_poi_fused__to_copy_convolution_87 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[65536], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_convolution_87(in_out_ptr0, in_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 43008
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x2 = xindex
x0 = xindex % 336
tmp0 = tl.load(in_out_ptr0 + (x2), None).to(tl.float32)
tmp1 = tl.load(in_ptr0 + (x0), None).to(tl.float32)
tmp2 = tmp0 + tmp1
tl.store(in_out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp2, None)
def get_args():
arg_0 = rand_strided((128, 336, 1, 1), (336, 1, 1, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_convolution_87.run(*args, 43008, grid=grid(43008), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_convolution_87.benchmark_all_configs(*args, 43008, grid=grid(43008))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/tg/ctg6frnekhgugkbeugokpbetskj6nj7dyov4tmhfv6akngrmr3zz.py
# Original ATen: aten.mul, aten.sigmoid, aten.silu
# aten.mul => mul_105
# aten.sigmoid => sigmoid_7
# aten.silu => convert_element_type_66, convert_element_type_67, mul_103, sigmoid_5
triton_poi_fused_mul_sigmoid_silu_88 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[67108864], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3), equal_to_1=())]})
@triton.jit
def triton_poi_fused_mul_sigmoid_silu_88(in_ptr0, in_ptr1, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 33718272
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x2 = xindex
x1 = (xindex // 784)
tmp0 = tl.load(in_ptr0 + (x2), None).to(tl.float32)
tmp5 = tl.load(in_ptr1 + (x1), None).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
tmp2 = tl.sigmoid(tmp1)
tmp3 = tmp1 * tmp2
tmp4 = tmp3.to(tl.float32)
tmp6 = tl.sigmoid(tmp5)
tmp7 = tmp4 * tmp6
tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp7, None)
def get_args():
arg_0 = rand_strided((128, 336, 28, 28), (263424, 784, 28, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 336, 1, 1), (336, 1, 1, 1), device='cuda:0', dtype=torch.float16)
arg_2 = rand_strided((128, 336, 28, 28), (263424, 784, 28, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1, arg_2,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_mul_sigmoid_silu_88.run(*args, 33718272, grid=grid(33718272), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_mul_sigmoid_silu_88.benchmark_all_configs(*args, 33718272, grid=grid(33718272))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
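# NOTE (editorial): triton_poi_fused_mul_sigmoid_silu_88 recomputes SiLU of the main
# (128, 336, 28, 28) activation and scales it by the sigmoid of the per-channel
# excitation, i.e. the squeeze-and-excitation gating multiply. A hedged sketch
# (name ours):
def _sketch_se_gate(x, se):
    import torch.nn.functional as F
    # se has shape (128, 336, 1, 1) and broadcasts over the spatial dims
    return F.silu(x.float()).half() * torch.sigmoid(se)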
# kernel path: /tmp/torchinductor_shunting/uj/cujd4wpttqmud7zdg64vlcs4ef2kzahlsvrue6dsbdgqyeobae26.py
# Original ATen: aten.split_with_sizes
# aten.split_with_sizes => getitem_84
triton_poi_fused_split_with_sizes_89 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[33554432], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_split_with_sizes_89(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 16859136
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex % 131712
x1 = (xindex // 131712)
x2 = xindex
tmp0 = tl.load(in_ptr0 + (x0 + (263424*x1)), None).to(tl.float32)
tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp0, None)
def get_args():
arg_0 = rand_strided((128, 336, 28, 28), (263424, 784, 28, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 168, 28, 28), (131712, 784, 28, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_split_with_sizes_89.run(*args, 16859136, grid=grid(16859136), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_split_with_sizes_89.benchmark_all_configs(*args, 16859136, grid=grid(16859136))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/f3/cf3fhfcwg6xxca6ynhdijefpawpp34qudotfwxc7l4ytcq3zjpuo.py
# Original ATen: aten.split_with_sizes
# aten.split_with_sizes => getitem_85
triton_poi_fused_split_with_sizes_90 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[33554432], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_split_with_sizes_90(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 16859136
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex % 131712
x1 = (xindex // 131712)
x2 = xindex
tmp0 = tl.load(in_ptr0 + (131712 + x0 + (263424*x1)), None).to(tl.float32)
tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp0, None)
def get_args():
arg_0 = rand_strided((128, 336, 28, 28), (263424, 784, 28, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 168, 28, 28), (131712, 784, 28, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_split_with_sizes_90.run(*args, 16859136, grid=grid(16859136), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_split_with_sizes_90.benchmark_all_configs(*args, 16859136, grid=grid(16859136))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
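# NOTE (editorial): kernels 89 and 90 are the two branches of the same channel split;
# unlike kernel 80 there is no fused activation, only a strided copy that makes each
# half contiguous. A hedged sketch (name ours):
def _sketch_split_copy(x):
    a, b = x.split([168, 168], dim=1)        # getitem_84 / getitem_85
    return a.contiguous(), b.contiguous()    # each (128, 168, 28, 28)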
# kernel path: /tmp/torchinductor_shunting/mu/cmu55cboklppuar2bfoeffop2b46e55c3whe4q3ppvxcrz2secws.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_74
triton_poi_fused__to_copy_91 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[8192], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_91(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 4704
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((28, 168, 1, 1), (168, 1, 1, 1), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((28, 168, 1, 1), (168, 1, 1, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_91.run(*args, 4704, grid=grid(4704), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_91.benchmark_all_configs(*args, 4704, grid=grid(4704))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/p7/cp7uk2n6iplc4okgdyrcwjh6klja4hwa7hip53lwkxv2iafhns22.py
# Original ATen: aten.cat
# aten.cat => cat_8
triton_poi_fused_cat_92 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_cat_92(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 2809856
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x2 = xindex
x0 = xindex % 21952
x1 = (xindex // 21952)
tmp0 = tl.load(in_ptr0 + (x2), None).to(tl.float32)
tl.store(out_ptr0 + (x0 + (43904*x1) + tl.zeros([XBLOCK], tl.int32)), tmp0, None)
def get_args():
arg_0 = rand_strided((128, 28, 28, 28), (21952, 784, 28, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 28, 28, 28), (43904, 784, 28, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_cat_92.run(*args, 2809856, grid=grid(2809856), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_cat_92.benchmark_all_configs(*args, 2809856, grid=grid(2809856))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
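# NOTE (editorial): triton_poi_fused_cat_92 is one branch of a channel concat: it
# copies a 28-channel tensor into a destination whose batch stride is
# 43904 = 56 * 28 * 28, i.e. the cat is realized as strided writes into a
# preallocated 56-channel buffer. A hedged sketch (name ours):
def _sketch_cat_branch(src, out):
    # src: (128, 28, 28, 28); out: (128, 56, 28, 28)
    out[:, :28].copy_(src)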
# kernel path: /tmp/torchinductor_shunting/go/cgo47t53ri3q3acehvndov22nzujsisq6mopaip7tcxdzexe45pe.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.add
# aten._native_batch_norm_legit_functional => add_73, add_76, convert_element_type_76, convert_element_type_77, mul_106, mul_112, rsqrt_14, sub_14, var_mean_14
# aten.add => add_77
triton_poi_fused__native_batch_norm_legit_functional_add_93 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[8388608], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp16', 6: '*fp16', 7: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7), equal_to_1=())]})
@triton.jit
def triton_poi_fused__native_batch_norm_legit_functional_add_93(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 5619712
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x3 = xindex
x1 = (xindex // 784) % 56
tmp0 = tl.load(in_ptr0 + (x3), None).to(tl.float32)
tmp2 = tl.load(in_ptr1 + (x1), None)
tmp4 = tl.load(in_ptr2 + (x1), None)
tmp11 = tl.load(in_ptr3 + (x1), None)
tmp13 = tl.load(in_ptr4 + (x1), None)
tmp16 = tl.load(in_ptr5 + (x3), None).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
tmp3 = tmp1 - tmp2
tmp5 = 100352.0
tmp6 = tmp4 / tmp5
tmp7 = 1e-05
tmp8 = tmp6 + tmp7
tmp9 = tl.math.rsqrt(tmp8)
tmp10 = tmp3 * tmp9
tmp12 = tmp10 * tmp11
tmp14 = tmp12 + tmp13
tmp15 = tmp14.to(tl.float32)
tmp17 = tmp15 + tmp16
tl.store(out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp17, None)
def get_args():
arg_0 = rand_strided((128, 56, 28, 28), (43904, 784, 28, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((1, 56, 1, 1), (56, 1, 56, 56), device='cuda:0', dtype=torch.float32)
arg_2 = rand_strided((1, 56, 1, 1), (56, 1, 56, 56), device='cuda:0', dtype=torch.float32)
arg_3 = rand_strided((56,), (1,), device='cuda:0', dtype=torch.float32)
arg_4 = rand_strided((56,), (1,), device='cuda:0', dtype=torch.float32)
arg_5 = rand_strided((128, 56, 28, 28), (43904, 784, 28, 1), device='cuda:0', dtype=torch.float16)
arg_6 = rand_strided((128, 56, 28, 28), (43904, 784, 28, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__native_batch_norm_legit_functional_add_93.run(*args, 5619712, grid=grid(5619712), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__native_batch_norm_legit_functional_add_93.benchmark_all_configs(*args, 5619712, grid=grid(5619712))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
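# NOTE (editorial): triton_poi_fused__native_batch_norm_legit_functional_add_93 fuses
# the projection batch norm with the residual shortcut add. A hedged sketch
# (names ours):
def _sketch_bn_residual(x, mean, var_sum, weight, bias, shortcut):
    var = var_sum / (128 * 28 * 28)
    y = (x.float() - mean) * torch.rsqrt(var + 1e-05)
    y = y * weight.view(1, -1, 1, 1) + bias.view(1, -1, 1, 1)
    return y.half() + shortcut    # out_ptr0, fp16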
# kernel path: /tmp/torchinductor_shunting/cm/ccmet4jdw3a22ehivtutado4ilowri7xe4wvz55tt3jrgkttrcxr.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_122
triton_poi_fused__to_copy_94 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[32768], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_94(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 18816
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((336, 56, 1, 1), (56, 1, 1, 1), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((336, 56, 1, 1), (56, 1, 1, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_94.run(*args, 18816, grid=grid(18816), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_94.benchmark_all_configs(*args, 18816, grid=grid(18816))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/2z/c2zspsupbpoyi6v25rww6qizjecactzymkuzhmgwbksaajrrvebj.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_127
triton_poi_fused__to_copy_95 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[1024], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_95(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 1008
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((112, 1, 3, 3), (9, 9, 3, 1), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((112, 1, 3, 3), (9, 9, 3, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_95.run(*args, 1008, grid=grid(1008), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_95.benchmark_all_configs(*args, 1008, grid=grid(1008))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/vt/cvthhjpoxq63zz4mggqx2vqnhtco6vgp67jmycwwem5t4nuguc3d.py
# Original ATen: aten.split_with_sizes
# aten.split_with_sizes => getitem_125
triton_poi_fused_split_with_sizes_96 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[16777216], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_split_with_sizes_96(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 11239424
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex % 87808
x1 = (xindex // 87808)
x2 = xindex
tmp0 = tl.load(in_ptr0 + (x0 + (263424*x1)), None).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
tmp2 = tl.sigmoid(tmp1)
tmp3 = tmp1 * tmp2
tmp4 = tmp3.to(tl.float32)
tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp4, None)
def get_args():
arg_0 = rand_strided((128, 336, 28, 28), (263424, 784, 28, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 112, 28, 28), (87808, 784, 28, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_split_with_sizes_96.run(*args, 11239424, grid=grid(11239424), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_split_with_sizes_96.benchmark_all_configs(*args, 11239424, grid=grid(11239424))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/67/c67f43otxwovlhoxcmgghbvuamebcise6w4hyt6kxev6rjigyz6t.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_128
triton_poi_fused__to_copy_97 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4096], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_97(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 2800
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((112, 1, 5, 5), (25, 25, 5, 1), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((112, 1, 5, 5), (25, 25, 5, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_97.run(*args, 2800, grid=grid(2800), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_97.benchmark_all_configs(*args, 2800, grid=grid(2800))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/mp/cmpup3qxitd5cnywi6sc5ctsik3jls3ax4u6bwwg7l22muv3kqac.py
# Original ATen: aten.split_with_sizes
# aten.split_with_sizes => getitem_129
triton_poi_fused_split_with_sizes_98 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[16777216], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_split_with_sizes_98(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 11239424
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex % 87808
x1 = (xindex // 87808)
x2 = xindex
tmp0 = tl.load(in_ptr0 + (87808 + x0 + (263424*x1)), None).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
tmp2 = tl.sigmoid(tmp1)
tmp3 = tmp1 * tmp2
tmp4 = tmp3.to(tl.float32)
tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp4, None)
def get_args():
arg_0 = rand_strided((128, 336, 28, 28), (263424, 784, 28, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 112, 28, 28), (87808, 784, 28, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_split_with_sizes_98.run(*args, 11239424, grid=grid(11239424), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_split_with_sizes_98.benchmark_all_configs(*args, 11239424, grid=grid(11239424))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/hj/chjntoqpyuayr43gwoduaxa3dcaerfocp4evoj27g5z63hwjt7yd.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_129
triton_poi_fused__to_copy_99 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[8192], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_99(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 5488
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((112, 1, 7, 7), (49, 49, 7, 1), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((112, 1, 7, 7), (49, 49, 7, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_99.run(*args, 5488, grid=grid(5488), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_99.benchmark_all_configs(*args, 5488, grid=grid(5488))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/5y/c5yntu5zez2ni5e2qpl6ynioug5zthjmijm3yvywigtt4blzf6l6.py
# Original ATen: aten.split_with_sizes
# aten.split_with_sizes => getitem_133
triton_poi_fused_split_with_sizes_100 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[16777216], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_split_with_sizes_100(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 11239424
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex % 87808
x1 = (xindex // 87808)
x2 = xindex
tmp0 = tl.load(in_ptr0 + (175616 + x0 + (263424*x1)), None).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
tmp2 = tl.sigmoid(tmp1)
tmp3 = tmp1 * tmp2
tmp4 = tmp3.to(tl.float32)
tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp4, None)
def get_args():
arg_0 = rand_strided((128, 336, 28, 28), (263424, 784, 28, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 112, 28, 28), (87808, 784, 28, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_split_with_sizes_100.run(*args, 11239424, grid=grid(11239424), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_split_with_sizes_100.benchmark_all_configs(*args, 11239424, grid=grid(11239424))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
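# NOTE (editorial): kernels 96, 98 and 100 are the three branches of a
# split_with_sizes([112, 112, 112], dim=1) on a (128, 336, 28, 28) activation, each
# fused with SiLU (load offsets 0, 87808 and 175616, where 87808 = 112 * 28 * 28).
# A hedged sketch (name ours):
def _sketch_three_way_split_silu(x):
    import torch.nn.functional as F
    return tuple(F.silu(c).contiguous() for c in x.split([112, 112, 112], dim=1))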
# kernel path: /tmp/torchinductor_shunting/hi/chiefsb3dple2qr2zd5ph5efd2rhpwu722ib7qhnwdnwgxusl6o7.py
# Original ATen: aten.cat
# aten.cat => cat_15
triton_poi_fused_cat_101 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_cat_101(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 2809856
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x2 = xindex
x0 = xindex % 21952
x1 = (xindex // 21952)
tmp0 = tl.load(in_ptr0 + (x2), None).to(tl.float32)
tl.store(out_ptr0 + (x0 + (65856*x1) + tl.zeros([XBLOCK], tl.int32)), tmp0, None)
def get_args():
arg_0 = rand_strided((128, 112, 14, 14), (21952, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 112, 14, 14), (65856, 196, 14, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_cat_101.run(*args, 2809856, grid=grid(2809856), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_cat_101.benchmark_all_configs(*args, 2809856, grid=grid(2809856))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/hj/chj56tyq633bknygfo23skutbtotcilbxvxls3o3esvqkeosnqzu.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => add_116, add_117, add_118, convert_element_type_130, mul_172, mul_173, mul_174, mul_175, mul_176, rsqrt_22, squeeze_67, var_mean_22
triton_red_fused__native_batch_norm_legit_functional_102 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@reduction(
size_hints=[512, 32768],
reduction_hint=ReductionHint.INNER,
filename=__file__,
meta={'signature': {0: '*fp32', 1: '*fp16', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: 'i32', 9: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_102(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, out_ptr0, out_ptr1, out_ptr2, out_ptr3, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
xnumel = 336
rnumel = 25088
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
xmask = xindex < xnumel
rbase = tl.arange(0, RBLOCK)[None, :]
x0 = xindex
    _tmp2 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0  # accumulator for the per-channel sum (pass 1)
for roffset in range(0, rnumel, RBLOCK):
rindex = roffset + rbase
rmask = rindex < rnumel
r1 = rindex % 196
r2 = (rindex // 196)
tmp0 = tl.load(in_ptr0 + (r1 + (196*x0) + (65856*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
_tmp2 = tl.where(rmask & xmask, _tmp2 + tmp1, _tmp2)
tmp2 = tl.sum(_tmp2, 1)[:, None]
tmp7 = tl.load(in_ptr1 + (x0), xmask)
    tmp3 = 25088.0  # 128 * 14 * 14: elements per channel
    tmp4 = tmp2 / tmp3
    tmp5 = 0.1  # BN momentum: running = 0.1 * batch + 0.9 * running
tmp6 = tmp4 * tmp5
tmp8 = 0.9
tmp9 = tmp7 * tmp8
tmp10 = tmp6 + tmp9
tl.store(in_out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp4, xmask)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp10, xmask)
    _tmp15 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0  # accumulator for the squared deviations (pass 2)
for roffset in range(0, rnumel, RBLOCK):
rindex = roffset + rbase
rmask = rindex < rnumel
r1 = rindex % 196
r2 = (rindex // 196)
tmp11 = tl.load(in_ptr0 + (r1 + (196*x0) + (65856*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp12 = tmp11.to(tl.float32)
tmp13 = tmp12 - tmp4
tmp14 = tmp13 * tmp13
_tmp15 = tl.where(rmask & xmask, _tmp15 + tmp14, _tmp15)
tmp15 = tl.sum(_tmp15, 1)[:, None]
tl.store(out_ptr1 + x0, tmp15, xmask)
tmp25 = tl.load(in_ptr2 + (x0), xmask)
tmp16 = 25088.0
tmp17 = tmp15 / tmp16
tmp18 = 1e-05
tmp19 = tmp17 + tmp18
tmp20 = tl.math.rsqrt(tmp19)
    tmp21 = 1.0000398612827361  # 25088 / 25087: Bessel correction for the running variance
tmp22 = tmp17 * tmp21
tmp23 = 0.1
tmp24 = tmp22 * tmp23
tmp26 = 0.9
tmp27 = tmp25 * tmp26
tmp28 = tmp24 + tmp27
tl.store(out_ptr2 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp20, xmask)
tl.store(out_ptr3 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp28, xmask)
def get_args():
arg_0 = rand_strided((1, 336, 1, 1), (336, 1, 336, 336), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((128, 336, 14, 14), (65856, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_2 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32)
arg_3 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32)
arg_4 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32)
arg_5 = rand_strided((1, 336, 1, 1), (336, 1, 336, 336), device='cuda:0', dtype=torch.float32)
arg_6 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32)
arg_7 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_red_fused__native_batch_norm_legit_functional_102.run(*args, 336, 25088, grid=grid(336), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_red_fused__native_batch_norm_legit_functional_102.benchmark_all_configs(*args, 336, 25088, grid=grid(336))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
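# Editor's sketch (not generated output): an eager-mode equivalent of the fused
# batch-norm statistics kernel above, with illustrative names. It computes the
# per-channel mean, rsqrt(var + eps), and the momentum-0.1 running-stat updates
# that the kernel writes to in_out_ptr0 / out_ptr2 / out_ptr0 / out_ptr3.
def _sketch_bn_stats(x, running_mean, running_var, momentum=0.1, eps=1e-05):
    x = x.float()                               # stats are accumulated in fp32
    n = x.numel() // x.shape[1]                 # 128 * 14 * 14 = 25088
    mean = x.mean(dim=(0, 2, 3))
    var = x.var(dim=(0, 2, 3), unbiased=False)
    rstd = (var + eps).rsqrt()
    new_mean = momentum * mean + (1 - momentum) * running_mean
    new_var = momentum * var * (n / (n - 1)) + (1 - momentum) * running_var
    return mean, rstd, new_mean, new_var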
# kernel path: /tmp/torchinductor_shunting/zj/czji7shyj4kk6bl2rcdf5euq7ku4vpm3paub7sdz5rtqhc3o2xxw.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.mean, aten.silu
# aten._native_batch_norm_legit_functional => add_116, add_119, convert_element_type_130, convert_element_type_131, mul_171, mul_177, rsqrt_22, sub_22, var_mean_22
# aten.mean => mean_4
# aten.silu => convert_element_type_132, convert_element_type_133, mul_178, sigmoid_17
triton_per_fused__native_batch_norm_legit_functional_mean_silu_103 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import persistent_reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@persistent_reduction(
size_hints=[65536, 256],
reduction_hint=ReductionHint.INNER,
filename=__file__,
meta={'signature': {0: '*fp16', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp16', 6: '*fp16', 7: 'i32', 8: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7), equal_to_1=())]}
)
@triton.jit
def triton_per_fused__native_batch_norm_legit_functional_mean_silu_103(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, out_ptr0, out_ptr2, xnumel, rnumel, XBLOCK : tl.constexpr):
xnumel = 43008
rnumel = 196
RBLOCK: tl.constexpr = 256
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
xmask = xindex < xnumel
rindex = tl.arange(0, RBLOCK)[None, :]
rmask = rindex < rnumel
r2 = rindex
x3 = xindex
x0 = xindex % 336
tmp0 = tl.load(in_ptr0 + (r2 + (196*x3)), rmask, other=0).to(tl.float32)
tmp2 = tl.load(in_ptr1 + (x0), None)
tmp4 = tl.load(in_ptr2 + (x0), None)
tmp11 = tl.load(in_ptr3 + (x0), None)
tmp13 = tl.load(in_ptr4 + (x0), None)
tmp1 = tmp0.to(tl.float32)
tmp3 = tmp1 - tmp2
tmp5 = 25088.0
tmp6 = tmp4 / tmp5
tmp7 = 1e-05
tmp8 = tmp6 + tmp7
tmp9 = tl.math.rsqrt(tmp8)
tmp10 = tmp3 * tmp9
tmp12 = tmp10 * tmp11
tmp14 = tmp12 + tmp13
tmp15 = tmp14.to(tl.float32)
tmp16 = tmp15.to(tl.float32)
tmp17 = tl.sigmoid(tmp16)
tmp18 = tmp16 * tmp17
tmp19 = tmp18.to(tl.float32)
tmp20 = tmp19.to(tl.float32)
tmp22 = tl.where(rmask, tmp20, 0)
tmp23 = tl.sum(tmp22, 1)[:, None]
tmp24 = 196.0
tmp25 = tmp23 / tmp24
tmp26 = tmp25.to(tl.float32)
tl.store(out_ptr0 + (r2 + (196*x3) + tl.zeros([XBLOCK, RBLOCK], tl.int32)), tmp15, rmask)
tl.store(out_ptr2 + (x3 + tl.zeros([XBLOCK, 1], tl.int32)), tmp26, None)
def get_args():
arg_0 = rand_strided((128, 336, 14, 14), (65856, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((1, 336, 1, 1), (336, 1, 336, 336), device='cuda:0', dtype=torch.float32)
arg_2 = rand_strided((1, 336, 1, 1), (336, 1, 336, 336), device='cuda:0', dtype=torch.float32)
arg_3 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32)
arg_4 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32)
arg_5 = rand_strided((128, 336, 14, 14), (65856, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_6 = rand_strided((128, 336, 1, 1), (336, 1, 1, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_per_fused__native_batch_norm_legit_functional_mean_silu_103.run(*args, 43008, 196, grid=grid(43008), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_per_fused__native_batch_norm_legit_functional_mean_silu_103.benchmark_all_configs(*args, 43008, 196, grid=grid(43008))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
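# Editor's sketch: the persistent reduction above finishes batch norm with the
# stats from kernel 102, applies SiLU, and emits the per-(n, c) spatial average
# that feeds the squeeze-and-excitation branch. Hedged eager equivalent:
def _sketch_bn_silu_squeeze(x, mean, var_sum, gamma, beta, eps=1e-05):
    rstd = (var_sum / 25088.0 + eps).rsqrt()
    y = ((x.float() - mean) * rstd * gamma.view(1, -1, 1, 1)
         + beta.view(1, -1, 1, 1)).half()       # out_ptr0: normalized activation
    pooled = torch.nn.functional.silu(y.float()).mean(dim=(2, 3), keepdim=True)
    return y, pooled.half()                     # out_ptr2: SE "squeeze" input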
# kernel path: /tmp/torchinductor_shunting/vd/cvdkhtrpiolmsts2zro46mftpapkihzk36p6zhnrps3p4cq3krmr.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_135
triton_poi_fused__to_copy_104 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[8192], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_104(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 4704
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((14, 336, 1, 1), (336, 1, 1, 1), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((14, 336, 1, 1), (336, 1, 1, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_104.run(*args, 4704, grid=grid(4704), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_104.benchmark_all_configs(*args, 4704, grid=grid(4704))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
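# Editor's note: this and the other *_to_copy kernels (105, 107, 109, 115, ...)
# are plain dtype casts generated for aten._to_copy; each reads an fp32 weight
# or bias and writes the fp16 copy consumed by the half-precision convolutions,
# i.e. the eager-mode `w.half()`.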
# kernel path: /tmp/torchinductor_shunting/tr/ctr74i6h7fbsohivhhoxxpx2ftnqmpxexofhzvpjxs3qwsbp6lt2.py
# Original ATen: aten._to_copy, aten.convolution
# aten._to_copy => convert_element_type_134
# aten.convolution => convolution_51
triton_poi_fused__to_copy_convolution_105 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[16], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_convolution_105(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 14
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((14,), (1,), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((14,), (1,), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_convolution_105.run(*args, 14, grid=grid(14), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_convolution_105.benchmark_all_configs(*args, 14, grid=grid(14))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/ko/ckojvusnljgriux3ampytpmwhilvjdtesa7avk7kubgd6p562p34.py
# Original ATen: aten._to_copy, aten.convolution, aten.silu
# aten._to_copy => convert_element_type_134
# aten.convolution => convolution_51
# aten.silu => convert_element_type_136, convert_element_type_137, mul_179, sigmoid_18
triton_poi_fused__to_copy_convolution_silu_106 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[2048], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_convolution_silu_106(in_out_ptr0, in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 1792
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x2 = xindex
x0 = xindex % 14
tmp0 = tl.load(in_out_ptr0 + (x2), xmask).to(tl.float32)
tmp1 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
tmp2 = tmp0 + tmp1
tmp3 = tmp2.to(tl.float32)
tmp4 = tl.sigmoid(tmp3)
tmp5 = tmp3 * tmp4
tmp6 = tmp5.to(tl.float32)
tl.store(in_out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp2, xmask)
tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp6, xmask)
def get_args():
arg_0 = rand_strided((128, 14, 1, 1), (14, 1, 1, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((14,), (1,), device='cuda:0', dtype=torch.float16)
arg_2 = rand_strided((128, 14, 1, 1), (14, 1, 1, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1, arg_2,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_convolution_silu_106.run(*args, 1792, grid=grid(1792), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_convolution_silu_106.benchmark_all_configs(*args, 1792, grid=grid(1792))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
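# Editor's sketch: kernel 106 fuses the bias add of the 1x1 squeeze-and-
# excitation "reduce" convolution (336 -> 14 channels) with SiLU; the biased
# sum is written back in place (in_out_ptr0) and the activation to out_ptr0.
def _sketch_se_reduce_act(conv_out, bias):
    z = conv_out + bias.view(1, -1, 1, 1)
    return z, torch.nn.functional.silu(z.float()).half()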
# kernel path: /tmp/torchinductor_shunting/wd/cwdr2zuaja73on5ahrizfe4fw5rmvg66qvnjezuugxyt6mvgktuo.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_139
triton_poi_fused__to_copy_107 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[8192], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_107(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 4704
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((336, 14, 1, 1), (14, 1, 1, 1), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((336, 14, 1, 1), (14, 1, 1, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_107.run(*args, 4704, grid=grid(4704), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_107.benchmark_all_configs(*args, 4704, grid=grid(4704))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/i4/ci4cud6xszhpzrrtirjqfbkilqlzpxjq4f4rcbjmyxesmegdihcj.py
# Original ATen: aten.mul, aten.sigmoid, aten.silu
# aten.mul => mul_180
# aten.sigmoid => sigmoid_19
# aten.silu => convert_element_type_132, convert_element_type_133, mul_178, sigmoid_17
triton_poi_fused_mul_sigmoid_silu_108 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[16777216], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3), equal_to_1=())]})
@triton.jit
def triton_poi_fused_mul_sigmoid_silu_108(in_ptr0, in_ptr1, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 8429568
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x2 = xindex
x1 = (xindex // 196)
tmp0 = tl.load(in_ptr0 + (x2), None).to(tl.float32)
tmp5 = tl.load(in_ptr1 + (x1), None).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
tmp2 = tl.sigmoid(tmp1)
tmp3 = tmp1 * tmp2
tmp4 = tmp3.to(tl.float32)
tmp6 = tl.sigmoid(tmp5)
tmp7 = tmp4 * tmp6
tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp7, None)
def get_args():
arg_0 = rand_strided((128, 336, 14, 14), (65856, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 336, 1, 1), (336, 1, 1, 1), device='cuda:0', dtype=torch.float16)
arg_2 = rand_strided((128, 336, 14, 14), (65856, 196, 14, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1, arg_2,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_mul_sigmoid_silu_108.run(*args, 8429568, grid=grid(8429568), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_mul_sigmoid_silu_108.benchmark_all_configs(*args, 8429568, grid=grid(8429568))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
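# Editor's sketch: kernel 108 is the squeeze-and-excitation "excite" step: it
# recomputes SiLU on the batch-norm output and scales it channel-wise by the
# sigmoid of the 14 -> 336 gate. Hedged equivalent:
def _sketch_se_scale(bn_out, gate):
    act = torch.nn.functional.silu(bn_out.float()).half()
    return act * torch.sigmoid(gate)            # gate broadcasts over H and W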
# kernel path: /tmp/torchinductor_shunting/am/camchutakzc7ufjmewydlcbxzwspii4jktvvytxzvdtkijmunoki.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_140
triton_poi_fused__to_copy_109 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[65536], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_109(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 34944
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((104, 336, 1, 1), (336, 1, 1, 1), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((104, 336, 1, 1), (336, 1, 1, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_109.run(*args, 34944, grid=grid(34944), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_109.benchmark_all_configs(*args, 34944, grid=grid(34944))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/yx/cyxdjlki3rk4s5ktbskifaxqivcqtfxhzou2fgeig544h2budbbn.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => convert_element_type_141, var_mean_23
triton_red_fused__native_batch_norm_legit_functional_110 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@reduction(
size_hints=[512, 8192],
reduction_hint=ReductionHint.INNER,
filename=__file__,
meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32', 3: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_110(in_ptr0, out_ptr0, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
xnumel = 416
rnumel = 6272
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
xmask = xindex < xnumel
rbase = tl.arange(0, RBLOCK)[None, :]
x0 = xindex % 104
x1 = (xindex // 104)
_tmp2 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
x3 = xindex
for roffset in range(0, rnumel, RBLOCK):
rindex = roffset + rbase
rmask = rindex < rnumel
r2 = rindex
tmp0 = tl.load(in_ptr0 + ((196*x0) + (20384*(r2 // 196)) + (652288*x1) + (r2 % 196)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
_tmp2 = tl.where(rmask & xmask, _tmp2 + tmp1, _tmp2)
tmp2 = tl.sum(_tmp2, 1)[:, None]
tl.store(out_ptr0 + x3, tmp2, xmask)
def get_args():
arg_0 = rand_strided((128, 104, 14, 14), (20384, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((1, 104, 1, 1, 4), (416, 1, 416, 416, 104), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_red_fused__native_batch_norm_legit_functional_110.run(*args, 416, 6272, grid=grid(416), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_red_fused__native_batch_norm_legit_functional_110.benchmark_all_configs(*args, 416, 6272, grid=grid(416))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
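# Editor's note: kernels 110-113 compute the same statistics as kernel 102 but
# as a two-stage split reduction: stage one (110 for the sums, 112 for the
# squared deviations) emits 4 partial results per channel (xnumel = 104 * 4),
# each covering 32 of the 128 samples; stage two (111/113) sums the partials
# and applies the momentum and rsqrt math. Hedged sketch of stage one:
def _sketch_partial_channel_sums(x):
    # (128, 104, 14, 14) -> (4, 104): one partial sum per 32-sample slice
    return x.float().reshape(4, 32, 104, 196).sum(dim=(1, 3))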
# kernel path: /tmp/torchinductor_shunting/ns/cnsyz4u2t4xdpizytxz7kuybbnglpm544k5kmwi5e6mw37eoeeyp.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => add_122, convert_element_type_141, mul_182, mul_183, var_mean_23
triton_per_fused__native_batch_norm_legit_functional_111 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import persistent_reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@persistent_reduction(
size_hints=[128, 4],
reduction_hint=ReductionHint.INNER,
filename=__file__,
meta={'signature': {0: '*fp32', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: 'i32', 5: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3), equal_to_1=())]}
)
@triton.jit
def triton_per_fused__native_batch_norm_legit_functional_111(in_out_ptr0, in_ptr0, in_ptr1, out_ptr0, xnumel, rnumel, XBLOCK : tl.constexpr):
xnumel = 104
rnumel = 4
RBLOCK: tl.constexpr = 4
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
xmask = xindex < xnumel
rindex = tl.arange(0, RBLOCK)[None, :]
rmask = rindex < rnumel
r1 = rindex
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0 + (104*r1)), rmask & xmask, other=0)
tmp8 = tl.load(in_ptr1 + (x0), xmask)
tmp2 = tl.where(rmask & xmask, tmp0, 0)
tmp3 = tl.sum(tmp2, 1)[:, None]
tmp4 = 25088.0
tmp5 = tmp3 / tmp4
tmp6 = 0.1
tmp7 = tmp5 * tmp6
tmp9 = 0.9
tmp10 = tmp8 * tmp9
tmp11 = tmp7 + tmp10
tl.store(in_out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp5, xmask)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp11, xmask)
def get_args():
arg_0 = rand_strided((1, 104, 1, 1), (104, 1, 104, 104), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((1, 104, 1, 1, 4), (416, 1, 416, 416, 104), device='cuda:0', dtype=torch.float32)
arg_2 = rand_strided((104,), (1,), device='cuda:0', dtype=torch.float32)
arg_3 = rand_strided((104,), (1,), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1, arg_2, arg_3,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_per_fused__native_batch_norm_legit_functional_111.run(*args, 104, 4, grid=grid(104), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_per_fused__native_batch_norm_legit_functional_111.benchmark_all_configs(*args, 104, 4, grid=grid(104))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/vx/cvx6izhz5yqxyjo6cfzkguw47brvifvvxn3jp6fmmqpamdr6rdo7.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => convert_element_type_141, var_mean_23
triton_red_fused__native_batch_norm_legit_functional_112 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@reduction(
size_hints=[512, 8192],
reduction_hint=ReductionHint.INNER,
filename=__file__,
meta={'signature': {0: '*fp16', 1: '*fp32', 2: '*fp32', 3: 'i32', 4: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_112(in_ptr0, in_ptr1, out_ptr0, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
xnumel = 416
rnumel = 6272
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
xmask = xindex < xnumel
rbase = tl.arange(0, RBLOCK)[None, :]
x0 = xindex % 104
x1 = (xindex // 104)
tmp2 = tl.load(in_ptr1 + (x0), xmask)
_tmp5 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
x3 = xindex
for roffset in range(0, rnumel, RBLOCK):
rindex = roffset + rbase
rmask = rindex < rnumel
r2 = rindex
tmp0 = tl.load(in_ptr0 + ((196*x0) + (20384*(r2 // 196)) + (652288*x1) + (r2 % 196)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
tmp3 = tmp1 - tmp2
tmp4 = tmp3 * tmp3
_tmp5 = tl.where(rmask & xmask, _tmp5 + tmp4, _tmp5)
tmp5 = tl.sum(_tmp5, 1)[:, None]
tl.store(out_ptr0 + x3, tmp5, xmask)
def get_args():
arg_0 = rand_strided((128, 104, 14, 14), (20384, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((1, 104, 1, 1), (104, 1, 104, 104), device='cuda:0', dtype=torch.float32)
arg_2 = rand_strided((1, 104, 1, 1, 4), (416, 1, 416, 416, 104), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1, arg_2,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_red_fused__native_batch_norm_legit_functional_112.run(*args, 416, 6272, grid=grid(416), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_red_fused__native_batch_norm_legit_functional_112.benchmark_all_configs(*args, 416, 6272, grid=grid(416))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/cy/ccy7dhumtl6nx7vjrgetby6qbjqdcsh4fen2e264olgvvilit6pl.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => add_121, add_123, convert_element_type_141, mul_184, mul_185, mul_186, rsqrt_23, squeeze_70, var_mean_23
triton_per_fused__native_batch_norm_legit_functional_113 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import persistent_reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@persistent_reduction(
size_hints=[128, 4],
reduction_hint=ReductionHint.INNER,
filename=__file__,
meta={'signature': {0: '*fp32', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: 'i32', 6: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4), equal_to_1=())]}
)
@triton.jit
def triton_per_fused__native_batch_norm_legit_functional_113(in_ptr0, in_ptr1, out_ptr0, out_ptr1, out_ptr2, xnumel, rnumel, XBLOCK : tl.constexpr):
xnumel = 104
rnumel = 4
RBLOCK: tl.constexpr = 4
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
xmask = xindex < xnumel
rindex = tl.arange(0, RBLOCK)[None, :]
rmask = rindex < rnumel
r1 = rindex
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0 + (104*r1)), rmask & xmask, other=0)
tmp13 = tl.load(in_ptr1 + (x0), xmask)
tmp2 = tl.where(rmask & xmask, tmp0, 0)
tmp3 = tl.sum(tmp2, 1)[:, None]
tmp4 = 25088.0
tmp5 = tmp3 / tmp4
tmp6 = 1e-05
tmp7 = tmp5 + tmp6
tmp8 = tl.math.rsqrt(tmp7)
tmp9 = 1.0000398612827361
tmp10 = tmp5 * tmp9
tmp11 = 0.1
tmp12 = tmp10 * tmp11
tmp14 = 0.9
tmp15 = tmp13 * tmp14
tmp16 = tmp12 + tmp15
tl.store(out_ptr1 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp8, xmask)
tl.store(out_ptr2 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp16, xmask)
tl.store(out_ptr0 + x0, tmp3, xmask)
def get_args():
arg_0 = rand_strided((1, 104, 1, 1, 4), (416, 1, 416, 416, 104), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((104,), (1,), device='cuda:0', dtype=torch.float32)
arg_2 = rand_strided((1, 104, 1, 1), (104, 1, 104, 104), device='cuda:0', dtype=torch.float32)
arg_3 = rand_strided((104,), (1,), device='cuda:0', dtype=torch.float32)
arg_4 = rand_strided((104,), (1,), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1, arg_2, arg_3, arg_4,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_per_fused__native_batch_norm_legit_functional_113.run(*args, 104, 4, grid=grid(104), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_per_fused__native_batch_norm_legit_functional_113.benchmark_all_configs(*args, 104, 4, grid=grid(104))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/xg/cxgqusw3xzpsm5ih6qyxh7qdzmvcnjj2evihvlzncrorcyi7altv.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => add_121, add_124, convert_element_type_141, convert_element_type_142, mul_181, mul_187, rsqrt_23, sub_23, var_mean_23
triton_poi_fused__native_batch_norm_legit_functional_114 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp16', 6: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6), equal_to_1=())]})
@triton.jit
def triton_poi_fused__native_batch_norm_legit_functional_114(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 2609152
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x3 = xindex
x1 = (xindex // 196) % 104
tmp0 = tl.load(in_ptr0 + (x3), None).to(tl.float32)
tmp2 = tl.load(in_ptr1 + (x1), None)
tmp4 = tl.load(in_ptr2 + (x1), None)
tmp11 = tl.load(in_ptr3 + (x1), None)
tmp13 = tl.load(in_ptr4 + (x1), None)
tmp1 = tmp0.to(tl.float32)
tmp3 = tmp1 - tmp2
tmp5 = 25088.0
tmp6 = tmp4 / tmp5
tmp7 = 1e-05
tmp8 = tmp6 + tmp7
tmp9 = tl.math.rsqrt(tmp8)
tmp10 = tmp3 * tmp9
tmp12 = tmp10 * tmp11
tmp14 = tmp12 + tmp13
tmp15 = tmp14.to(tl.float32)
tl.store(out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp15, None)
def get_args():
arg_0 = rand_strided((128, 104, 14, 14), (20384, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((1, 104, 1, 1), (104, 1, 104, 104), device='cuda:0', dtype=torch.float32)
arg_2 = rand_strided((1, 104, 1, 1), (104, 1, 104, 104), device='cuda:0', dtype=torch.float32)
arg_3 = rand_strided((104,), (1,), device='cuda:0', dtype=torch.float32)
arg_4 = rand_strided((104,), (1,), device='cuda:0', dtype=torch.float32)
arg_5 = rand_strided((128, 104, 14, 14), (20384, 196, 14, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__native_batch_norm_legit_functional_114.run(*args, 2609152, grid=grid(2609152), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__native_batch_norm_legit_functional_114.benchmark_all_configs(*args, 2609152, grid=grid(2609152))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
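# Editor's sketch: kernel 114 is the pointwise "apply" half of batch norm,
# consuming the stats produced by kernels 110-113. Hedged equivalent:
def _sketch_bn_apply(x, mean, var_sum, gamma, beta, eps=1e-05):
    rstd = (var_sum / 25088.0 + eps).rsqrt()
    return ((x.float() - mean) * rstd * gamma.view(1, -1, 1, 1)
            + beta.view(1, -1, 1, 1)).half()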
# kernel path: /tmp/torchinductor_shunting/hr/chrygikuwep25dgozsipotfnih6lajyye2txuo4r7put7ywkdk3t.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_143
triton_poi_fused__to_copy_115 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[16384], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_115(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 16224
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((312, 52, 1, 1), (52, 1, 1, 1), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((312, 52, 1, 1), (52, 1, 1, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_115.run(*args, 16224, grid=grid(16224), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_115.benchmark_all_configs(*args, 16224, grid=grid(16224))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/ca/ccaahgmwlqiraz4ivxpktecmbqhvntosz4cdfe5ghfjm4pe6pws6.py
# Original ATen: aten.cat
# aten.cat => cat_16
triton_poi_fused_cat_116 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[8388608], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_cat_116(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 7827456
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x2 = xindex
x0 = xindex % 61152
x1 = (xindex // 61152)
tmp0 = tl.load(in_ptr0 + (x2), None).to(tl.float32)
tl.store(out_ptr0 + (x0 + (122304*x1) + tl.zeros([XBLOCK], tl.int32)), tmp0, None)
def get_args():
arg_0 = rand_strided((128, 312, 14, 14), (61152, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 312, 14, 14), (122304, 196, 14, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_cat_116.run(*args, 7827456, grid=grid(7827456), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_cat_116.benchmark_all_configs(*args, 7827456, grid=grid(7827456))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
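# Editor's note: kernel 116 is one half of aten.cat along the channel axis: it
# copies a (128, 312, 14, 14) tensor into channels 0:312 of the 624-channel
# destination (row stride 122304 = 624 * 196); the remaining channels are
# presumably filled by a matching copy elsewhere in the file.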
# kernel path: /tmp/torchinductor_shunting/j3/cj3tf4apq2n25yzonidfbwyd3cn3jiyqodrw5hjo73xovhn5czvu.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => add_126, add_127, add_128, convert_element_type_145, mul_189, mul_190, mul_191, mul_192, mul_193, rsqrt_24, squeeze_73, var_mean_24
triton_red_fused__native_batch_norm_legit_functional_117 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@reduction(
size_hints=[1024, 32768],
reduction_hint=ReductionHint.INNER,
filename=__file__,
meta={'signature': {0: '*fp32', 1: '*fp16', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: 'i32', 9: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_117(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, out_ptr0, out_ptr1, out_ptr2, out_ptr3, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
xnumel = 624
rnumel = 25088
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
xmask = xindex < xnumel
rbase = tl.arange(0, RBLOCK)[None, :]
x0 = xindex
_tmp2 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
for roffset in range(0, rnumel, RBLOCK):
rindex = roffset + rbase
rmask = rindex < rnumel
r1 = rindex % 196
r2 = (rindex // 196)
tmp0 = tl.load(in_ptr0 + (r1 + (196*x0) + (122304*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
_tmp2 = tl.where(rmask & xmask, _tmp2 + tmp1, _tmp2)
tmp2 = tl.sum(_tmp2, 1)[:, None]
tmp7 = tl.load(in_ptr1 + (x0), xmask)
tmp3 = 25088.0
tmp4 = tmp2 / tmp3
tmp5 = 0.1
tmp6 = tmp4 * tmp5
tmp8 = 0.9
tmp9 = tmp7 * tmp8
tmp10 = tmp6 + tmp9
tl.store(in_out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp4, xmask)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp10, xmask)
_tmp15 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
for roffset in range(0, rnumel, RBLOCK):
rindex = roffset + rbase
rmask = rindex < rnumel
r1 = rindex % 196
r2 = (rindex // 196)
tmp11 = tl.load(in_ptr0 + (r1 + (196*x0) + (122304*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp12 = tmp11.to(tl.float32)
tmp13 = tmp12 - tmp4
tmp14 = tmp13 * tmp13
_tmp15 = tl.where(rmask & xmask, _tmp15 + tmp14, _tmp15)
tmp15 = tl.sum(_tmp15, 1)[:, None]
tl.store(out_ptr1 + x0, tmp15, xmask)
tmp25 = tl.load(in_ptr2 + (x0), xmask)
tmp16 = 25088.0
tmp17 = tmp15 / tmp16
tmp18 = 1e-05
tmp19 = tmp17 + tmp18
tmp20 = tl.math.rsqrt(tmp19)
tmp21 = 1.0000398612827361
tmp22 = tmp17 * tmp21
tmp23 = 0.1
tmp24 = tmp22 * tmp23
tmp26 = 0.9
tmp27 = tmp25 * tmp26
tmp28 = tmp24 + tmp27
tl.store(out_ptr2 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp20, xmask)
tl.store(out_ptr3 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp28, xmask)
def get_args():
arg_0 = rand_strided((1, 624, 1, 1), (624, 1, 624, 624), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_2 = rand_strided((624,), (1,), device='cuda:0', dtype=torch.float32)
arg_3 = rand_strided((624,), (1,), device='cuda:0', dtype=torch.float32)
arg_4 = rand_strided((624,), (1,), device='cuda:0', dtype=torch.float32)
arg_5 = rand_strided((1, 624, 1, 1), (624, 1, 624, 624), device='cuda:0', dtype=torch.float32)
arg_6 = rand_strided((624,), (1,), device='cuda:0', dtype=torch.float32)
arg_7 = rand_strided((624,), (1,), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_red_fused__native_batch_norm_legit_functional_117.run(*args, 624, 25088, grid=grid(624), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_red_fused__native_batch_norm_legit_functional_117.benchmark_all_configs(*args, 624, 25088, grid=grid(624))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
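# Editor's note: kernel 117 repeats the kernel-102 stats pattern (mean,
# variance, rsqrt and momentum-0.1 running updates in one pass) for the
# 624-channel concatenated activation.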
# kernel path: /tmp/torchinductor_shunting/yk/cyk2njvgthyqkaax3hrogqqjnq7t4z4h2bdza6wc2nyqhlzwiqur.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.add, aten.clone, aten.fill, aten.mul, aten.sigmoid, aten.sub
# aten._native_batch_norm_legit_functional => add_126, add_129, convert_element_type_145, convert_element_type_146, mul_188, mul_194, rsqrt_24, sub_24, var_mean_24
# aten.add => add_355
# aten.clone => clone_15
# aten.fill => full_like_32
# aten.mul => mul_907, mul_908
# aten.sigmoid => sigmoid_96
# aten.sub => sub_233
triton_poi_fused__native_batch_norm_legit_functional_add_clone_fill_mul_sigmoid_sub_118 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[16777216], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp16', 6: '*fp16', 7: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7), equal_to_1=())]})
@triton.jit
def triton_poi_fused__native_batch_norm_legit_functional_add_clone_fill_mul_sigmoid_sub_118(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, out_ptr0, out_ptr1, xnumel, XBLOCK : tl.constexpr):
xnumel = 15654912
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x3 = xindex
x1 = (xindex // 196) % 624
tmp0 = tl.load(in_ptr0 + (x3), None).to(tl.float32)
tmp2 = tl.load(in_ptr1 + (x1), None)
tmp4 = tl.load(in_ptr2 + (x1), None)
tmp11 = tl.load(in_ptr3 + (x1), None)
tmp13 = tl.load(in_ptr4 + (x1), None)
tmp1 = tmp0.to(tl.float32)
tmp3 = tmp1 - tmp2
tmp5 = 25088.0
tmp6 = tmp4 / tmp5
tmp7 = 1e-05
tmp8 = tmp6 + tmp7
tmp9 = tl.math.rsqrt(tmp8)
tmp10 = tmp3 * tmp9
tmp12 = tmp10 * tmp11
tmp14 = tmp12 + tmp13
tmp15 = tmp14.to(tl.float32)
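# the block below evaluates sigmoid(x) * (1 + x * (1 - sigmoid(x))), i.e.
# d/dx silu(x), saved for the backward pass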
tmp16 = tl.sigmoid(tmp15)
tmp17 = 1.0
tmp18 = tmp17 - tmp16
tmp19 = tmp15 * tmp18
tmp20 = tmp19 + tmp17
tmp21 = tmp16 * tmp20
tl.store(out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp15, None)
tl.store(out_ptr1 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp21, None)
def get_args():
arg_0 = rand_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((1, 624, 1, 1), (624, 1, 624, 624), device='cuda:0', dtype=torch.float32)
arg_2 = rand_strided((1, 624, 1, 1), (624, 1, 624, 624), device='cuda:0', dtype=torch.float32)
arg_3 = rand_strided((624,), (1,), device='cuda:0', dtype=torch.float32)
arg_4 = rand_strided((624,), (1,), device='cuda:0', dtype=torch.float32)
arg_5 = rand_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_6 = rand_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__native_batch_norm_legit_functional_add_clone_fill_mul_sigmoid_sub_118.run(*args, 15654912, grid=grid(15654912), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__native_batch_norm_legit_functional_add_clone_fill_mul_sigmoid_sub_118.benchmark_all_configs(*args, 15654912, grid=grid(15654912))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
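# Editor's sketch: kernel 118 fuses the batch-norm "apply" step with the SiLU
# derivative needed later: out_ptr0 receives the normalized pre-activation and
# out_ptr1 its silu'(x). Hedged equivalent of the derivative:
def _sketch_silu_grad(x_hat):
    s = torch.sigmoid(x_hat)
    return s * (1 + x_hat * (1 - s))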
# kernel path: /tmp/torchinductor_shunting/h4/ch4fzflvelcyupsdemcjv4coogzqeqjolfo3rcegkby4xe2z7cgp.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_149
triton_poi_fused__to_copy_119 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[2048], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_119(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 1404
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((156, 1, 3, 3), (9, 9, 3, 1), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((156, 1, 3, 3), (9, 9, 3, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_119.run(*args, 1404, grid=grid(1404), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_119.benchmark_all_configs(*args, 1404, grid=grid(1404))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/ee/ceehri7sldlkmfyc53ikeryr64g77qkjawr2ddxdwntxstm4glbe.py
# Original ATen: aten.split_with_sizes
# aten.split_with_sizes => getitem_146
triton_poi_fused_split_with_sizes_120 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_split_with_sizes_120(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 3913728
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex % 30576
x1 = (xindex // 30576)
x2 = xindex
tmp0 = tl.load(in_ptr0 + (x0 + (122304*x1)), None).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
tmp2 = tl.sigmoid(tmp1)
tmp3 = tmp1 * tmp2
tmp4 = tmp3.to(tl.float32)
tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp4, None)
def get_args():
arg_0 = rand_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 156, 14, 14), (30576, 196, 14, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_split_with_sizes_120.run(*args, 3913728, grid=grid(3913728), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_split_with_sizes_120.benchmark_all_configs(*args, 3913728, grid=grid(3913728))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
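# Editor's note: kernels 120, 122 and 124 are aten.split_with_sizes fused with
# SiLU: the 624-channel activation is carved into 156-channel groups (read
# offsets 0, 30576 = 156*196, 61152 = 312*196) and each group gets its own
# depthwise convolution (the 3x3 / 5x5 / 7x7 weight casts in kernels 119, 121
# and 123 - a MixConv-style block). Hedged eager equivalent:
def _sketch_split_silu(x):
    groups = torch.split(x, 156, dim=1)         # four 156-channel groups
    return [torch.nn.functional.silu(g.float()).half() for g in groups]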
# kernel path: /tmp/torchinductor_shunting/3q/c3qy34jaqk4nnzsuou7s5dwcbuoi5kctwalhcspkfthqj2nvhjzq.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_150
triton_poi_fused__to_copy_121 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4096], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_121(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 3900
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((156, 1, 5, 5), (25, 25, 5, 1), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((156, 1, 5, 5), (25, 25, 5, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_121.run(*args, 3900, grid=grid(3900), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_121.benchmark_all_configs(*args, 3900, grid=grid(3900))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/d2/cd2noiyayhlfce4vrflvnr5n745imwlcczcywnnle3qtso27avlv.py
# Original ATen: aten.split_with_sizes
# aten.split_with_sizes => getitem_151
triton_poi_fused_split_with_sizes_122 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_split_with_sizes_122(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 3913728
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex % 30576
x1 = (xindex // 30576)
x2 = xindex
tmp0 = tl.load(in_ptr0 + (30576 + x0 + (122304*x1)), None).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
tmp2 = tl.sigmoid(tmp1)
tmp3 = tmp1 * tmp2
tmp4 = tmp3.to(tl.float32)
tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp4, None)
def get_args():
arg_0 = rand_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 156, 14, 14), (30576, 196, 14, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_split_with_sizes_122.run(*args, 3913728, grid=grid(3913728), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_split_with_sizes_122.benchmark_all_configs(*args, 3913728, grid=grid(3913728))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/on/conrh3mxhtsmg5nvtksxdme55y6bhfvwrrnpp6mhmwmulosn5uyv.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_151
triton_poi_fused__to_copy_123 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[8192], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_123(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 7644
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((156, 1, 7, 7), (49, 49, 7, 1), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((156, 1, 7, 7), (49, 49, 7, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_123.run(*args, 7644, grid=grid(7644), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_123.benchmark_all_configs(*args, 7644, grid=grid(7644))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/nc/cncwy3tqtyzgd3muyfflcfni7qls4ttqyb4sac6cmg6znll2bcqu.py
# Original ATen: aten.split_with_sizes
# aten.split_with_sizes => getitem_156
triton_poi_fused_split_with_sizes_124 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_split_with_sizes_124(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 3913728
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex % 30576
x1 = (xindex // 30576)
x2 = xindex
tmp0 = tl.load(in_ptr0 + (61152 + x0 + (122304*x1)), None).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
tmp2 = tl.sigmoid(tmp1)
tmp3 = tmp1 * tmp2
tmp4 = tmp3.to(tl.float32)
tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp4, None)
def get_args():
arg_0 = rand_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 156, 14, 14), (30576, 196, 14, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_split_with_sizes_124.run(*args, 3913728, grid=grid(3913728), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_split_with_sizes_124.benchmark_all_configs(*args, 3913728, grid=grid(3913728))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/yy/cyyxm3mybqiv5z5gofz4uhjttc6in5hwbrcbzt4xlp4z5lqyiqvr.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_152
triton_poi_fused__to_copy_125 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[16384], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_125(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 12636
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((156, 1, 9, 9), (81, 81, 9, 1), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((156, 1, 9, 9), (81, 81, 9, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_125.run(*args, 12636, grid=grid(12636), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_125.benchmark_all_configs(*args, 12636, grid=grid(12636))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/ta/ctar2nhvp2hnx6l3xel6gxwicmmw5nlhbt33l63xfadahaihh7qo.py
# Original ATen: aten.split_with_sizes
# aten.split_with_sizes => getitem_161
triton_poi_fused_split_with_sizes_126 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_split_with_sizes_126(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 3913728
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex % 30576
x1 = (xindex // 30576)
x2 = xindex
tmp0 = tl.load(in_ptr0 + (91728 + x0 + (122304*x1)), None).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
tmp2 = tl.sigmoid(tmp1)
tmp3 = tmp1 * tmp2
tmp4 = tmp3.to(tl.float32)
tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp4, None)
def get_args():
arg_0 = rand_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 156, 14, 14), (30576, 196, 14, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_split_with_sizes_126.run(*args, 3913728, grid=grid(3913728), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_split_with_sizes_126.benchmark_all_configs(*args, 3913728, grid=grid(3913728))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/aa/caaej5ipewhg7u3hng5mw3omnmn72k6eam7aoq4mbk5yqdeq7wwb.py
# Original ATen: aten.cat
# aten.cat => cat_17
triton_poi_fused_cat_127 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_cat_127(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 3913728
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x2 = xindex
x0 = xindex % 30576
x1 = (xindex // 30576)
tmp0 = tl.load(in_ptr0 + (x2), None).to(tl.float32)
tl.store(out_ptr0 + (x0 + (122304*x1) + tl.zeros([XBLOCK], tl.int32)), tmp0, None)
def get_args():
arg_0 = rand_strided((128, 156, 14, 14), (30576, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 156, 14, 14), (122304, 196, 14, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_cat_127.run(*args, 3913728, grid=grid(3913728), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_cat_127.benchmark_all_configs(*args, 3913728, grid=grid(3913728))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
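# Hand-written illustration (not Inductor output): the cat kernels carry no
# arithmetic; each copies a contiguous (128, 156, 14, 14) chunk into its slot
# of a (128, 624, 14, 14) buffer (note the output batch stride 122304 =
# 624*196 in get_args). Taken together, such copies are equivalent to:
def _reference_cat(chunks):
    # chunks: four (128, 156, 14, 14) fp16 tensors -> (128, 624, 14, 14)
    return torch.cat(chunks, dim=1)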
# kernel path: /tmp/torchinductor_shunting/xm/cxmjpqggq2cokpfycochmecokolhxsamegk6vtmem3x47oz5ss3r.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.mean, aten.silu
# aten._native_batch_norm_legit_functional => add_131, add_134, convert_element_type_153, convert_element_type_154, mul_196, mul_202, rsqrt_25, sub_25, var_mean_25
# aten.mean => mean_5
# aten.silu => convert_element_type_155, convert_element_type_156, mul_203, sigmoid_21
triton_per_fused__native_batch_norm_legit_functional_mean_silu_128 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import persistent_reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@persistent_reduction(
size_hints=[131072, 256],
reduction_hint=ReductionHint.INNER,
filename=__file__,
meta={'signature': {0: '*fp16', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp16', 6: '*fp16', 7: 'i32', 8: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7), equal_to_1=())]}
)
@triton.jit
def triton_per_fused__native_batch_norm_legit_functional_mean_silu_128(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, out_ptr0, out_ptr2, xnumel, rnumel, XBLOCK : tl.constexpr):
xnumel = 79872
rnumel = 196
RBLOCK: tl.constexpr = 256
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
xmask = xindex < xnumel
rindex = tl.arange(0, RBLOCK)[None, :]
rmask = rindex < rnumel
r2 = rindex
x3 = xindex
x0 = xindex % 624
tmp0 = tl.load(in_ptr0 + (r2 + (196*x3)), rmask, other=0).to(tl.float32)
tmp2 = tl.load(in_ptr1 + (x0), None)
tmp4 = tl.load(in_ptr2 + (x0), None)
tmp11 = tl.load(in_ptr3 + (x0), None)
tmp13 = tl.load(in_ptr4 + (x0), None)
tmp1 = tmp0.to(tl.float32)
tmp3 = tmp1 - tmp2
tmp5 = 25088.0
tmp6 = tmp4 / tmp5
tmp7 = 1e-05
tmp8 = tmp6 + tmp7
tmp9 = tl.math.rsqrt(tmp8)
tmp10 = tmp3 * tmp9
tmp12 = tmp10 * tmp11
tmp14 = tmp12 + tmp13
tmp15 = tmp14.to(tl.float32)
tmp16 = tmp15.to(tl.float32)
tmp17 = tl.sigmoid(tmp16)
tmp18 = tmp16 * tmp17
tmp19 = tmp18.to(tl.float32)
tmp20 = tmp19.to(tl.float32)
tmp22 = tl.where(rmask, tmp20, 0)
tmp23 = tl.sum(tmp22, 1)[:, None]
tmp24 = 196.0
tmp25 = tmp23 / tmp24
tmp26 = tmp25.to(tl.float32)
tl.store(out_ptr0 + (r2 + (196*x3) + tl.zeros([XBLOCK, RBLOCK], tl.int32)), tmp15, rmask)
tl.store(out_ptr2 + (x3 + tl.zeros([XBLOCK, 1], tl.int32)), tmp26, None)
def get_args():
arg_0 = rand_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((1, 624, 1, 1), (624, 1, 624, 624), device='cuda:0', dtype=torch.float32)
arg_2 = rand_strided((1, 624, 1, 1), (624, 1, 624, 624), device='cuda:0', dtype=torch.float32)
arg_3 = rand_strided((624,), (1,), device='cuda:0', dtype=torch.float32)
arg_4 = rand_strided((624,), (1,), device='cuda:0', dtype=torch.float32)
arg_5 = rand_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_6 = rand_strided((128, 624, 1, 1), (624, 1, 1, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_per_fused__native_batch_norm_legit_functional_mean_silu_128.run(*args, 79872, 196, grid=grid(79872), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_per_fused__native_batch_norm_legit_functional_mean_silu_128.benchmark_all_configs(*args, 79872, 196, grid=grid(79872))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
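# Hand-written illustration (not Inductor output): this persistent reduction
# normalizes with a precomputed per-channel mean and a variance *sum* (hence
# the division by 25088 = 128*14*14), applies the affine transform, and also
# emits the spatial mean of the SiLU activation, which the squeeze-and-excite
# branch below consumes. A minimal sketch under those assumptions:
def _reference_bn_silu_mean(x, mean, var_sum, weight, bias, eps=1e-5):
    # x: (128, 624, 14, 14) fp16; mean/var_sum: (1, 624, 1, 1) fp32;
    # weight/bias: (624,) fp32
    inv_std = torch.rsqrt(var_sum / 25088.0 + eps)
    y = (x.float() - mean) * inv_std * weight.view(1, -1, 1, 1) + bias.view(1, -1, 1, 1)
    y = y.to(torch.float16)                      # stored to out_ptr0
    pooled = torch.nn.functional.silu(y.float()).mean(dim=(2, 3), keepdim=True)
    return y, pooled.to(torch.float16)           # stored to out_ptr2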
# kernel path: /tmp/torchinductor_shunting/5n/c5nuz3wivd2kux327s5ffdifgf3i65ukistqypmzli4ipcs5fhup.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_158
triton_poi_fused__to_copy_129 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[16384], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_129(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 16224
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((26, 624, 1, 1), (624, 1, 1, 1), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((26, 624, 1, 1), (624, 1, 1, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_129.run(*args, 16224, grid=grid(16224), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_129.benchmark_all_configs(*args, 16224, grid=grid(16224))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/7u/c7uwxkw3ipfjfdehvzykp4imfsjz53vfoikfvt4ys7uwsknqbbb2.py
# Original ATen: aten._to_copy, aten.convolution
# aten._to_copy => convert_element_type_157
# aten.convolution => convolution_60
triton_poi_fused__to_copy_convolution_130 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[32], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_convolution_130(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 26
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((26,), (1,), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((26,), (1,), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_convolution_130.run(*args, 26, grid=grid(26), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_convolution_130.benchmark_all_configs(*args, 26, grid=grid(26))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/qj/cqjs4oas7ss654pge5hzaweuxapq6ge5y5fvo2ygxf7r6qylld6m.py
# Original ATen: aten._to_copy, aten.convolution, aten.silu
# aten._to_copy => convert_element_type_157
# aten.convolution => convolution_60
# aten.silu => convert_element_type_159, convert_element_type_160, mul_204, sigmoid_22
triton_poi_fused__to_copy_convolution_silu_131 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4096], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_convolution_silu_131(in_out_ptr0, in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 3328
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x2 = xindex
x0 = xindex % 26
tmp0 = tl.load(in_out_ptr0 + (x2), xmask).to(tl.float32)
tmp1 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
tmp2 = tmp0 + tmp1
tmp3 = tmp2.to(tl.float32)
tmp4 = tl.sigmoid(tmp3)
tmp5 = tmp3 * tmp4
tmp6 = tmp5.to(tl.float32)
tl.store(in_out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp2, xmask)
tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp6, xmask)
def get_args():
arg_0 = rand_strided((128, 26, 1, 1), (26, 1, 1, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((26,), (1,), device='cuda:0', dtype=torch.float16)
arg_2 = rand_strided((128, 26, 1, 1), (26, 1, 1, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1, arg_2,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_convolution_silu_131.run(*args, 3328, grid=grid(3328), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_convolution_silu_131.benchmark_all_configs(*args, 3328, grid=grid(3328))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
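# Hand-written illustration (not Inductor output): this kernel finishes the
# 624 -> 26 squeeze convolution by adding the bias (written back in place,
# matching in_out_ptr0) and storing SiLU of the result for the expand conv.
def _reference_se_reduce_act(conv_out, bias):
    # conv_out: (128, 26, 1, 1) fp16 (mutated), bias: (26,) fp16
    pre = conv_out.add_(bias.view(1, -1, 1, 1))
    return torch.nn.functional.silu(pre.float()).to(torch.float16)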
# kernel path: /tmp/torchinductor_shunting/af/cafnwdqexuqbr3u7uonjmdp2h6gms65ppr5ersr7levxxzbxsags.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_162
triton_poi_fused__to_copy_132 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[16384], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_132(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 16224
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((624, 26, 1, 1), (26, 1, 1, 1), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((624, 26, 1, 1), (26, 1, 1, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_132.run(*args, 16224, grid=grid(16224), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_132.benchmark_all_configs(*args, 16224, grid=grid(16224))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/qt/cqtzzxfgsd6elacosxssusc2qroqxqbum3szv62gh4gr6s5vzwmp.py
# Original ATen: aten._to_copy, aten.convolution
# aten._to_copy => convert_element_type_161
# aten.convolution => convolution_61
triton_poi_fused__to_copy_convolution_133 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[1024], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_convolution_133(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 624
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((624,), (1,), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((624,), (1,), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_convolution_133.run(*args, 624, grid=grid(624), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_convolution_133.benchmark_all_configs(*args, 624, grid=grid(624))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/hd/chd7xmae5qozjrly7porkx7hggrm6fh6klnsko3rzc52ohjqiyou.py
# Original ATen: aten._to_copy, aten.convolution
# aten._to_copy => convert_element_type_161
# aten.convolution => convolution_61
triton_poi_fused__to_copy_convolution_134 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[131072], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_convolution_134(in_out_ptr0, in_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 79872
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x2 = xindex
x0 = xindex % 624
tmp0 = tl.load(in_out_ptr0 + (x2), None).to(tl.float32)
tmp1 = tl.load(in_ptr0 + (x0), None).to(tl.float32)
tmp2 = tmp0 + tmp1
tl.store(in_out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp2, None)
def get_args():
arg_0 = rand_strided((128, 624, 1, 1), (624, 1, 1, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((624,), (1,), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_convolution_134.run(*args, 79872, grid=grid(79872), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_convolution_134.benchmark_all_configs(*args, 79872, grid=grid(79872))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
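# Hand-written illustration (not Inductor output): the 26 -> 624 expand side
# only folds the bias in here; the sigmoid gate is deferred to the multiply
# kernel that follows. A minimal in-place sketch:
def _reference_se_expand_bias(conv_out, bias):
    # conv_out: (128, 624, 1, 1) fp16 (mutated), bias: (624,) fp16
    return conv_out.add_(bias.view(1, -1, 1, 1))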
# kernel path: /tmp/torchinductor_shunting/mn/cmnvsfbxocmtd6hcxhi5n75ltfyzsnegxzpgv7o57n6ksfhmrkay.py
# Original ATen: aten.mul, aten.sigmoid, aten.silu
# aten.mul => mul_205
# aten.sigmoid => sigmoid_23
# aten.silu => convert_element_type_155, convert_element_type_156, mul_203, sigmoid_21
triton_poi_fused_mul_sigmoid_silu_135 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[16777216], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3), equal_to_1=())]})
@triton.jit
def triton_poi_fused_mul_sigmoid_silu_135(in_ptr0, in_ptr1, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 15654912
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x2 = xindex
x1 = (xindex // 196)
tmp0 = tl.load(in_ptr0 + (x2), None).to(tl.float32)
tmp5 = tl.load(in_ptr1 + (x1), None).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
tmp2 = tl.sigmoid(tmp1)
tmp3 = tmp1 * tmp2
tmp4 = tmp3.to(tl.float32)
tmp6 = tl.sigmoid(tmp5)
tmp7 = tmp4 * tmp6
tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp7, None)
def get_args():
arg_0 = rand_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 624, 1, 1), (624, 1, 1, 1), device='cuda:0', dtype=torch.float16)
arg_2 = rand_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1, arg_2,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_mul_sigmoid_silu_135.run(*args, 15654912, grid=grid(15654912), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_mul_sigmoid_silu_135.benchmark_all_configs(*args, 15654912, grid=grid(15654912))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
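# Hand-written illustration (not Inductor output): the excitation multiply
# recomputes SiLU of the batchnorm output on the fly rather than reloading a
# saved activation, then scales by the sigmoid of the SE logits. A sketch:
def _reference_se_excite(bn_out, se_logits):
    # bn_out: (128, 624, 14, 14) fp16, se_logits: (128, 624, 1, 1) fp16
    act = torch.nn.functional.silu(bn_out.float()).to(torch.float16)
    return act * torch.sigmoid(se_logits)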
# kernel path: /tmp/torchinductor_shunting/x6/cx65e4iuztiipxmj3kt6pdmtbqk6eud6jbchpllpc6tinjqjiko7.py
# Original ATen: aten.split_with_sizes
# aten.split_with_sizes => getitem_164
triton_poi_fused_split_with_sizes_136 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[8388608], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_split_with_sizes_136(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 7827456
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex % 61152
x1 = (xindex // 61152)
x2 = xindex
tmp0 = tl.load(in_ptr0 + (x0 + (122304*x1)), None).to(tl.float32)
tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp0, None)
def get_args():
arg_0 = rand_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 312, 14, 14), (61152, 196, 14, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_split_with_sizes_136.run(*args, 7827456, grid=grid(7827456), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_split_with_sizes_136.benchmark_all_configs(*args, 7827456, grid=grid(7827456))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/la/cla6pjsd7zhhqofhjak5ak67vzu4puphrosszybe7he2qayusdf7.py
# Original ATen: aten.split_with_sizes
# aten.split_with_sizes => getitem_165
triton_poi_fused_split_with_sizes_137 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[8388608], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_split_with_sizes_137(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 7827456
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex % 61152
x1 = (xindex // 61152)
x2 = xindex
tmp0 = tl.load(in_ptr0 + (61152 + x0 + (122304*x1)), None).to(tl.float32)
tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp0, None)
def get_args():
arg_0 = rand_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 312, 14, 14), (61152, 196, 14, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_split_with_sizes_137.run(*args, 7827456, grid=grid(7827456), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_split_with_sizes_137.benchmark_all_configs(*args, 7827456, grid=grid(7827456))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
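# Hand-written illustration (not Inductor output): kernels 136 and 137 copy
# the two 312-channel halves of the activation (offsets 0 and 61152 = 312*196)
# into contiguous buffers for the grouped 1x1 convolutions; no arithmetic is
# fused. Equivalent eager form:
def _reference_split_halves(x):
    # x: (128, 624, 14, 14) fp16 -> two contiguous (128, 312, 14, 14) copies
    lo, hi = torch.split(x, [312, 312], dim=1)
    return lo.contiguous(), hi.contiguous()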
# kernel path: /tmp/torchinductor_shunting/hz/chz3fw3o5fqe2cnzuiuggnyrnh3xzhhrw266gsgpwo3xgo4buev6.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_163
triton_poi_fused__to_copy_138 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[16384], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_138(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 16224
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((52, 312, 1, 1), (312, 1, 1, 1), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((52, 312, 1, 1), (312, 1, 1, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_138.run(*args, 16224, grid=grid(16224), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_138.benchmark_all_configs(*args, 16224, grid=grid(16224))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/vk/cvkwpn3gas36vom5boko6m7yjnsu4h3jn3s2dj74g4puyptio4dc.py
# Original ATen: aten.cat
# aten.cat => cat_18
triton_poi_fused_cat_139 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[2097152], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_cat_139(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 1304576
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x2 = xindex
x0 = xindex % 10192
x1 = (xindex // 10192)
tmp0 = tl.load(in_ptr0 + (x2), None).to(tl.float32)
tl.store(out_ptr0 + (x0 + (20384*x1) + tl.zeros([XBLOCK], tl.int32)), tmp0, None)
def get_args():
arg_0 = rand_strided((128, 52, 14, 14), (10192, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 52, 14, 14), (20384, 196, 14, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_cat_139.run(*args, 1304576, grid=grid(1304576), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_cat_139.benchmark_all_configs(*args, 1304576, grid=grid(1304576))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/qp/cqptss63kr7w22hqbaitci52teddi4haqxu6zws6bzlk7k3r4p5w.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.add
# aten._native_batch_norm_legit_functional => add_136, add_139, convert_element_type_165, convert_element_type_166, mul_206, mul_212, rsqrt_26, sub_26, var_mean_26
# aten.add => add_140
triton_poi_fused__native_batch_norm_legit_functional_add_140 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp16', 6: '*fp16', 7: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7), equal_to_1=())]})
@triton.jit
def triton_poi_fused__native_batch_norm_legit_functional_add_140(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 2609152
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x3 = xindex
x1 = (xindex // 196) % 104
tmp0 = tl.load(in_ptr0 + (x3), None).to(tl.float32)
tmp2 = tl.load(in_ptr1 + (x1), None)
tmp4 = tl.load(in_ptr2 + (x1), None)
tmp11 = tl.load(in_ptr3 + (x1), None)
tmp13 = tl.load(in_ptr4 + (x1), None)
tmp16 = tl.load(in_ptr5 + (x3), None).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
tmp3 = tmp1 - tmp2
tmp5 = 25088.0
tmp6 = tmp4 / tmp5
tmp7 = 1e-05
tmp8 = tmp6 + tmp7
tmp9 = tl.math.rsqrt(tmp8)
tmp10 = tmp3 * tmp9
tmp12 = tmp10 * tmp11
tmp14 = tmp12 + tmp13
tmp15 = tmp14.to(tl.float32)
tmp17 = tmp15 + tmp16
tl.store(out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp17, None)
def get_args():
arg_0 = rand_strided((128, 104, 14, 14), (20384, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((1, 104, 1, 1), (104, 1, 104, 104), device='cuda:0', dtype=torch.float32)
arg_2 = rand_strided((1, 104, 1, 1), (104, 1, 104, 104), device='cuda:0', dtype=torch.float32)
arg_3 = rand_strided((104,), (1,), device='cuda:0', dtype=torch.float32)
arg_4 = rand_strided((104,), (1,), device='cuda:0', dtype=torch.float32)
arg_5 = rand_strided((128, 104, 14, 14), (20384, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_6 = rand_strided((128, 104, 14, 14), (20384, 196, 14, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__native_batch_norm_legit_functional_add_140.run(*args, 2609152, grid=grid(2609152), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__native_batch_norm_legit_functional_add_140.benchmark_all_configs(*args, 2609152, grid=grid(2609152))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
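# Hand-written illustration (not Inductor output): the projection batchnorm
# and the block's residual add are fused into one pass over the
# (128, 104, 14, 14) output. A minimal sketch, using the same statistics
# convention as the batchnorm sketch above:
def _reference_bn_add(x, mean, var_sum, weight, bias, residual, eps=1e-5):
    inv_std = torch.rsqrt(var_sum / 25088.0 + eps)
    y = (x.float() - mean) * inv_std * weight.view(1, -1, 1, 1) + bias.view(1, -1, 1, 1)
    return y.to(torch.float16) + residual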
# kernel path: /tmp/torchinductor_shunting/zf/czfuu6dquzuudpwuu3bwtsd76hxju2ps43qrqh5npmmaayzjihi2.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_215
triton_poi_fused__to_copy_141 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[65536], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_141(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 64896
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((624, 104, 1, 1), (104, 1, 1, 1), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((624, 104, 1, 1), (104, 1, 1, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_141.run(*args, 64896, grid=grid(64896), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_141.benchmark_all_configs(*args, 64896, grid=grid(64896))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/hj/chjvw2ozl3ipjwmejxog5ggykkaa6pfst7j432f3sbhnt2kvth4p.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.add, aten.clone, aten.fill, aten.mul, aten.sigmoid, aten.silu, aten.sub
# aten._native_batch_norm_legit_functional => add_174, add_177, convert_element_type_216, convert_element_type_217, mul_263, mul_269, rsqrt_33, sub_33, var_mean_33
# aten.add => add_341
# aten.clone => clone_24
# aten.fill => full_like_23
# aten.mul => mul_787, mul_788
# aten.sigmoid => sigmoid_87
# aten.silu => convert_element_type_218, convert_element_type_219, mul_270, sigmoid_32
# aten.sub => sub_185
triton_poi_fused__native_batch_norm_legit_functional_add_clone_fill_mul_sigmoid_silu_sub_142 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[16777216], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp16', 6: '*fp16', 7: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7), equal_to_1=())]})
@triton.jit
def triton_poi_fused__native_batch_norm_legit_functional_add_clone_fill_mul_sigmoid_silu_sub_142(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, out_ptr1, out_ptr2, xnumel, XBLOCK : tl.constexpr):
xnumel = 15654912
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x3 = xindex
x1 = (xindex // 196) % 624
tmp0 = tl.load(in_ptr0 + (x3), None).to(tl.float32)
tmp2 = tl.load(in_ptr1 + (x1), None)
tmp4 = tl.load(in_ptr2 + (x1), None)
tmp11 = tl.load(in_ptr3 + (x1), None)
tmp13 = tl.load(in_ptr4 + (x1), None)
tmp1 = tmp0.to(tl.float32)
tmp3 = tmp1 - tmp2
tmp5 = 25088.0
tmp6 = tmp4 / tmp5
tmp7 = 1e-05
tmp8 = tmp6 + tmp7
tmp9 = tl.math.rsqrt(tmp8)
tmp10 = tmp3 * tmp9
tmp12 = tmp10 * tmp11
tmp14 = tmp12 + tmp13
tmp15 = tmp14.to(tl.float32)
tmp16 = tmp15.to(tl.float32)
tmp17 = tl.sigmoid(tmp16)
tmp18 = tmp16 * tmp17
tmp19 = tmp18.to(tl.float32)
tmp20 = tl.sigmoid(tmp15)
tmp21 = 1.0
tmp22 = tmp21 - tmp20
tmp23 = tmp15 * tmp22
tmp24 = tmp23 + tmp21
tmp25 = tmp20 * tmp24
tl.store(out_ptr1 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp19, None)
tl.store(out_ptr2 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp25, None)
def get_args():
arg_0 = rand_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((1, 624, 1, 1), (624, 1, 624, 624), device='cuda:0', dtype=torch.float32)
arg_2 = rand_strided((1, 624, 1, 1), (624, 1, 624, 624), device='cuda:0', dtype=torch.float32)
arg_3 = rand_strided((624,), (1,), device='cuda:0', dtype=torch.float32)
arg_4 = rand_strided((624,), (1,), device='cuda:0', dtype=torch.float32)
arg_5 = rand_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_6 = rand_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__native_batch_norm_legit_functional_add_clone_fill_mul_sigmoid_silu_sub_142.run(*args, 15654912, grid=grid(15654912), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__native_batch_norm_legit_functional_add_clone_fill_mul_sigmoid_silu_sub_142.benchmark_all_configs(*args, 15654912, grid=grid(15654912))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
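# Hand-written illustration (not Inductor output): besides the usual
# batchnorm + SiLU forward (out_ptr1), this kernel stores
# sigmoid(y) * (1 + y * (1 - sigmoid(y))) to out_ptr2, which is exactly
# d/dy silu(y), saved so the backward pass can reuse it. A minimal sketch:
def _reference_bn_silu_with_grad(x, mean, var_sum, weight, bias, eps=1e-5):
    inv_std = torch.rsqrt(var_sum / 25088.0 + eps)
    y = (x.float() - mean) * inv_std * weight.view(1, -1, 1, 1) + bias.view(1, -1, 1, 1)
    y = y.to(torch.float16)
    act = torch.nn.functional.silu(y.float()).to(torch.float16)  # out_ptr1
    s = torch.sigmoid(y)
    return act, s * (1 + y * (1 - s))                            # out_ptr2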
# kernel path: /tmp/torchinductor_shunting/by/cbyy7pyyeky2s6spdugmwrwqdvrjfdr7qwcxwapwzhywwhng5ujh.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_220
triton_poi_fused__to_copy_143 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[8192], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_143(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 5616
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((624, 1, 3, 3), (9, 9, 3, 1), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((624, 1, 3, 3), (9, 9, 3, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_143.run(*args, 5616, grid=grid(5616), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_143.benchmark_all_configs(*args, 5616, grid=grid(5616))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/ji/cjiljcgh2wc2yighvqz76tkwzti3zjjxpno5dng2ammsgvfhwvv7.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_226
triton_poi_fused__to_copy_144 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[32768], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_144(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 32448
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((52, 624, 1, 1), (624, 1, 1, 1), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((52, 624, 1, 1), (624, 1, 1, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_144.run(*args, 32448, grid=grid(32448), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_144.benchmark_all_configs(*args, 32448, grid=grid(32448))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/tv/ctvzn4xpxjd5g7oyd7nntvpnuyovmntivt2fyxf53c3mblh6dshr.py
# Original ATen: aten._to_copy, aten.convolution
# aten._to_copy => convert_element_type_225
# aten.convolution => convolution_86
triton_poi_fused__to_copy_convolution_145 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[64], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_convolution_145(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 52
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((52,), (1,), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((52,), (1,), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_convolution_145.run(*args, 52, grid=grid(52), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_convolution_145.benchmark_all_configs(*args, 52, grid=grid(52))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/sf/csfcbn5kzqvvv6krkr7772b7z66pr6xsuxyhtbgglpb26gkfc5h3.py
# Original ATen: aten._to_copy, aten.convolution, aten.silu
# aten._to_copy => convert_element_type_225
# aten.convolution => convolution_86
# aten.silu => convert_element_type_227, convert_element_type_228, mul_279, sigmoid_34
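# Conv epilogue: add the fp16 bias to the 1x1-conv output in place, then apply
# SiLU in fp32 before casting back to fp16. A minimal eager-mode sketch
# (variable names are illustrative, not from the original graph):
#   y = conv_out + bias.view(1, -1, 1, 1)              # written back in place
#   out = (y.float() * torch.sigmoid(y.float())).half()
# The (128, 52, 1, 1) shape suggests this is the squeeze-excite bottleneck.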
triton_poi_fused__to_copy_convolution_silu_146 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[8192], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_convolution_silu_146(in_out_ptr0, in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 6656
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x2 = xindex
x0 = xindex % 52
tmp0 = tl.load(in_out_ptr0 + (x2), xmask).to(tl.float32)
tmp1 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
tmp2 = tmp0 + tmp1
tmp3 = tmp2.to(tl.float32)
tmp4 = tl.sigmoid(tmp3)
tmp5 = tmp3 * tmp4
tmp6 = tmp5.to(tl.float32)
tl.store(in_out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp2, xmask)
tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp6, xmask)
def get_args():
arg_0 = rand_strided((128, 52, 1, 1), (52, 1, 1, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((52,), (1,), device='cuda:0', dtype=torch.float16)
arg_2 = rand_strided((128, 52, 1, 1), (52, 1, 1, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1, arg_2,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_convolution_silu_146.run(*args, 6656, grid=grid(6656), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_convolution_silu_146.benchmark_all_configs(*args, 6656, grid=grid(6656))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/ik/cikbdagjlpyedk3af4it4ary3yx6g4gpfifz25vxc7r25q224d43.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_230
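# Weight cast: (624, 52, 1, 1) fp32 -> fp16, the expand projection back from
# the 52-channel bottleneck (judging by the shapes).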
triton_poi_fused__to_copy_147 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[32768], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_147(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 32448
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((624, 52, 1, 1), (52, 1, 1, 1), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((624, 52, 1, 1), (52, 1, 1, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_147.run(*args, 32448, grid=grid(32448), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_147.benchmark_all_configs(*args, 32448, grid=grid(32448))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/xb/cxb5poqew2wpvurnixpesahtvwzfiadakefnqud2hzokvcgnemog.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_231
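# Another weight cast: (160, 624, 1, 1) fp32 -> fp16, presumably the
# projection conv that follows the squeeze-excite block.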
triton_poi_fused__to_copy_148 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[131072], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_148(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 99840
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((160, 624, 1, 1), (624, 1, 1, 1), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((160, 624, 1, 1), (624, 1, 1, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_148.run(*args, 99840, grid=grid(99840), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_148.benchmark_all_configs(*args, 99840, grid=grid(99840))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/z7/cz7tf27fdbr7sfmkjktkou2imnr2mnlt7hicdhvs4chclhrw6j37.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => convert_element_type_232, var_mean_35
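# Batch-norm statistics, mean pass, stage 1: the grid is 640 = 160 channels
# x 4 reduction splits, and each program sums 6272 of the 25088 (= 128*14*14)
# values for its channel into a (1, 160, 1, 1, 4) partial-sum buffer. Together
# with stage 2 this computes `x.float().sum(dim=(0, 2, 3))` for the
# (128, 160, 14, 14) fp16 input.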
triton_red_fused__native_batch_norm_legit_functional_149 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@reduction(
size_hints=[1024, 8192],
reduction_hint=ReductionHint.INNER,
filename=__file__,
meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32', 3: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_149(in_ptr0, out_ptr0, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
xnumel = 640
rnumel = 6272
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
xmask = xindex < xnumel
rbase = tl.arange(0, RBLOCK)[None, :]
x0 = xindex % 160
x1 = (xindex // 160)
_tmp2 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
x3 = xindex
for roffset in range(0, rnumel, RBLOCK):
rindex = roffset + rbase
rmask = rindex < rnumel
r2 = rindex
tmp0 = tl.load(in_ptr0 + ((196*x0) + (31360*(r2 // 196)) + (1003520*x1) + (r2 % 196)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
_tmp2 = tl.where(rmask & xmask, _tmp2 + tmp1, _tmp2)
tmp2 = tl.sum(_tmp2, 1)[:, None]
tl.store(out_ptr0 + x3, tmp2, xmask)
def get_args():
arg_0 = rand_strided((128, 160, 14, 14), (31360, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((1, 160, 1, 1, 4), (640, 1, 640, 640, 160), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_red_fused__native_batch_norm_legit_functional_149.run(*args, 640, 6272, grid=grid(640), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_red_fused__native_batch_norm_legit_functional_149.benchmark_all_configs(*args, 640, 6272, grid=grid(640))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/hi/chiuub4omrsfimb4p75xk4vlbwv5mhljjinvhtfc3kqhmfnsyq4q.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => add_185, convert_element_type_232, mul_282, mul_283, var_mean_35
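# Mean pass, stage 2: reduce the 4 partial sums per channel, divide by 25088
# to get the batch mean, and fold it into the running mean with the usual
# momentum update `running = 0.1 * batch_mean + 0.9 * running`.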
triton_per_fused__native_batch_norm_legit_functional_150 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import persistent_reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@persistent_reduction(
size_hints=[256, 4],
reduction_hint=ReductionHint.INNER,
filename=__file__,
meta={'signature': {0: '*fp32', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: 'i32', 5: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4), equal_to_1=())]}
)
@triton.jit
def triton_per_fused__native_batch_norm_legit_functional_150(in_out_ptr0, in_ptr0, in_ptr1, out_ptr0, xnumel, rnumel, XBLOCK : tl.constexpr):
xnumel = 160
rnumel = 4
RBLOCK: tl.constexpr = 4
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
xmask = xindex < xnumel
rindex = tl.arange(0, RBLOCK)[None, :]
rmask = rindex < rnumel
r1 = rindex
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0 + (160*r1)), rmask & xmask, other=0)
tmp8 = tl.load(in_ptr1 + (x0), xmask)
tmp2 = tl.where(rmask & xmask, tmp0, 0)
tmp3 = tl.sum(tmp2, 1)[:, None]
tmp4 = 25088.0
tmp5 = tmp3 / tmp4
tmp6 = 0.1
tmp7 = tmp5 * tmp6
tmp9 = 0.9
tmp10 = tmp8 * tmp9
tmp11 = tmp7 + tmp10
tl.store(in_out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp5, xmask)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp11, xmask)
def get_args():
arg_0 = rand_strided((1, 160, 1, 1), (160, 1, 160, 160), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((1, 160, 1, 1, 4), (640, 1, 640, 640, 160), device='cuda:0', dtype=torch.float32)
arg_2 = rand_strided((160,), (1,), device='cuda:0', dtype=torch.float32)
arg_3 = rand_strided((160,), (1,), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1, arg_2, arg_3,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_per_fused__native_batch_norm_legit_functional_150.run(*args, 160, 4, grid=grid(160), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_per_fused__native_batch_norm_legit_functional_150.benchmark_all_configs(*args, 160, 4, grid=grid(160))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/4b/c4bvtzglf3eflssbrfazd6mjdgmrhoujx4xq6fve2tfmkwyeseru.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => convert_element_type_232, var_mean_35
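# Variance pass, stage 1 (a second sweep over the data): each program
# accumulates sum((x - mean)^2) for its (channel, split) pair, again into a
# (1, 160, 1, 1, 4) partial buffer.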
triton_red_fused__native_batch_norm_legit_functional_151 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@reduction(
size_hints=[1024, 8192],
reduction_hint=ReductionHint.INNER,
filename=__file__,
meta={'signature': {0: '*fp16', 1: '*fp32', 2: '*fp32', 3: 'i32', 4: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_151(in_ptr0, in_ptr1, out_ptr0, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
xnumel = 640
rnumel = 6272
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
xmask = xindex < xnumel
rbase = tl.arange(0, RBLOCK)[None, :]
x0 = xindex % 160
x1 = (xindex // 160)
tmp2 = tl.load(in_ptr1 + (x0), xmask)
_tmp5 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
x3 = xindex
for roffset in range(0, rnumel, RBLOCK):
rindex = roffset + rbase
rmask = rindex < rnumel
r2 = rindex
tmp0 = tl.load(in_ptr0 + ((196*x0) + (31360*(r2 // 196)) + (1003520*x1) + (r2 % 196)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
tmp3 = tmp1 - tmp2
tmp4 = tmp3 * tmp3
_tmp5 = tl.where(rmask & xmask, _tmp5 + tmp4, _tmp5)
tmp5 = tl.sum(_tmp5, 1)[:, None]
tl.store(out_ptr0 + x3, tmp5, xmask)
def get_args():
arg_0 = rand_strided((128, 160, 14, 14), (31360, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((1, 160, 1, 1), (160, 1, 160, 160), device='cuda:0', dtype=torch.float32)
arg_2 = rand_strided((1, 160, 1, 1, 4), (640, 1, 640, 640, 160), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1, arg_2,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_red_fused__native_batch_norm_legit_functional_151.run(*args, 640, 6272, grid=grid(640), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_red_fused__native_batch_norm_legit_functional_151.benchmark_all_configs(*args, 640, 6272, grid=grid(640))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/vo/cvo4hmhbgzgrpmknfykmtsywf2s2dd4n7xkbf6ll6hxoj5qx3zny.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => add_184, add_186, convert_element_type_232, mul_284, mul_285, mul_286, rsqrt_35, squeeze_106, var_mean_35
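# Variance pass, stage 2: combine the partials into the biased variance,
# emit rsqrt(var + 1e-05) for normalization, and update the running variance
# with the unbiased estimate (1.0000398612827361 is 25088/25087).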
triton_per_fused__native_batch_norm_legit_functional_152 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import persistent_reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@persistent_reduction(
size_hints=[256, 4],
reduction_hint=ReductionHint.INNER,
filename=__file__,
meta={'signature': {0: '*fp32', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: 'i32', 6: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5), equal_to_1=())]}
)
@triton.jit
def triton_per_fused__native_batch_norm_legit_functional_152(in_ptr0, in_ptr1, out_ptr0, out_ptr1, out_ptr2, xnumel, rnumel, XBLOCK : tl.constexpr):
xnumel = 160
rnumel = 4
RBLOCK: tl.constexpr = 4
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
xmask = xindex < xnumel
rindex = tl.arange(0, RBLOCK)[None, :]
rmask = rindex < rnumel
r1 = rindex
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0 + (160*r1)), rmask & xmask, other=0)
tmp13 = tl.load(in_ptr1 + (x0), xmask)
tmp2 = tl.where(rmask & xmask, tmp0, 0)
tmp3 = tl.sum(tmp2, 1)[:, None]
tmp4 = 25088.0
tmp5 = tmp3 / tmp4
tmp6 = 1e-05
tmp7 = tmp5 + tmp6
tmp8 = tl.math.rsqrt(tmp7)
tmp9 = 1.0000398612827361
tmp10 = tmp5 * tmp9
tmp11 = 0.1
tmp12 = tmp10 * tmp11
tmp14 = 0.9
tmp15 = tmp13 * tmp14
tmp16 = tmp12 + tmp15
tl.store(out_ptr1 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp8, xmask)
tl.store(out_ptr2 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp16, xmask)
tl.store(out_ptr0 + x0, tmp3, xmask)
def get_args():
arg_0 = rand_strided((1, 160, 1, 1, 4), (640, 1, 640, 640, 160), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((160,), (1,), device='cuda:0', dtype=torch.float32)
arg_2 = rand_strided((1, 160, 1, 1), (160, 1, 160, 160), device='cuda:0', dtype=torch.float32)
arg_3 = rand_strided((160,), (1,), device='cuda:0', dtype=torch.float32)
arg_4 = rand_strided((160,), (1,), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1, arg_2, arg_3, arg_4,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_per_fused__native_batch_norm_legit_functional_152.run(*args, 160, 4, grid=grid(160), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_per_fused__native_batch_norm_legit_functional_152.benchmark_all_configs(*args, 160, 4, grid=grid(160))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/7r/c7rznphkbqtbupvosh2bugs6qz6cx4gcbzlpwzvb44okdkqoc7am.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => add_184, add_187, convert_element_type_232, convert_element_type_233, mul_281, mul_287, rsqrt_35, sub_35, var_mean_35
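# With the statistics computed, this kernel applies the affine batch norm
# elementwise over all 4,014,080 values, reading the per-channel sum of
# squared deviations (in_ptr2) and recomputing rsqrt inline. Eager sketch:
#   out = ((x.float() - mean) * torch.rsqrt(var + 1e-5) * weight + bias).half()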
triton_poi_fused__native_batch_norm_legit_functional_153 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp16', 6: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6), equal_to_1=())]})
@triton.jit
def triton_poi_fused__native_batch_norm_legit_functional_153(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 4014080
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x3 = xindex
x1 = (xindex // 196) % 160
tmp0 = tl.load(in_ptr0 + (x3), None).to(tl.float32)
tmp2 = tl.load(in_ptr1 + (x1), None)
tmp4 = tl.load(in_ptr2 + (x1), None)
tmp11 = tl.load(in_ptr3 + (x1), None)
tmp13 = tl.load(in_ptr4 + (x1), None)
tmp1 = tmp0.to(tl.float32)
tmp3 = tmp1 - tmp2
tmp5 = 25088.0
tmp6 = tmp4 / tmp5
tmp7 = 1e-05
tmp8 = tmp6 + tmp7
tmp9 = tl.math.rsqrt(tmp8)
tmp10 = tmp3 * tmp9
tmp12 = tmp10 * tmp11
tmp14 = tmp12 + tmp13
tmp15 = tmp14.to(tl.float32)
tl.store(out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp15, None)
def get_args():
arg_0 = rand_strided((128, 160, 14, 14), (31360, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((1, 160, 1, 1), (160, 1, 160, 160), device='cuda:0', dtype=torch.float32)
arg_2 = rand_strided((1, 160, 1, 1), (160, 1, 160, 160), device='cuda:0', dtype=torch.float32)
arg_3 = rand_strided((160,), (1,), device='cuda:0', dtype=torch.float32)
arg_4 = rand_strided((160,), (1,), device='cuda:0', dtype=torch.float32)
arg_5 = rand_strided((128, 160, 14, 14), (31360, 196, 14, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__native_batch_norm_legit_functional_153.run(*args, 4014080, grid=grid(4014080), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__native_batch_norm_legit_functional_153.benchmark_all_configs(*args, 4014080, grid=grid(4014080))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/25/c25hctynyjlhient6ceczvttfs4npfgnfoty5zhk6pn4ckhqm3ka.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_234
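# Weight cast for the next block: (240, 80, 1, 1) fp32 -> fp16; the 80 -> 240
# channel jump suggests an inverted-residual expand conv.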
triton_poi_fused__to_copy_154 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[32768], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_154(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 19200
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((240, 80, 1, 1), (80, 1, 1, 1), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((240, 80, 1, 1), (80, 1, 1, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_154.run(*args, 19200, grid=grid(19200), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_154.benchmark_all_configs(*args, 19200, grid=grid(19200))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/yz/cyzjinfa2nwvymmtkqoovwkfypkotg6ek2sunwsuf3vfd35r7l3k.py
# Original ATen: aten.cat
# aten.cat => cat_25
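# The concat is implemented by copying each branch into a channel slice of
# the wider output: the 240-channel input is written into a
# (128, 480, 14, 14) buffer whose per-image stride is 94080 = 480*196, i.e.
# the first half of the channels. The other half is presumably filled by a
# sibling kernel writing at an offset base pointer.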
triton_poi_fused_cat_155 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[8388608], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_cat_155(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 6021120
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x2 = xindex
x0 = xindex % 47040
x1 = (xindex // 47040)
tmp0 = tl.load(in_ptr0 + (x2), None).to(tl.float32)
tl.store(out_ptr0 + (x0 + (94080*x1) + tl.zeros([XBLOCK], tl.int32)), tmp0, None)
def get_args():
arg_0 = rand_strided((128, 240, 14, 14), (47040, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 240, 14, 14), (94080, 196, 14, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_cat_155.run(*args, 6021120, grid=grid(6021120), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_cat_155.benchmark_all_configs(*args, 6021120, grid=grid(6021120))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/ks/cksikkrsggj4vc4g42rphhvyb3ox3tanqawsqpy2rpo5qbqc3tin.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => add_189, add_190, add_191, convert_element_type_236, mul_289, mul_290, mul_291, mul_292, mul_293, rsqrt_36, squeeze_109, var_mean_36
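# Unlike the split 4-kernel BN above, this 480-channel BN does everything in
# one kernel: the first loop over the 25088 reduction elements produces the
# mean (plus the running-mean update), the second loop produces the sum of
# squared deviations, and the epilogue emits rsqrt(var + eps) and the
# running-variance update.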
triton_red_fused__native_batch_norm_legit_functional_156 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@reduction(
size_hints=[512, 32768],
reduction_hint=ReductionHint.INNER,
filename=__file__,
meta={'signature': {0: '*fp32', 1: '*fp16', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: 'i32', 9: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_156(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, out_ptr0, out_ptr1, out_ptr2, out_ptr3, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
xnumel = 480
rnumel = 25088
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
xmask = xindex < xnumel
rbase = tl.arange(0, RBLOCK)[None, :]
x0 = xindex
_tmp2 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
for roffset in range(0, rnumel, RBLOCK):
rindex = roffset + rbase
rmask = rindex < rnumel
r1 = rindex % 196
r2 = (rindex // 196)
tmp0 = tl.load(in_ptr0 + (r1 + (196*x0) + (94080*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
_tmp2 = tl.where(rmask & xmask, _tmp2 + tmp1, _tmp2)
tmp2 = tl.sum(_tmp2, 1)[:, None]
tmp7 = tl.load(in_ptr1 + (x0), xmask)
tmp3 = 25088.0
tmp4 = tmp2 / tmp3
tmp5 = 0.1
tmp6 = tmp4 * tmp5
tmp8 = 0.9
tmp9 = tmp7 * tmp8
tmp10 = tmp6 + tmp9
tl.store(in_out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp4, xmask)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp10, xmask)
_tmp15 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
for roffset in range(0, rnumel, RBLOCK):
rindex = roffset + rbase
rmask = rindex < rnumel
r1 = rindex % 196
r2 = (rindex // 196)
tmp11 = tl.load(in_ptr0 + (r1 + (196*x0) + (94080*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp12 = tmp11.to(tl.float32)
tmp13 = tmp12 - tmp4
tmp14 = tmp13 * tmp13
_tmp15 = tl.where(rmask & xmask, _tmp15 + tmp14, _tmp15)
tmp15 = tl.sum(_tmp15, 1)[:, None]
tl.store(out_ptr1 + x0, tmp15, xmask)
tmp25 = tl.load(in_ptr2 + (x0), xmask)
tmp16 = 25088.0
tmp17 = tmp15 / tmp16
tmp18 = 1e-05
tmp19 = tmp17 + tmp18
tmp20 = tl.math.rsqrt(tmp19)
tmp21 = 1.0000398612827361
tmp22 = tmp17 * tmp21
tmp23 = 0.1
tmp24 = tmp22 * tmp23
tmp26 = 0.9
tmp27 = tmp25 * tmp26
tmp28 = tmp24 + tmp27
tl.store(out_ptr2 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp20, xmask)
tl.store(out_ptr3 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp28, xmask)
def get_args():
arg_0 = rand_strided((1, 480, 1, 1), (480, 1, 480, 480), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((128, 480, 14, 14), (94080, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_2 = rand_strided((480,), (1,), device='cuda:0', dtype=torch.float32)
arg_3 = rand_strided((480,), (1,), device='cuda:0', dtype=torch.float32)
arg_4 = rand_strided((480,), (1,), device='cuda:0', dtype=torch.float32)
arg_5 = rand_strided((1, 480, 1, 1), (480, 1, 480, 480), device='cuda:0', dtype=torch.float32)
arg_6 = rand_strided((480,), (1,), device='cuda:0', dtype=torch.float32)
arg_7 = rand_strided((480,), (1,), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_red_fused__native_batch_norm_legit_functional_156.run(*args, 480, 25088, grid=grid(480), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_red_fused__native_batch_norm_legit_functional_156.benchmark_all_configs(*args, 480, 25088, grid=grid(480))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/uq/cuq6a4x4agmkmfpm3znliwswlpgc26f3bnjc5ylnl5l6aeuugh6f.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.add, aten.clone, aten.fill, aten.mul, aten.sigmoid, aten.sub
# aten._native_batch_norm_legit_functional => add_189, add_192, convert_element_type_236, convert_element_type_237, mul_288, mul_294, rsqrt_36, sub_36, var_mean_36
# aten.add => add_336
# aten.clone => clone_27
# aten.fill => full_like_20
# aten.mul => mul_747, mul_748
# aten.sigmoid => sigmoid_84
# aten.sub => sub_169
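# This kernel fuses the BN application with the SiLU *derivative*: alongside
# the normalized activation x (out_ptr0) it stores
#   sigmoid(x) * (1 + x * (1 - sigmoid(x)))
# (out_ptr1), which is exactly d/dx[x * sigmoid(x)]. The fill/sub/mul/add ops
# in the ATen list above are that recomputed gradient factor, presumably
# saved for the backward pass.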
triton_poi_fused__native_batch_norm_legit_functional_add_clone_fill_mul_sigmoid_sub_157 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[16777216], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp16', 6: '*fp16', 7: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7), equal_to_1=())]})
@triton.jit
def triton_poi_fused__native_batch_norm_legit_functional_add_clone_fill_mul_sigmoid_sub_157(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, out_ptr0, out_ptr1, xnumel, XBLOCK : tl.constexpr):
xnumel = 12042240
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x3 = xindex
x1 = (xindex // 196) % 480
tmp0 = tl.load(in_ptr0 + (x3), None).to(tl.float32)
tmp2 = tl.load(in_ptr1 + (x1), None)
tmp4 = tl.load(in_ptr2 + (x1), None)
tmp11 = tl.load(in_ptr3 + (x1), None)
tmp13 = tl.load(in_ptr4 + (x1), None)
tmp1 = tmp0.to(tl.float32)
tmp3 = tmp1 - tmp2
tmp5 = 25088.0
tmp6 = tmp4 / tmp5
tmp7 = 1e-05
tmp8 = tmp6 + tmp7
tmp9 = tl.math.rsqrt(tmp8)
tmp10 = tmp3 * tmp9
tmp12 = tmp10 * tmp11
tmp14 = tmp12 + tmp13
tmp15 = tmp14.to(tl.float32)
tmp16 = tl.sigmoid(tmp15)
tmp17 = 1.0
tmp18 = tmp17 - tmp16
tmp19 = tmp15 * tmp18
tmp20 = tmp19 + tmp17
tmp21 = tmp16 * tmp20
tl.store(out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp15, None)
tl.store(out_ptr1 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp21, None)
def get_args():
arg_0 = rand_strided((128, 480, 14, 14), (94080, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((1, 480, 1, 1), (480, 1, 480, 480), device='cuda:0', dtype=torch.float32)
arg_2 = rand_strided((1, 480, 1, 1), (480, 1, 480, 480), device='cuda:0', dtype=torch.float32)
arg_3 = rand_strided((480,), (1,), device='cuda:0', dtype=torch.float32)
arg_4 = rand_strided((480,), (1,), device='cuda:0', dtype=torch.float32)
arg_5 = rand_strided((128, 480, 14, 14), (94080, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_6 = rand_strided((128, 480, 14, 14), (94080, 196, 14, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__native_batch_norm_legit_functional_add_clone_fill_mul_sigmoid_sub_157.run(*args, 12042240, grid=grid(12042240), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__native_batch_norm_legit_functional_add_clone_fill_mul_sigmoid_sub_157.benchmark_all_configs(*args, 12042240, grid=grid(12042240))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/hb/chbvxlmers2jwf43nczkvjub2nxsnhvu5csbgr2u6jwg4rlcdpzl.py
# Original ATen: aten.split_with_sizes
# aten.split_with_sizes => getitem_242
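# The 480 channels are split four ways, apparently for mixed depthwise convs
# of different kernel sizes (5x5, 7x7 and 9x9 weights are cast below). This
# kernel takes channels [0, 120) of the 94080-stride layout and applies SiLU
# in the same pass. Eager sketch:
#   out = torch.nn.functional.silu(x[:, 0:120].float()).half()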
triton_poi_fused_split_with_sizes_158 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_split_with_sizes_158(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 3010560
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex % 23520
x1 = (xindex // 23520)
x2 = xindex
tmp0 = tl.load(in_ptr0 + (x0 + (94080*x1)), None).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
tmp2 = tl.sigmoid(tmp1)
tmp3 = tmp1 * tmp2
tmp4 = tmp3.to(tl.float32)
tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp4, None)
def get_args():
arg_0 = rand_strided((128, 480, 14, 14), (94080, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 120, 14, 14), (23520, 196, 14, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_split_with_sizes_158.run(*args, 3010560, grid=grid(3010560), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_split_with_sizes_158.benchmark_all_configs(*args, 3010560, grid=grid(3010560))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/cu/ccuyxkbjwe7ff4mqowk3q7to3cbjg3o7g2sh6doygm5y2bnlvlfi.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_241
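# Depthwise weight cast: (120, 1, 5, 5) fp32 -> fp16 for the first mixed-conv
# branch.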
triton_poi_fused__to_copy_159 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4096], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_159(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 3000
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((120, 1, 5, 5), (25, 25, 5, 1), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((120, 1, 5, 5), (25, 25, 5, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_159.run(*args, 3000, grid=grid(3000), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_159.benchmark_all_configs(*args, 3000, grid=grid(3000))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/dg/cdglxs23pt7h2wls62uy6ewinjtjctcradtpw76t3qcrvnhso4kr.py
# Original ATen: aten.split_with_sizes
# aten.split_with_sizes => getitem_247
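# Same split + SiLU as kernel 158, but for channels [120, 240): note the
# 23520 = 120*196 base offset in the load.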
triton_poi_fused_split_with_sizes_160 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_split_with_sizes_160(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 3010560
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex % 23520
x1 = (xindex // 23520)
x2 = xindex
tmp0 = tl.load(in_ptr0 + (23520 + x0 + (94080*x1)), None).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
tmp2 = tl.sigmoid(tmp1)
tmp3 = tmp1 * tmp2
tmp4 = tmp3.to(tl.float32)
tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp4, None)
def get_args():
arg_0 = rand_strided((128, 480, 14, 14), (94080, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 120, 14, 14), (23520, 196, 14, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_split_with_sizes_160.run(*args, 3010560, grid=grid(3010560), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_split_with_sizes_160.benchmark_all_configs(*args, 3010560, grid=grid(3010560))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/v4/cv4uf7ojdb6edtvnqbvbxjvi624yvygrldwzdl6uy2gdzz3sghqp.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_242
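# Depthwise weight cast: (120, 1, 7, 7) fp32 -> fp16.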
triton_poi_fused__to_copy_161 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[8192], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_161(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 5880
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((120, 1, 7, 7), (49, 49, 7, 1), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((120, 1, 7, 7), (49, 49, 7, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_161.run(*args, 5880, grid=grid(5880), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_161.benchmark_all_configs(*args, 5880, grid=grid(5880))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/bn/cbnnjypiavc2wmccrce2li6mllqpp2n4naaetozpnboypgpizsqn.py
# Original ATen: aten.split_with_sizes
# aten.split_with_sizes => getitem_252
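# Split + SiLU for channels [240, 360) (base offset 47040 = 240*196).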
triton_poi_fused_split_with_sizes_162 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_split_with_sizes_162(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 3010560
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex % 23520
x1 = (xindex // 23520)
x2 = xindex
tmp0 = tl.load(in_ptr0 + (47040 + x0 + (94080*x1)), None).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
tmp2 = tl.sigmoid(tmp1)
tmp3 = tmp1 * tmp2
tmp4 = tmp3.to(tl.float32)
tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp4, None)
def get_args():
arg_0 = rand_strided((128, 480, 14, 14), (94080, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 120, 14, 14), (23520, 196, 14, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_split_with_sizes_162.run(*args, 3010560, grid=grid(3010560), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_split_with_sizes_162.benchmark_all_configs(*args, 3010560, grid=grid(3010560))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/4m/c4mgwvuur32h35cj53cazh6flbatygb5baks32qmyo4zqc7drw5k.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_243
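# Depthwise weight cast: (120, 1, 9, 9) fp32 -> fp16.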
triton_poi_fused__to_copy_163 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[16384], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_163(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 9720
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((120, 1, 9, 9), (81, 81, 9, 1), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((120, 1, 9, 9), (81, 81, 9, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_163.run(*args, 9720, grid=grid(9720), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_163.benchmark_all_configs(*args, 9720, grid=grid(9720))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/4m/c4mepa4uirogjqorspxpl4t5fx2rzmjdzatpwzpg64qrcyuvg7m4.py
# Original ATen: aten.split_with_sizes
# aten.split_with_sizes => getitem_257
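# Split + SiLU for the last slice, channels [360, 480) (base offset
# 70560 = 360*196).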
triton_poi_fused_split_with_sizes_164 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_split_with_sizes_164(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 3010560
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex % 23520
x1 = (xindex // 23520)
x2 = xindex
tmp0 = tl.load(in_ptr0 + (70560 + x0 + (94080*x1)), None).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
tmp2 = tl.sigmoid(tmp1)
tmp3 = tmp1 * tmp2
tmp4 = tmp3.to(tl.float32)
tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp4, None)
def get_args():
arg_0 = rand_strided((128, 480, 14, 14), (94080, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 120, 14, 14), (23520, 196, 14, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_split_with_sizes_164.run(*args, 3010560, grid=grid(3010560), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_split_with_sizes_164.benchmark_all_configs(*args, 3010560, grid=grid(3010560))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/qv/cqvo24surldsdqwccrhisp4chcnkg2otiq5qykrtymzijwmytnb7.py
# Original ATen: aten.cat
# aten.cat => cat_26
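# Concat of the depthwise-conv outputs: each 120-channel branch is copied
# into its slice of the shared 480-channel buffer (the x1*94080 stride leaves
# room for the other three branches); the wrapper presumably passes a base
# pointer offset per branch.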
triton_poi_fused_cat_165 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_cat_165(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 3010560
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x2 = xindex
x0 = xindex % 23520
x1 = (xindex // 23520)
tmp0 = tl.load(in_ptr0 + (x2), None).to(tl.float32)
tl.store(out_ptr0 + (x0 + (94080*x1) + tl.zeros([XBLOCK], tl.int32)), tmp0, None)
def get_args():
arg_0 = rand_strided((128, 120, 14, 14), (23520, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 120, 14, 14), (94080, 196, 14, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_cat_165.run(*args, 3010560, grid=grid(3010560), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_cat_165.benchmark_all_configs(*args, 3010560, grid=grid(3010560))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/25/c25wuycbavajx7kwcvwpcba6kxp6j5oofnjkbplrrxckdmxy3shz.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.mean, aten.silu
# aten._native_batch_norm_legit_functional => add_194, add_197, convert_element_type_244, convert_element_type_245, mul_296, mul_302, rsqrt_37, sub_37, var_mean_37
# aten.mean => mean_9
# aten.silu => convert_element_type_246, convert_element_type_247, mul_303, sigmoid_37
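# A three-way fusion: apply the 480-channel batch norm, run SiLU on the
# result, and reduce the 14x14 spatial grid to a per-(image, channel) mean,
# i.e. the global average pool that presumably feeds the squeeze-excite
# branch. Roughly:
#   y = bn(x); pooled = torch.nn.functional.silu(y).mean(dim=(2, 3), keepdim=True)
# The normalized (pre-SiLU) tensor goes to out_ptr0 and the pooled fp16
# means to out_ptr2.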
triton_per_fused__native_batch_norm_legit_functional_mean_silu_166 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import persistent_reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@persistent_reduction(
size_hints=[65536, 256],
reduction_hint=ReductionHint.INNER,
filename=__file__,
meta={'signature': {0: '*fp16', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp16', 6: '*fp16', 7: 'i32', 8: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7), equal_to_1=())]}
)
@triton.jit
def triton_per_fused__native_batch_norm_legit_functional_mean_silu_166(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, out_ptr0, out_ptr2, xnumel, rnumel, XBLOCK : tl.constexpr):
xnumel = 61440
rnumel = 196
RBLOCK: tl.constexpr = 256
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
xmask = xindex < xnumel
rindex = tl.arange(0, RBLOCK)[None, :]
rmask = rindex < rnumel
r2 = rindex
x3 = xindex
x0 = xindex % 480
tmp0 = tl.load(in_ptr0 + (r2 + (196*x3)), rmask, other=0).to(tl.float32)
tmp2 = tl.load(in_ptr1 + (x0), None)
tmp4 = tl.load(in_ptr2 + (x0), None)
tmp11 = tl.load(in_ptr3 + (x0), None)
tmp13 = tl.load(in_ptr4 + (x0), None)
tmp1 = tmp0.to(tl.float32)
tmp3 = tmp1 - tmp2
tmp5 = 25088.0
tmp6 = tmp4 / tmp5
tmp7 = 1e-05
tmp8 = tmp6 + tmp7
tmp9 = tl.math.rsqrt(tmp8)
tmp10 = tmp3 * tmp9
tmp12 = tmp10 * tmp11
tmp14 = tmp12 + tmp13
tmp15 = tmp14.to(tl.float32)
tmp16 = tmp15.to(tl.float32)
tmp17 = tl.sigmoid(tmp16)
tmp18 = tmp16 * tmp17
tmp19 = tmp18.to(tl.float32)
tmp20 = tmp19.to(tl.float32)
tmp22 = tl.where(rmask, tmp20, 0)
tmp23 = tl.sum(tmp22, 1)[:, None]
tmp24 = 196.0
tmp25 = tmp23 / tmp24
tmp26 = tmp25.to(tl.float32)
tl.store(out_ptr0 + (r2 + (196*x3) + tl.zeros([XBLOCK, RBLOCK], tl.int32)), tmp15, rmask)
tl.store(out_ptr2 + (x3 + tl.zeros([XBLOCK, 1], tl.int32)), tmp26, None)
def get_args():
arg_0 = rand_strided((128, 480, 14, 14), (94080, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((1, 480, 1, 1), (480, 1, 480, 480), device='cuda:0', dtype=torch.float32)
arg_2 = rand_strided((1, 480, 1, 1), (480, 1, 480, 480), device='cuda:0', dtype=torch.float32)
arg_3 = rand_strided((480,), (1,), device='cuda:0', dtype=torch.float32)
arg_4 = rand_strided((480,), (1,), device='cuda:0', dtype=torch.float32)
arg_5 = rand_strided((128, 480, 14, 14), (94080, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_6 = rand_strided((128, 480, 1, 1), (480, 1, 1, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_per_fused__native_batch_norm_legit_functional_mean_silu_166.run(*args, 61440, 196, grid=grid(61440), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_per_fused__native_batch_norm_legit_functional_mean_silu_166.benchmark_all_configs(*args, 61440, 196, grid=grid(61440))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
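# What the persistent reduction above computes, per (sample, channel) row of
# 196 spatial elements: batch-norm normalization with precomputed per-channel
# mean and variance sums (25088 = 128*14*14 elements per channel), then SiLU,
# then a spatial average, which feeds the global pool of what appears to be a
# squeeze-and-excitation block. A minimal eager-mode sketch, assuming the
# get_args shapes (the helper name is illustrative):
def _sketch_bn_silu_mean_166(x, mean, var_sum, weight, bias):
    import torch
    y = (x.float() - mean) * torch.rsqrt(var_sum / 25088.0 + 1e-5)
    y = (y * weight.view(1, -1, 1, 1) + bias.view(1, -1, 1, 1)).half()
    pooled = torch.nn.functional.silu(y.float()).mean(dim=(2, 3), keepdim=True).half()
    return y, pooled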
# kernel path: /tmp/torchinductor_shunting/vh/cvhn4ymz4o7zw5xdmhfhtnvjkm7lfirplpypnuvceox4nfhcy7wo.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_249
triton_poi_fused__to_copy_167 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[65536], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_167(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 38400
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((80, 480, 1, 1), (480, 1, 1, 1), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((80, 480, 1, 1), (480, 1, 1, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_167.run(*args, 38400, grid=grid(38400), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_167.benchmark_all_configs(*args, 38400, grid=grid(38400))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
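# triton_poi_fused__to_copy_167 is a pure dtype cast: it materializes an fp16
# copy of the fp32 (80, 480, 1, 1) pointwise-conv weight so the following
# convolution can run in half precision. Roughly equivalent eager-mode sketch
# (helper name illustrative):
def _sketch_to_copy_167(w_fp32):
    # Same values and strides, fp32 to fp16.
    return w_fp32.half()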
# kernel path: /tmp/torchinductor_shunting/uc/cuc5yc2wu6mlfc426p45ysrpjdvcf67n4fnoghr4zijaa4qc4jub.py
# Original ATen: aten._to_copy, aten.convolution
# aten._to_copy => convert_element_type_248
# aten.convolution => convolution_95
triton_poi_fused__to_copy_convolution_168 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[128], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_convolution_168(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 80
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((80,), (1,), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((80,), (1,), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_convolution_168.run(*args, 80, grid=grid(80), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_convolution_168.benchmark_all_configs(*args, 80, grid=grid(80))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/w4/cw4m6y7egnkzpotuw7yanoiegenydq7w76ehn3uqkz73dfsxrsvq.py
# Original ATen: aten._to_copy, aten.convolution, aten.silu
# aten._to_copy => convert_element_type_248
# aten.convolution => convolution_95
# aten.silu => convert_element_type_250, convert_element_type_251, mul_304, sigmoid_38
triton_poi_fused__to_copy_convolution_silu_169 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[16384], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_convolution_silu_169(in_out_ptr0, in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 10240
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x2 = xindex
x0 = xindex % 80
tmp0 = tl.load(in_out_ptr0 + (x2), None).to(tl.float32)
tmp1 = tl.load(in_ptr0 + (x0), None).to(tl.float32)
tmp2 = tmp0 + tmp1
tmp3 = tmp2.to(tl.float32)
tmp4 = tl.sigmoid(tmp3)
tmp5 = tmp3 * tmp4
tmp6 = tmp5.to(tl.float32)
tl.store(in_out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp2, None)
tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp6, None)
def get_args():
arg_0 = rand_strided((128, 80, 1, 1), (80, 1, 1, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((80,), (1,), device='cuda:0', dtype=torch.float16)
arg_2 = rand_strided((128, 80, 1, 1), (80, 1, 1, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1, arg_2,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_convolution_silu_169.run(*args, 10240, grid=grid(10240), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_convolution_silu_169.benchmark_all_configs(*args, 10240, grid=grid(10240))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
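# The kernel above fuses the bias add of convolution_95 with its SiLU
# activation on the (128, 80, 1, 1) bottleneck: the biased pre-activation is
# written back in place (in_out_ptr0) and the activated value goes to
# out_ptr0. A minimal eager-mode sketch (helper name illustrative):
def _sketch_bias_silu_169(conv_out, bias):
    import torch
    conv_out += bias.view(1, -1, 1, 1)   # in-place, mirrors in_out_ptr0
    act = torch.nn.functional.silu(conv_out.float()).half()
    return conv_out, act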
# kernel path: /tmp/torchinductor_shunting/5w/c5wu2byqoqpdpilokyxdylwqvzl3vlutvi3qed6ouxeny7v3ma4k.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_253
triton_poi_fused__to_copy_170 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[65536], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_170(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 38400
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((480, 80, 1, 1), (80, 1, 1, 1), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((480, 80, 1, 1), (80, 1, 1, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_170.run(*args, 38400, grid=grid(38400), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_170.benchmark_all_configs(*args, 38400, grid=grid(38400))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/t6/ct65ncuugfm3kp7u2z2xcftkozxdb7egjj3feed7rrbxlnc25mxn.py
# Original ATen: aten._to_copy, aten.convolution
# aten._to_copy => convert_element_type_252
# aten.convolution => convolution_96
triton_poi_fused__to_copy_convolution_171 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[512], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_convolution_171(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 480
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((480,), (1,), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((480,), (1,), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_convolution_171.run(*args, 480, grid=grid(480), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_convolution_171.benchmark_all_configs(*args, 480, grid=grid(480))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/q3/cq36nnpdlb4xwdvwsiktoah4r62ugwqlfeikgt4twhaobbsdqnku.py
# Original ATen: aten._to_copy, aten.convolution
# aten._to_copy => convert_element_type_252
# aten.convolution => convolution_96
triton_poi_fused__to_copy_convolution_172 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[65536], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_convolution_172(in_out_ptr0, in_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 61440
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x2 = xindex
x0 = xindex % 480
tmp0 = tl.load(in_out_ptr0 + (x2), None).to(tl.float32)
tmp1 = tl.load(in_ptr0 + (x0), None).to(tl.float32)
tmp2 = tmp0 + tmp1
tl.store(in_out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp2, None)
def get_args():
arg_0 = rand_strided((128, 480, 1, 1), (480, 1, 1, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((480,), (1,), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_convolution_172.run(*args, 61440, grid=grid(61440), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_convolution_172.benchmark_all_configs(*args, 61440, grid=grid(61440))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
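# triton_poi_fused__to_copy_convolution_172 only folds the 480-wide bias into
# the conv output in place; the sigmoid gating happens in the next kernel.
# Eager-mode sketch (helper name illustrative):
def _sketch_bias_add_172(conv_out, bias):
    conv_out += bias.view(1, -1, 1, 1)
    return conv_out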
# kernel path: /tmp/torchinductor_shunting/ce/cce2z44226346eg4knvdzt24sfsujmrpjne4vcofj37miaugwnek.py
# Original ATen: aten.mul, aten.sigmoid, aten.silu
# aten.mul => mul_305
# aten.sigmoid => sigmoid_39
# aten.silu => convert_element_type_246, convert_element_type_247, mul_303, sigmoid_37
triton_poi_fused_mul_sigmoid_silu_173 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[16777216], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3), equal_to_1=())]})
@triton.jit
def triton_poi_fused_mul_sigmoid_silu_173(in_ptr0, in_ptr1, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 12042240
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x2 = xindex
x1 = (xindex // 196)
tmp0 = tl.load(in_ptr0 + (x2), None).to(tl.float32)
tmp5 = tl.load(in_ptr1 + (x1), None).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
tmp2 = tl.sigmoid(tmp1)
tmp3 = tmp1 * tmp2
tmp4 = tmp3.to(tl.float32)
tmp6 = tl.sigmoid(tmp5)
tmp7 = tmp4 * tmp6
tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp7, None)
def get_args():
arg_0 = rand_strided((128, 480, 14, 14), (94080, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 480, 1, 1), (480, 1, 1, 1), device='cuda:0', dtype=torch.float16)
arg_2 = rand_strided((128, 480, 14, 14), (94080, 196, 14, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1, arg_2,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_mul_sigmoid_silu_173.run(*args, 12042240, grid=grid(12042240), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_mul_sigmoid_silu_173.benchmark_all_configs(*args, 12042240, grid=grid(12042240))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
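# The kernel above applies the excitation gate: SiLU on the main
# (128, 480, 14, 14) branch, sigmoid on the per-channel (128, 480, 1, 1)
# logits, and a broadcast multiply over the 14x14 grid. A minimal eager-mode
# sketch, assuming the get_args shapes:
def _sketch_se_gate_173(x, se_logits):
    import torch
    return (torch.nn.functional.silu(x.float()) * torch.sigmoid(se_logits.float())).half()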
# kernel path: /tmp/torchinductor_shunting/jt/cjtarvyynr3tkfvm37qgtcwaksnjyb62otlxal5l5thrlfpz4yoz.py
# Original ATen: aten.split_with_sizes
# aten.split_with_sizes => getitem_260
triton_poi_fused_split_with_sizes_174 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[8388608], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_split_with_sizes_174(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 6021120
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex % 47040
x1 = (xindex // 47040)
x2 = xindex
tmp0 = tl.load(in_ptr0 + (x0 + (94080*x1)), None).to(tl.float32)
tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp0, None)
def get_args():
arg_0 = rand_strided((128, 480, 14, 14), (94080, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 240, 14, 14), (47040, 196, 14, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_split_with_sizes_174.run(*args, 6021120, grid=grid(6021120), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_split_with_sizes_174.benchmark_all_configs(*args, 6021120, grid=grid(6021120))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
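# triton_poi_fused_split_with_sizes_174 materializes the first half of a
# split_with_sizes([240, 240], dim=1): channels 0..239 of the 480-channel
# input are copied into a contiguous buffer (47040 = 240*196 elements per
# sample); the companion kernel _175 below copies channels 240..479 from
# offset 47040. Eager-mode sketch (helper name illustrative):
def _sketch_split_branch_174(x):
    return x[:, :240].contiguous()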
# kernel path: /tmp/torchinductor_shunting/zb/czbuvo3mdbqgu4vhbh2rqqgxj7l4q2oef5prb5xafk4aclfa4wgv.py
# Original ATen: aten.split_with_sizes
# aten.split_with_sizes => getitem_261
triton_poi_fused_split_with_sizes_175 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[8388608], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_split_with_sizes_175(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 6021120
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex % 47040
x1 = (xindex // 47040)
x2 = xindex
tmp0 = tl.load(in_ptr0 + (47040 + x0 + (94080*x1)), None).to(tl.float32)
tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp0, None)
def get_args():
arg_0 = rand_strided((128, 480, 14, 14), (94080, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 240, 14, 14), (47040, 196, 14, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_split_with_sizes_175.run(*args, 6021120, grid=grid(6021120), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_split_with_sizes_175.benchmark_all_configs(*args, 6021120, grid=grid(6021120))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/fo/cfobovxlpvp6vyrqqsfdz6jlgrpfwpemctlxrrgsykw3zih3en3k.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_254
triton_poi_fused__to_copy_176 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[32768], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_176(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 19200
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((80, 240, 1, 1), (240, 1, 1, 1), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((80, 240, 1, 1), (240, 1, 1, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_176.run(*args, 19200, grid=grid(19200), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_176.benchmark_all_configs(*args, 19200, grid=grid(19200))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/66/c66wd4retmi6azmfzfxmll5pmv3btijpcst56cykpuynmmhzhq23.py
# Original ATen: aten.cat
# aten.cat => cat_27
triton_poi_fused_cat_177 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[2097152], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_cat_177(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 2007040
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x2 = xindex
x0 = xindex % 15680
x1 = (xindex // 15680)
tmp0 = tl.load(in_ptr0 + (x2), None).to(tl.float32)
tl.store(out_ptr0 + (x0 + (31360*x1) + tl.zeros([XBLOCK], tl.int32)), tmp0, None)
def get_args():
arg_0 = rand_strided((128, 80, 14, 14), (15680, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 80, 14, 14), (31360, 196, 14, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_cat_177.run(*args, 2007040, grid=grid(2007040), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_cat_177.benchmark_all_configs(*args, 2007040, grid=grid(2007040))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/4x/c4xzenlwjgchxhkagftgwbnakluheaf3fbwuqeozpqdbo7czlfp4.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.add
# aten._native_batch_norm_legit_functional => add_199, add_202, convert_element_type_256, convert_element_type_257, mul_306, mul_312, rsqrt_38, sub_38, var_mean_38
# aten.add => add_203
triton_poi_fused__native_batch_norm_legit_functional_add_178 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp16', 6: '*fp16', 7: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7), equal_to_1=())]})
@triton.jit
def triton_poi_fused__native_batch_norm_legit_functional_add_178(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 4014080
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x3 = xindex
x1 = (xindex // 196) % 160
tmp0 = tl.load(in_ptr0 + (x3), None).to(tl.float32)
tmp2 = tl.load(in_ptr1 + (x1), None)
tmp4 = tl.load(in_ptr2 + (x1), None)
tmp11 = tl.load(in_ptr3 + (x1), None)
tmp13 = tl.load(in_ptr4 + (x1), None)
tmp16 = tl.load(in_ptr5 + (x3), None).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
tmp3 = tmp1 - tmp2
tmp5 = 25088.0
tmp6 = tmp4 / tmp5
tmp7 = 1e-05
tmp8 = tmp6 + tmp7
tmp9 = tl.math.rsqrt(tmp8)
tmp10 = tmp3 * tmp9
tmp12 = tmp10 * tmp11
tmp14 = tmp12 + tmp13
tmp15 = tmp14.to(tl.float32)
tmp17 = tmp15 + tmp16
tl.store(out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp17, None)
def get_args():
arg_0 = rand_strided((128, 160, 14, 14), (31360, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((1, 160, 1, 1), (160, 1, 160, 160), device='cuda:0', dtype=torch.float32)
arg_2 = rand_strided((1, 160, 1, 1), (160, 1, 160, 160), device='cuda:0', dtype=torch.float32)
arg_3 = rand_strided((160,), (1,), device='cuda:0', dtype=torch.float32)
arg_4 = rand_strided((160,), (1,), device='cuda:0', dtype=torch.float32)
arg_5 = rand_strided((128, 160, 14, 14), (31360, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_6 = rand_strided((128, 160, 14, 14), (31360, 196, 14, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__native_batch_norm_legit_functional_add_178.run(*args, 4014080, grid=grid(4014080), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__native_batch_norm_legit_functional_add_178.benchmark_all_configs(*args, 4014080, grid=grid(4014080))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
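# The kernel above is the inverted-residual tail: batch-norm the 160-channel
# projection-conv output with precomputed statistics, then add the skip
# connection (in_ptr5). A minimal eager-mode sketch, assuming the get_args
# shapes:
def _sketch_bn_residual_178(x, mean, var_sum, weight, bias, shortcut):
    import torch
    y = (x.float() - mean) * torch.rsqrt(var_sum / 25088.0 + 1e-5)
    y = (y * weight.view(1, -1, 1, 1) + bias.view(1, -1, 1, 1)).half()
    return y + shortcut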
# kernel path: /tmp/torchinductor_shunting/xt/cxt5mn4i2rohwchtz75fbvbykzhjdnjrf3qopbwsyyqyu5jqlzr6.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_306
triton_poi_fused__to_copy_179 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[262144], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_179(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 153600
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), None)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, None)
def get_args():
arg_0 = rand_strided((960, 160, 1, 1), (160, 1, 1, 1), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((960, 160, 1, 1), (160, 1, 1, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_179.run(*args, 153600, grid=grid(153600), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_179.benchmark_all_configs(*args, 153600, grid=grid(153600))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/g5/cg527ugqaap3ndd26uaeekojscsqxex2xmpgnu55nr6zwyzt6pfr.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => add_237, add_238, add_239, convert_element_type_307, mul_364, mul_365, mul_366, mul_367, mul_368, rsqrt_45, squeeze_136, var_mean_45
triton_red_fused__native_batch_norm_legit_functional_180 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@reduction(
size_hints=[1024, 32768],
reduction_hint=ReductionHint.INNER,
filename=__file__,
meta={'signature': {0: '*fp32', 1: '*fp16', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: 'i32', 9: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_180(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, out_ptr0, out_ptr1, out_ptr2, out_ptr3, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
xnumel = 960
rnumel = 25088
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
xmask = xindex < xnumel
rbase = tl.arange(0, RBLOCK)[None, :]
x0 = xindex
_tmp2 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
for roffset in range(0, rnumel, RBLOCK):
rindex = roffset + rbase
rmask = rindex < rnumel
r1 = rindex % 196
r2 = (rindex // 196)
tmp0 = tl.load(in_ptr0 + (r1 + (196*x0) + (188160*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
_tmp2 = tl.where(rmask & xmask, _tmp2 + tmp1, _tmp2)
tmp2 = tl.sum(_tmp2, 1)[:, None]
tmp7 = tl.load(in_ptr1 + (x0), xmask)
tmp3 = 25088.0
tmp4 = tmp2 / tmp3
tmp5 = 0.1
tmp6 = tmp4 * tmp5
tmp8 = 0.9
tmp9 = tmp7 * tmp8
tmp10 = tmp6 + tmp9
tl.store(in_out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp4, xmask)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp10, xmask)
_tmp15 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
for roffset in range(0, rnumel, RBLOCK):
rindex = roffset + rbase
rmask = rindex < rnumel
r1 = rindex % 196
r2 = (rindex // 196)
tmp11 = tl.load(in_ptr0 + (r1 + (196*x0) + (188160*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp12 = tmp11.to(tl.float32)
tmp13 = tmp12 - tmp4
tmp14 = tmp13 * tmp13
_tmp15 = tl.where(rmask & xmask, _tmp15 + tmp14, _tmp15)
tmp15 = tl.sum(_tmp15, 1)[:, None]
tl.store(out_ptr1 + x0, tmp15, xmask)
tmp25 = tl.load(in_ptr2 + (x0), xmask)
tmp16 = 25088.0
tmp17 = tmp15 / tmp16
tmp18 = 1e-05
tmp19 = tmp17 + tmp18
tmp20 = tl.math.rsqrt(tmp19)
tmp21 = 1.0000398612827361
tmp22 = tmp17 * tmp21
tmp23 = 0.1
tmp24 = tmp22 * tmp23
tmp26 = 0.9
tmp27 = tmp25 * tmp26
tmp28 = tmp24 + tmp27
tl.store(out_ptr2 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp20, xmask)
tl.store(out_ptr3 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp28, xmask)
def get_args():
arg_0 = rand_strided((1, 960, 1, 1), (960, 1, 960, 960), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((128, 960, 14, 14), (188160, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_2 = rand_strided((960,), (1,), device='cuda:0', dtype=torch.float32)
arg_3 = rand_strided((960,), (1,), device='cuda:0', dtype=torch.float32)
arg_4 = rand_strided((960,), (1,), device='cuda:0', dtype=torch.float32)
arg_5 = rand_strided((1, 960, 1, 1), (960, 1, 960, 960), device='cuda:0', dtype=torch.float32)
arg_6 = rand_strided((960,), (1,), device='cuda:0', dtype=torch.float32)
arg_7 = rand_strided((960,), (1,), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_red_fused__native_batch_norm_legit_functional_180.run(*args, 960, 25088, grid=grid(960), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_red_fused__native_batch_norm_legit_functional_180.benchmark_all_configs(*args, 960, 25088, grid=grid(960))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
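# The reduction above makes two passes over the 25088 = 128*14*14 elements of
# each of the 960 channels: pass one accumulates the mean (and its
# momentum-0.1 running update), pass two the variance, rsqrt(var + 1e-5), and
# the running-variance update. The constant 1.0000398612827361 is the
# unbiased correction 25088 / 25087 applied before folding into running_var.
# A minimal eager-mode sketch of the statistics (helper name illustrative):
def _sketch_bn_stats_180(x, running_mean, running_var):
    import torch
    n = x.shape[0] * x.shape[2] * x.shape[3]            # 128 * 14 * 14 = 25088
    mean = x.float().mean(dim=(0, 2, 3))
    var = x.float().var(dim=(0, 2, 3), unbiased=False)  # biased, as in the kernel
    new_rm = 0.1 * mean + 0.9 * running_mean
    new_rv = 0.1 * var * (n / (n - 1)) + 0.9 * running_var
    return mean, torch.rsqrt(var + 1e-5), new_rm, new_rv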
# kernel path: /tmp/torchinductor_shunting/dy/cdyo4fykv4n564s3r3eahdxt2kdz67oeovon5bt5bw2lnq6iiowx.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.add, aten.clone, aten.fill, aten.mul, aten.sigmoid, aten.sub
# aten._native_batch_norm_legit_functional => add_237, add_240, convert_element_type_307, convert_element_type_308, mul_363, mul_369, rsqrt_45, sub_45, var_mean_45
# aten.add => add_322
# aten.clone => clone_36
# aten.fill => full_like_11
# aten.mul => mul_627, mul_628
# aten.sigmoid => sigmoid_75
# aten.sub => sub_121
triton_poi_fused__native_batch_norm_legit_functional_add_clone_fill_mul_sigmoid_sub_181 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[33554432], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp16', 6: '*fp16', 7: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7), equal_to_1=())]})
@triton.jit
def triton_poi_fused__native_batch_norm_legit_functional_add_clone_fill_mul_sigmoid_sub_181(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, out_ptr0, out_ptr1, xnumel, XBLOCK : tl.constexpr):
xnumel = 24084480
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x3 = xindex
x1 = (xindex // 196) % 960
tmp0 = tl.load(in_ptr0 + (x3), None).to(tl.float32)
tmp2 = tl.load(in_ptr1 + (x1), None)
tmp4 = tl.load(in_ptr2 + (x1), None)
tmp11 = tl.load(in_ptr3 + (x1), None)
tmp13 = tl.load(in_ptr4 + (x1), None)
tmp1 = tmp0.to(tl.float32)
tmp3 = tmp1 - tmp2
tmp5 = 25088.0
tmp6 = tmp4 / tmp5
tmp7 = 1e-05
tmp8 = tmp6 + tmp7
tmp9 = tl.math.rsqrt(tmp8)
tmp10 = tmp3 * tmp9
tmp12 = tmp10 * tmp11
tmp14 = tmp12 + tmp13
tmp15 = tmp14.to(tl.float32)
tmp16 = tl.sigmoid(tmp15)
tmp17 = 1.0
tmp18 = tmp17 - tmp16
tmp19 = tmp15 * tmp18
tmp20 = tmp19 + tmp17
tmp21 = tmp16 * tmp20
tl.store(out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp15, None)
tl.store(out_ptr1 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp21, None)
def get_args():
arg_0 = rand_strided((128, 960, 14, 14), (188160, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((1, 960, 1, 1), (960, 1, 960, 960), device='cuda:0', dtype=torch.float32)
arg_2 = rand_strided((1, 960, 1, 1), (960, 1, 960, 960), device='cuda:0', dtype=torch.float32)
arg_3 = rand_strided((960,), (1,), device='cuda:0', dtype=torch.float32)
arg_4 = rand_strided((960,), (1,), device='cuda:0', dtype=torch.float32)
arg_5 = rand_strided((128, 960, 14, 14), (188160, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_6 = rand_strided((128, 960, 14, 14), (188160, 196, 14, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__native_batch_norm_legit_functional_add_clone_fill_mul_sigmoid_sub_181.run(*args, 24084480, grid=grid(24084480), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__native_batch_norm_legit_functional_add_clone_fill_mul_sigmoid_sub_181.benchmark_all_configs(*args, 24084480, grid=grid(24084480))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
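# Besides the batch-norm output, the kernel above precomputes
# sigmoid(y) * (1 + y * (1 - sigmoid(y))), the derivative of SiLU at y, which
# the fill/sub/clone ops in the fused node suggest is saved for the backward
# pass. A minimal eager-mode sketch, assuming the get_args shapes:
def _sketch_bn_silu_grad_181(x, mean, var_sum, weight, bias):
    import torch
    y = (x.float() - mean) * torch.rsqrt(var_sum / 25088.0 + 1e-5)
    y = (y * weight.view(1, -1, 1, 1) + bias.view(1, -1, 1, 1)).half()
    s = torch.sigmoid(y)
    return y, s * (1 + y * (1 - s))   # value and d(silu)/dy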
# kernel path: /tmp/torchinductor_shunting/3n/c3n35ckjp4qebx5kiwvfxztqxmlk7uwm2vsaphijnyencaajb4gh.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_311
triton_poi_fused__to_copy_182 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4096], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_182(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 2160
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((240, 1, 3, 3), (9, 9, 3, 1), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((240, 1, 3, 3), (9, 9, 3, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_182.run(*args, 2160, grid=grid(2160), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_182.benchmark_all_configs(*args, 2160, grid=grid(2160))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/ow/cowqeeeuzgoqgwjjram7xczyaz465plzun33c7elszbys2k2zrzn.py
# Original ATen: aten.split_with_sizes
# aten.split_with_sizes => getitem_330
triton_poi_fused_split_with_sizes_183 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[8388608], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_split_with_sizes_183(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 6021120
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex % 47040
x1 = (xindex // 47040)
x2 = xindex
tmp0 = tl.load(in_ptr0 + (x0 + (188160*x1)), None).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
tmp2 = tl.sigmoid(tmp1)
tmp3 = tmp1 * tmp2
tmp4 = tmp3.to(tl.float32)
tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp4, None)
def get_args():
arg_0 = rand_strided((128, 960, 14, 14), (188160, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 240, 14, 14), (47040, 196, 14, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_split_with_sizes_183.run(*args, 6021120, grid=grid(6021120), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_split_with_sizes_183.benchmark_all_configs(*args, 6021120, grid=grid(6021120))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
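# triton_poi_fused_split_with_sizes_183 fuses a slice of the 960-channel
# expansion with its SiLU activation: channels 0..239 feed the first of the
# grouped depthwise convs (the surrounding _to_copy kernels cast 3x3, 5x5,
# 7x7, and 9x9 depthwise weights). Kernels _185, _187, and _189 below handle
# the slices starting at channels 240, 480, and 720 (offsets 47040, 94080,
# 141120). Eager-mode sketch (helper name illustrative):
def _sketch_split_silu_183(x):
    import torch
    return torch.nn.functional.silu(x[:, :240].float()).half()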
# kernel path: /tmp/torchinductor_shunting/qx/cqxoadvgndjsqu4bltg3sq7l3fij2isfiihadwqnkb3ibmloxfyz.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_312
triton_poi_fused__to_copy_184 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[8192], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_184(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 6000
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((240, 1, 5, 5), (25, 25, 5, 1), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((240, 1, 5, 5), (25, 25, 5, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_184.run(*args, 6000, grid=grid(6000), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_184.benchmark_all_configs(*args, 6000, grid=grid(6000))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/vk/cvkedu5ynhqgwwdblwjx7rst7rz42xs7fl3vaof4iorofkc5cxam.py
# Original ATen: aten.split_with_sizes
# aten.split_with_sizes => getitem_335
triton_poi_fused_split_with_sizes_185 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[8388608], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_split_with_sizes_185(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 6021120
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex % 47040
x1 = (xindex // 47040)
x2 = xindex
tmp0 = tl.load(in_ptr0 + (47040 + x0 + (188160*x1)), None).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
tmp2 = tl.sigmoid(tmp1)
tmp3 = tmp1 * tmp2
tmp4 = tmp3.to(tl.float32)
tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp4, None)
def get_args():
arg_0 = rand_strided((128, 960, 14, 14), (188160, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 240, 14, 14), (47040, 196, 14, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_split_with_sizes_185.run(*args, 6021120, grid=grid(6021120), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_split_with_sizes_185.benchmark_all_configs(*args, 6021120, grid=grid(6021120))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/nv/cnv3es7i7p4naxsxsc56c2w7slsaxha5zh6gcrrgizefpo7zps5s.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_313
triton_poi_fused__to_copy_186 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[16384], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_186(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 11760
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((240, 1, 7, 7), (49, 49, 7, 1), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((240, 1, 7, 7), (49, 49, 7, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_186.run(*args, 11760, grid=grid(11760), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_186.benchmark_all_configs(*args, 11760, grid=grid(11760))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/3q/c3qkjjd4hxw6ylyfib2kl5k6u5atp2kcpplda3gzr6acmswhzitj.py
# Original ATen: aten.split_with_sizes
# aten.split_with_sizes => getitem_340
triton_poi_fused_split_with_sizes_187 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[8388608], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_split_with_sizes_187(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 6021120
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex % 47040
x1 = (xindex // 47040)
x2 = xindex
tmp0 = tl.load(in_ptr0 + (94080 + x0 + (188160*x1)), None).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
tmp2 = tl.sigmoid(tmp1)
tmp3 = tmp1 * tmp2
tmp4 = tmp3.to(tl.float32)
tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp4, None)
def get_args():
arg_0 = rand_strided((128, 960, 14, 14), (188160, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 240, 14, 14), (47040, 196, 14, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_split_with_sizes_187.run(*args, 6021120, grid=grid(6021120), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_split_with_sizes_187.benchmark_all_configs(*args, 6021120, grid=grid(6021120))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/l7/cl7v5hh5jck72nsc2u3t2ieraqxarpadxdpy5nt2hkbbqrgcinn7.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_314
triton_poi_fused__to_copy_188 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[32768], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_188(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 19440
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((240, 1, 9, 9), (81, 81, 9, 1), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((240, 1, 9, 9), (81, 81, 9, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_188.run(*args, 19440, grid=grid(19440), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_188.benchmark_all_configs(*args, 19440, grid=grid(19440))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
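# Every triton_poi_fused__to_copy_* kernel in this file follows the pattern
# above: an elementwise fp32 -> fp16 cast of a parameter tensor for
# mixed-precision compute. A one-line eager sketch (helper name assumed):
def _eager_sketch_to_copy_188(weight_fp32):
    # weight_fp32: (240, 1, 9, 9) depthwise conv weight, as in get_args
    return weight_fp32.to(torch.float16)  # aten._to_copy => convert_element_type_314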
# kernel path: /tmp/torchinductor_shunting/xm/cxm7drxzaido67j4aqf4vqqzkyw7ffgx32ygj3ddi3kmzigoauhg.py
# Original ATen: aten.split_with_sizes
# aten.split_with_sizes => getitem_345
triton_poi_fused_split_with_sizes_189 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[8388608], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_split_with_sizes_189(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 6021120
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex % 47040
x1 = (xindex // 47040)
x2 = xindex
tmp0 = tl.load(in_ptr0 + (141120 + x0 + (188160*x1)), None).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
tmp2 = tl.sigmoid(tmp1)
tmp3 = tmp1 * tmp2
tmp4 = tmp3.to(tl.float32)
tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp4, None)
def get_args():
arg_0 = rand_strided((128, 960, 14, 14), (188160, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 240, 14, 14), (47040, 196, 14, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_split_with_sizes_189.run(*args, 6021120, grid=grid(6021120), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_split_with_sizes_189.benchmark_all_configs(*args, 6021120, grid=grid(6021120))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/ny/cnyr36xeex2s2grdckurzoliwfqkdakrsvrmgx5fd6rzjncjxedb.py
# Original ATen: aten.cat
# aten.cat => cat_34
triton_poi_fused_cat_190 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[2097152], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_cat_190(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 1505280
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x2 = xindex
x0 = xindex % 11760
x1 = (xindex // 11760)
tmp0 = tl.load(in_ptr0 + (x2), None).to(tl.float32)
tl.store(out_ptr0 + (x0 + (47040*x1) + tl.zeros([XBLOCK], tl.int32)), tmp0, None)
def get_args():
arg_0 = rand_strided((128, 240, 7, 7), (11760, 49, 7, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 240, 7, 7), (47040, 49, 7, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_cat_190.run(*args, 1505280, grid=grid(1505280), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_cat_190.benchmark_all_configs(*args, 1505280, grid=grid(1505280))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
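# Sketch of the cat kernel above (helper name assumed): it copies one
# contiguous (128, 240, 7, 7) chunk into a strided view of a preallocated
# (128, 960, 7, 7) concat buffer; 47040 = 960 * 49 is the buffer's batch
# stride and 11760 = 240 * 49 the chunk's. In this standalone harness the
# view starts at channel 0; sibling kernels fill the remaining channels of
# cat_34.
def _eager_sketch_cat_190(chunk, out):
    # chunk: (128, 240, 7, 7) fp16, contiguous; out: (128, 960, 7, 7) fp16
    out[:, 0:240].copy_(chunk)
    return out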
# kernel path: /tmp/torchinductor_shunting/z6/cz6ieqk3sfzltam3yeijfwdqrtao7w4zqdsqj32cpu6zoxg3irdq.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => add_242, add_243, add_244, convert_element_type_315, mul_372, mul_373, mul_374, mul_375, mul_376, rsqrt_46, squeeze_139, var_mean_46
triton_red_fused__native_batch_norm_legit_functional_191 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@reduction(
size_hints=[1024, 8192],
reduction_hint=ReductionHint.INNER,
filename=__file__,
meta={'signature': {0: '*fp32', 1: '*fp16', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: 'i32', 9: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_191(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, out_ptr0, out_ptr1, out_ptr2, out_ptr3, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
xnumel = 960
rnumel = 6272
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
xmask = xindex < xnumel
rbase = tl.arange(0, RBLOCK)[None, :]
x0 = xindex
_tmp2 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
for roffset in range(0, rnumel, RBLOCK):
rindex = roffset + rbase
rmask = rindex < rnumel
r1 = rindex % 49
r2 = (rindex // 49)
tmp0 = tl.load(in_ptr0 + (r1 + (49*x0) + (47040*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
_tmp2 = tl.where(rmask & xmask, _tmp2 + tmp1, _tmp2)
tmp2 = tl.sum(_tmp2, 1)[:, None]
    tmp7 = tl.load(in_ptr1 + (x0), xmask)  # running mean
    tmp3 = 6272.0  # elements reduced per channel: N*H*W = 128*7*7
    tmp4 = tmp2 / tmp3  # batch mean
    tmp5 = 0.1  # batch-norm momentum
    tmp6 = tmp4 * tmp5
    tmp8 = 0.9  # 1 - momentum
    tmp9 = tmp7 * tmp8
    tmp10 = tmp6 + tmp9  # updated running mean
    tl.store(in_out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp4, xmask)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp10, xmask)
_tmp15 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
for roffset in range(0, rnumel, RBLOCK):
rindex = roffset + rbase
rmask = rindex < rnumel
r1 = rindex % 49
r2 = (rindex // 49)
tmp11 = tl.load(in_ptr0 + (r1 + (49*x0) + (47040*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp12 = tmp11.to(tl.float32)
tmp13 = tmp12 - tmp4
tmp14 = tmp13 * tmp13
_tmp15 = tl.where(rmask & xmask, _tmp15 + tmp14, _tmp15)
tmp15 = tl.sum(_tmp15, 1)[:, None]
tl.store(out_ptr1 + x0, tmp15, xmask)
    tmp25 = tl.load(in_ptr2 + (x0), xmask)  # running variance
    tmp16 = 6272.0
    tmp17 = tmp15 / tmp16  # biased batch variance
    tmp18 = 1e-05  # eps
    tmp19 = tmp17 + tmp18
    tmp20 = tl.math.rsqrt(tmp19)  # rstd = 1/sqrt(var + eps)
    tmp21 = 1.0001594642002871  # Bessel correction: 6272 / 6271
    tmp22 = tmp17 * tmp21  # unbiased variance
    tmp23 = 0.1  # batch-norm momentum
    tmp24 = tmp22 * tmp23
    tmp26 = 0.9
    tmp27 = tmp25 * tmp26
    tmp28 = tmp24 + tmp27  # updated running variance
    tl.store(out_ptr2 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp20, xmask)
    tl.store(out_ptr3 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp28, xmask)
def get_args():
arg_0 = rand_strided((1, 960, 1, 1), (960, 1, 960, 960), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((128, 960, 7, 7), (47040, 49, 7, 1), device='cuda:0', dtype=torch.float16)
arg_2 = rand_strided((960,), (1,), device='cuda:0', dtype=torch.float32)
arg_3 = rand_strided((960,), (1,), device='cuda:0', dtype=torch.float32)
arg_4 = rand_strided((960,), (1,), device='cuda:0', dtype=torch.float32)
arg_5 = rand_strided((1, 960, 1, 1), (960, 1, 960, 960), device='cuda:0', dtype=torch.float32)
arg_6 = rand_strided((960,), (1,), device='cuda:0', dtype=torch.float32)
arg_7 = rand_strided((960,), (1,), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_red_fused__native_batch_norm_legit_functional_191.run(*args, 960, 6272, grid=grid(960), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_red_fused__native_batch_norm_legit_functional_191.benchmark_all_configs(*args, 960, 6272, grid=grid(960))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
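# The reduction kernel above is the statistics half of a training-mode batch
# norm: two passes over N*H*W = 128 * 7 * 7 = 6272 elements per channel give
# the batch mean and variance, and both running statistics are updated with
# momentum 0.1. An eager sketch under those assumptions (helper name assumed):
def _eager_sketch_bn_stats_191(x, running_mean, running_var, momentum=0.1, eps=1e-05):
    # x: (128, 960, 7, 7) fp16; statistics accumulate in fp32
    n = x.shape[0] * x.shape[2] * x.shape[3]  # 6272
    xf = x.float()
    mean = xf.mean(dim=(0, 2, 3))
    var = xf.var(dim=(0, 2, 3), unbiased=False)
    rstd = torch.rsqrt(var + eps)
    new_running_mean = momentum * mean + (1 - momentum) * running_mean
    unbiased_var = var * (n / (n - 1))  # 6272 / 6271 = 1.0001594642002871
    new_running_var = momentum * unbiased_var + (1 - momentum) * running_var
    return mean, rstd, new_running_mean, new_running_var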
# kernel path: /tmp/torchinductor_shunting/6y/c6ybh5xdfuwq6wdolxlmurgpxtgtn6q5shvsfwt5gf4rdpz7tas7.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.mean, aten.silu
# aten._native_batch_norm_legit_functional => add_242, add_245, convert_element_type_315, convert_element_type_316, mul_371, mul_377, rsqrt_46, sub_46, var_mean_46
# aten.mean => mean_12
# aten.silu => convert_element_type_317, convert_element_type_318, mul_378, sigmoid_49
triton_per_fused__native_batch_norm_legit_functional_mean_silu_192 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import persistent_reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@persistent_reduction(
size_hints=[131072, 64],
reduction_hint=ReductionHint.INNER,
filename=__file__,
meta={'signature': {0: '*fp16', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp16', 6: '*fp16', 7: 'i32', 8: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7), equal_to_1=())]}
)
@triton.jit
def triton_per_fused__native_batch_norm_legit_functional_mean_silu_192(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, out_ptr0, out_ptr2, xnumel, rnumel, XBLOCK : tl.constexpr):
xnumel = 122880
rnumel = 49
RBLOCK: tl.constexpr = 64
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
xmask = xindex < xnumel
rindex = tl.arange(0, RBLOCK)[None, :]
rmask = rindex < rnumel
r2 = rindex
x3 = xindex
x0 = xindex % 960
tmp0 = tl.load(in_ptr0 + (r2 + (49*x3)), rmask, other=0).to(tl.float32)
tmp2 = tl.load(in_ptr1 + (x0), None)
tmp4 = tl.load(in_ptr2 + (x0), None)
tmp11 = tl.load(in_ptr3 + (x0), None)
tmp13 = tl.load(in_ptr4 + (x0), None)
tmp1 = tmp0.to(tl.float32)
tmp3 = tmp1 - tmp2
tmp5 = 6272.0
tmp6 = tmp4 / tmp5
tmp7 = 1e-05
tmp8 = tmp6 + tmp7
tmp9 = tl.math.rsqrt(tmp8)
tmp10 = tmp3 * tmp9
tmp12 = tmp10 * tmp11
tmp14 = tmp12 + tmp13
tmp15 = tmp14.to(tl.float32)
tmp16 = tmp15.to(tl.float32)
tmp17 = tl.sigmoid(tmp16)
tmp18 = tmp16 * tmp17
tmp19 = tmp18.to(tl.float32)
tmp20 = tmp19.to(tl.float32)
tmp22 = tl.where(rmask, tmp20, 0)
tmp23 = tl.sum(tmp22, 1)[:, None]
tmp24 = 49.0
tmp25 = tmp23 / tmp24
tmp26 = tmp25.to(tl.float32)
tl.store(out_ptr0 + (r2 + (49*x3) + tl.zeros([XBLOCK, RBLOCK], tl.int32)), tmp15, rmask)
tl.store(out_ptr2 + (x3 + tl.zeros([XBLOCK, 1], tl.int32)), tmp26, None)
def get_args():
arg_0 = rand_strided((128, 960, 7, 7), (47040, 49, 7, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((1, 960, 1, 1), (960, 1, 960, 960), device='cuda:0', dtype=torch.float32)
arg_2 = rand_strided((1, 960, 1, 1), (960, 1, 960, 960), device='cuda:0', dtype=torch.float32)
arg_3 = rand_strided((960,), (1,), device='cuda:0', dtype=torch.float32)
arg_4 = rand_strided((960,), (1,), device='cuda:0', dtype=torch.float32)
arg_5 = rand_strided((128, 960, 7, 7), (47040, 49, 7, 1), device='cuda:0', dtype=torch.float16)
arg_6 = rand_strided((128, 960, 1, 1), (960, 1, 1, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_per_fused__native_batch_norm_legit_functional_mean_silu_192.run(*args, 122880, 49, grid=grid(122880), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_per_fused__native_batch_norm_legit_functional_mean_silu_192.benchmark_all_configs(*args, 122880, 49, grid=grid(122880))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
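# Sketch of the persistent reduction above (helper name assumed): it applies
# the batch-norm affine transform, takes SiLU, and reduces the 7x7 spatial
# dims to the per-(batch, channel) mean that feeds the squeeze-excite block.
# mean and var_sum here are per-channel fp32 vectors (the (1, C, 1, 1)
# buffers above, flattened); var_sum is the raw sum of squared deviations,
# divided by N*H*W = 6272 inside the kernel.
def _eager_sketch_bn_silu_mean_192(x, mean, var_sum, weight, bias, eps=1e-05):
    # x: (128, 960, 7, 7) fp16
    rstd = torch.rsqrt(var_sum / 6272.0 + eps)
    y = ((x.float() - mean[None, :, None, None]) * rstd[None, :, None, None]
         * weight[None, :, None, None] + bias[None, :, None, None]).half()
    act = y.float() * torch.sigmoid(y.float())  # silu
    pooled = act.mean(dim=(2, 3), keepdim=True).half()  # aten.mean => mean_12
    return y, pooled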
# kernel path: /tmp/torchinductor_shunting/tl/ctlao5qdd2i3xcmliczx6rwjunhhywrgkmmolniswyndldbl73bm.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_320
triton_poi_fused__to_copy_193 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[131072], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_193(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 76800
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((80, 960, 1, 1), (960, 1, 1, 1), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((80, 960, 1, 1), (960, 1, 1, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_193.run(*args, 76800, grid=grid(76800), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_193.benchmark_all_configs(*args, 76800, grid=grid(76800))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/b4/cb4kobddnmpxrqkwjrmqktjtkp5r7xvtzsia2m5pxlvyngvl3gez.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_324
triton_poi_fused__to_copy_194 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[131072], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_194(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 76800
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((960, 80, 1, 1), (80, 1, 1, 1), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((960, 80, 1, 1), (80, 1, 1, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_194.run(*args, 76800, grid=grid(76800), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_194.benchmark_all_configs(*args, 76800, grid=grid(76800))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/5n/c5nkvlxqc6zdiwvvnuciiwj3rs2idxp5remuu4oe4qksftnrsipg.py
# Original ATen: aten._to_copy, aten.convolution
# aten._to_copy => convert_element_type_323
# aten.convolution => convolution_125
triton_poi_fused__to_copy_convolution_195 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[1024], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_convolution_195(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 960
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((960,), (1,), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((960,), (1,), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_convolution_195.run(*args, 960, grid=grid(960), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_convolution_195.benchmark_all_configs(*args, 960, grid=grid(960))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/5g/c5gxudeqor4giltbrwmymcii3cqccfm32ibjrioytb2cth76irpn.py
# Original ATen: aten._to_copy, aten.convolution
# aten._to_copy => convert_element_type_323
# aten.convolution => convolution_125
triton_poi_fused__to_copy_convolution_196 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[131072], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_convolution_196(in_out_ptr0, in_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 122880
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x2 = xindex
x0 = xindex % 960
tmp0 = tl.load(in_out_ptr0 + (x2), None).to(tl.float32)
tmp1 = tl.load(in_ptr0 + (x0), None).to(tl.float32)
tmp2 = tmp0 + tmp1
tl.store(in_out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp2, None)
def get_args():
arg_0 = rand_strided((128, 960, 1, 1), (960, 1, 1, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((960,), (1,), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_convolution_196.run(*args, 122880, grid=grid(122880), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_convolution_196.benchmark_all_configs(*args, 122880, grid=grid(122880))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
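# Kernels 195 and 196 together handle the bias of the 1x1 squeeze-excite
# convolution (convolution_125): 195 casts the fp32 bias to fp16, and 196
# adds it in place to the conv output. A short eager sketch (helper name
# assumed; the pairing is inferred from the shared ATen annotations):
def _eager_sketch_conv_bias_196(conv_out, bias_fp32):
    # conv_out: (128, 960, 1, 1) fp16; bias_fp32: (960,) fp32
    conv_out += bias_fp32.to(torch.float16)[None, :, None, None]
    return conv_out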
# kernel path: /tmp/torchinductor_shunting/u5/cu5blwuhanuznefsomv5iwcn6emdzdpn2ln3p27yycjvkiw6g2du.py
# Original ATen: aten.mul, aten.sigmoid, aten.silu
# aten.mul => mul_380
# aten.sigmoid => sigmoid_51
# aten.silu => convert_element_type_317, convert_element_type_318, mul_378, sigmoid_49
triton_poi_fused_mul_sigmoid_silu_197 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[8388608], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3), equal_to_1=())]})
@triton.jit
def triton_poi_fused_mul_sigmoid_silu_197(in_ptr0, in_ptr1, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 6021120
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x2 = xindex
x1 = (xindex // 49)
tmp0 = tl.load(in_ptr0 + (x2), None).to(tl.float32)
tmp5 = tl.load(in_ptr1 + (x1), None).to(tl.float32)
    tmp1 = tmp0.to(tl.float32)
    tmp2 = tl.sigmoid(tmp1)
    tmp3 = tmp1 * tmp2  # silu(x) = x * sigmoid(x), computed in fp32
    tmp4 = tmp3.to(tl.float32)
    tmp6 = tl.sigmoid(tmp5)  # squeeze-excite gate, broadcast over H and W
    tmp7 = tmp4 * tmp6
tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp7, None)
def get_args():
arg_0 = rand_strided((128, 960, 7, 7), (47040, 49, 7, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 960, 1, 1), (960, 1, 1, 1), device='cuda:0', dtype=torch.float16)
arg_2 = rand_strided((128, 960, 7, 7), (47040, 49, 7, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1, arg_2,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_mul_sigmoid_silu_197.run(*args, 6021120, grid=grid(6021120), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_mul_sigmoid_silu_197.benchmark_all_configs(*args, 6021120, grid=grid(6021120))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
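# Sketch of the squeeze-excite gating kernel above (helper name assumed):
# silu(x) is recomputed from the pre-activation rather than reloaded, then
# scaled by the sigmoid of the SE logits, broadcast over the spatial dims.
def _eager_sketch_se_gate_197(x, se):
    # x: (128, 960, 7, 7) fp16; se: (128, 960, 1, 1) fp16
    act = (x.float() * torch.sigmoid(x.float())).half()  # silu
    return act * torch.sigmoid(se)  # aten.mul => mul_380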
# kernel path: /tmp/torchinductor_shunting/ow/cow7m27226bnhxi6lshzaavlwewllidrdyuy4wj4ablq5nvozskp.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_325
triton_poi_fused__to_copy_198 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[262144], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_198(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 253440
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((264, 960, 1, 1), (960, 1, 1, 1), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((264, 960, 1, 1), (960, 1, 1, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_198.run(*args, 253440, grid=grid(253440), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_198.benchmark_all_configs(*args, 253440, grid=grid(253440))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/zi/czig3biynqg6sd4t6h2tmyqez5ybs7vrp4v72f35wteotcxxgft7.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => add_247, add_248, add_249, convert_element_type_326, mul_382, mul_383, mul_384, mul_385, mul_386, rsqrt_47, squeeze_142, var_mean_47
triton_red_fused__native_batch_norm_legit_functional_199 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@reduction(
size_hints=[512, 8192],
reduction_hint=ReductionHint.INNER,
filename=__file__,
meta={'signature': {0: '*fp32', 1: '*fp16', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: 'i32', 9: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 9), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_199(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, out_ptr0, out_ptr1, out_ptr2, out_ptr3, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
xnumel = 264
rnumel = 6272
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
xmask = xindex < xnumel
rbase = tl.arange(0, RBLOCK)[None, :]
x0 = xindex
_tmp2 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
for roffset in range(0, rnumel, RBLOCK):
rindex = roffset + rbase
rmask = rindex < rnumel
r1 = rindex % 49
r2 = (rindex // 49)
tmp0 = tl.load(in_ptr0 + (r1 + (49*x0) + (12936*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
_tmp2 = tl.where(rmask & xmask, _tmp2 + tmp1, _tmp2)
tmp2 = tl.sum(_tmp2, 1)[:, None]
tmp7 = tl.load(in_ptr1 + (x0), xmask)
tmp3 = 6272.0
tmp4 = tmp2 / tmp3
tmp5 = 0.1
tmp6 = tmp4 * tmp5
tmp8 = 0.9
tmp9 = tmp7 * tmp8
tmp10 = tmp6 + tmp9
tl.store(in_out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp4, xmask)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp10, xmask)
_tmp15 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
for roffset in range(0, rnumel, RBLOCK):
rindex = roffset + rbase
rmask = rindex < rnumel
r1 = rindex % 49
r2 = (rindex // 49)
tmp11 = tl.load(in_ptr0 + (r1 + (49*x0) + (12936*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp12 = tmp11.to(tl.float32)
tmp13 = tmp12 - tmp4
tmp14 = tmp13 * tmp13
_tmp15 = tl.where(rmask & xmask, _tmp15 + tmp14, _tmp15)
tmp15 = tl.sum(_tmp15, 1)[:, None]
tl.store(out_ptr1 + x0, tmp15, xmask)
tmp25 = tl.load(in_ptr2 + (x0), xmask)
tmp16 = 6272.0
tmp17 = tmp15 / tmp16
tmp18 = 1e-05
tmp19 = tmp17 + tmp18
tmp20 = tl.math.rsqrt(tmp19)
tmp21 = 1.0001594642002871
tmp22 = tmp17 * tmp21
tmp23 = 0.1
tmp24 = tmp22 * tmp23
tmp26 = 0.9
tmp27 = tmp25 * tmp26
tmp28 = tmp24 + tmp27
tl.store(out_ptr2 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp20, xmask)
tl.store(out_ptr3 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp28, xmask)
def get_args():
arg_0 = rand_strided((1, 264, 1, 1), (264, 1, 264, 264), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((128, 264, 7, 7), (12936, 49, 7, 1), device='cuda:0', dtype=torch.float16)
arg_2 = rand_strided((264,), (1,), device='cuda:0', dtype=torch.float32)
arg_3 = rand_strided((264,), (1,), device='cuda:0', dtype=torch.float32)
arg_4 = rand_strided((264,), (1,), device='cuda:0', dtype=torch.float32)
arg_5 = rand_strided((1, 264, 1, 1), (264, 1, 264, 264), device='cuda:0', dtype=torch.float32)
arg_6 = rand_strided((264,), (1,), device='cuda:0', dtype=torch.float32)
arg_7 = rand_strided((264,), (1,), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_red_fused__native_batch_norm_legit_functional_199.run(*args, 264, 6272, grid=grid(264), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_red_fused__native_batch_norm_legit_functional_199.benchmark_all_configs(*args, 264, 6272, grid=grid(264))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/rz/crzhphlanp4xljhlyhd5oa64x25m2selaazalafwkl2z37flecta.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => add_247, add_250, convert_element_type_326, convert_element_type_327, mul_381, mul_387, rsqrt_47, sub_47, var_mean_47
triton_poi_fused__native_batch_norm_legit_functional_200 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[2097152], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp16', 6: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6), equal_to_1=())]})
@triton.jit
def triton_poi_fused__native_batch_norm_legit_functional_200(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 1655808
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x3 = xindex
x1 = (xindex // 49) % 264
tmp0 = tl.load(in_ptr0 + (x3), xmask).to(tl.float32)
tmp2 = tl.load(in_ptr1 + (x1), xmask)
tmp4 = tl.load(in_ptr2 + (x1), xmask)
tmp11 = tl.load(in_ptr3 + (x1), xmask)
tmp13 = tl.load(in_ptr4 + (x1), xmask)
tmp1 = tmp0.to(tl.float32)
tmp3 = tmp1 - tmp2
tmp5 = 6272.0
tmp6 = tmp4 / tmp5
tmp7 = 1e-05
tmp8 = tmp6 + tmp7
tmp9 = tl.math.rsqrt(tmp8)
tmp10 = tmp3 * tmp9
tmp12 = tmp10 * tmp11
tmp14 = tmp12 + tmp13
tmp15 = tmp14.to(tl.float32)
tl.store(out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp15, xmask)
def get_args():
arg_0 = rand_strided((128, 264, 7, 7), (12936, 49, 7, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((1, 264, 1, 1), (264, 1, 264, 264), device='cuda:0', dtype=torch.float32)
arg_2 = rand_strided((1, 264, 1, 1), (264, 1, 264, 264), device='cuda:0', dtype=torch.float32)
arg_3 = rand_strided((264,), (1,), device='cuda:0', dtype=torch.float32)
arg_4 = rand_strided((264,), (1,), device='cuda:0', dtype=torch.float32)
arg_5 = rand_strided((128, 264, 7, 7), (12936, 49, 7, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__native_batch_norm_legit_functional_200.run(*args, 1655808, grid=grid(1655808), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__native_batch_norm_legit_functional_200.benchmark_all_configs(*args, 1655808, grid=grid(1655808))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
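# Pointwise half of the batch norm: kernel 199 above produced the per-channel
# statistics; this kernel applies (x - mean) * rstd * weight + bias
# elementwise and casts back to fp16. A minimal eager sketch (helper name
# assumed; mean and var_sum as the (1, 264, 1, 1) fp32 buffers from get_args):
def _eager_sketch_bn_apply_200(x, mean, var_sum, weight, bias, eps=1e-05):
    # x: (128, 264, 7, 7) fp16; weight, bias: (264,) fp32
    rstd = torch.rsqrt(var_sum / 6272.0 + eps)
    return ((x.float() - mean) * rstd * weight[None, :, None, None]
            + bias[None, :, None, None]).half()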
# kernel path: /tmp/torchinductor_shunting/7o/c7ocvzt6vmjqamqpyhoskmyrik2ig24uhlssffiqbirrszrdop2r.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_328
triton_poi_fused__to_copy_201 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[524288], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_201(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 418176
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((1584, 264, 1, 1), (264, 1, 1, 1), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((1584, 264, 1, 1), (264, 1, 1, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_201.run(*args, 418176, grid=grid(418176), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_201.benchmark_all_configs(*args, 418176, grid=grid(418176))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/uo/cuocaty2j6iusx5tak7blhjvcj7z7suv537pcyhwgk3qfuh7te7r.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => add_252, add_253, add_254, convert_element_type_329, mul_389, mul_390, mul_391, mul_392, mul_393, rsqrt_48, squeeze_145, var_mean_48
triton_red_fused__native_batch_norm_legit_functional_202 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@reduction(
size_hints=[2048, 8192],
reduction_hint=ReductionHint.INNER,
filename=__file__,
meta={'signature': {0: '*fp32', 1: '*fp16', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: 'i32', 9: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_202(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, out_ptr0, out_ptr1, out_ptr2, out_ptr3, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
xnumel = 1584
rnumel = 6272
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
xmask = xindex < xnumel
rbase = tl.arange(0, RBLOCK)[None, :]
x0 = xindex
_tmp2 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
for roffset in range(0, rnumel, RBLOCK):
rindex = roffset + rbase
rmask = rindex < rnumel
r1 = rindex % 49
r2 = (rindex // 49)
tmp0 = tl.load(in_ptr0 + (r1 + (49*x0) + (77616*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
_tmp2 = tl.where(rmask & xmask, _tmp2 + tmp1, _tmp2)
tmp2 = tl.sum(_tmp2, 1)[:, None]
tmp7 = tl.load(in_ptr1 + (x0), xmask)
tmp3 = 6272.0
tmp4 = tmp2 / tmp3
tmp5 = 0.1
tmp6 = tmp4 * tmp5
tmp8 = 0.9
tmp9 = tmp7 * tmp8
tmp10 = tmp6 + tmp9
tl.store(in_out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp4, xmask)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp10, xmask)
_tmp15 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
for roffset in range(0, rnumel, RBLOCK):
rindex = roffset + rbase
rmask = rindex < rnumel
r1 = rindex % 49
r2 = (rindex // 49)
tmp11 = tl.load(in_ptr0 + (r1 + (49*x0) + (77616*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp12 = tmp11.to(tl.float32)
tmp13 = tmp12 - tmp4
tmp14 = tmp13 * tmp13
_tmp15 = tl.where(rmask & xmask, _tmp15 + tmp14, _tmp15)
tmp15 = tl.sum(_tmp15, 1)[:, None]
tl.store(out_ptr1 + x0, tmp15, xmask)
tmp25 = tl.load(in_ptr2 + (x0), xmask)
tmp16 = 6272.0
tmp17 = tmp15 / tmp16
tmp18 = 1e-05
tmp19 = tmp17 + tmp18
tmp20 = tl.math.rsqrt(tmp19)
tmp21 = 1.0001594642002871
tmp22 = tmp17 * tmp21
tmp23 = 0.1
tmp24 = tmp22 * tmp23
tmp26 = 0.9
tmp27 = tmp25 * tmp26
tmp28 = tmp24 + tmp27
tl.store(out_ptr2 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp20, xmask)
tl.store(out_ptr3 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp28, xmask)
def get_args():
arg_0 = rand_strided((1, 1584, 1, 1), (1584, 1, 1584, 1584), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((128, 1584, 7, 7), (77616, 49, 7, 1), device='cuda:0', dtype=torch.float16)
arg_2 = rand_strided((1584,), (1,), device='cuda:0', dtype=torch.float32)
arg_3 = rand_strided((1584,), (1,), device='cuda:0', dtype=torch.float32)
arg_4 = rand_strided((1584,), (1,), device='cuda:0', dtype=torch.float32)
arg_5 = rand_strided((1, 1584, 1, 1), (1584, 1, 1584, 1584), device='cuda:0', dtype=torch.float32)
arg_6 = rand_strided((1584,), (1,), device='cuda:0', dtype=torch.float32)
arg_7 = rand_strided((1584,), (1,), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_red_fused__native_batch_norm_legit_functional_202.run(*args, 1584, 6272, grid=grid(1584), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_red_fused__native_batch_norm_legit_functional_202.benchmark_all_configs(*args, 1584, 6272, grid=grid(1584))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/d2/cd2goj7lmhtwh5scpm6a2hg5fjalovx2pxu4k6raurql7xnwmae5.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.add, aten.clone, aten.fill, aten.mul, aten.sigmoid, aten.sub
# aten._native_batch_norm_legit_functional => add_252, add_255, convert_element_type_329, convert_element_type_330, mul_388, mul_394, rsqrt_48, sub_48, var_mean_48
# aten.add => add_317
# aten.clone => clone_39
# aten.fill => full_like_8
# aten.mul => mul_587, mul_588
# aten.sigmoid => sigmoid_72
# aten.sub => sub_105
triton_poi_fused__native_batch_norm_legit_functional_add_clone_fill_mul_sigmoid_sub_203 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[16777216], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp16', 6: '*fp16', 7: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7), equal_to_1=())]})
@triton.jit
def triton_poi_fused__native_batch_norm_legit_functional_add_clone_fill_mul_sigmoid_sub_203(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, out_ptr0, out_ptr1, xnumel, XBLOCK : tl.constexpr):
xnumel = 9934848
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x3 = xindex
x1 = (xindex // 49) % 1584
tmp0 = tl.load(in_ptr0 + (x3), None).to(tl.float32)
tmp2 = tl.load(in_ptr1 + (x1), None)
tmp4 = tl.load(in_ptr2 + (x1), None)
tmp11 = tl.load(in_ptr3 + (x1), None)
tmp13 = tl.load(in_ptr4 + (x1), None)
tmp1 = tmp0.to(tl.float32)
tmp3 = tmp1 - tmp2
tmp5 = 6272.0
tmp6 = tmp4 / tmp5
tmp7 = 1e-05
tmp8 = tmp6 + tmp7
tmp9 = tl.math.rsqrt(tmp8)
tmp10 = tmp3 * tmp9
tmp12 = tmp10 * tmp11
tmp14 = tmp12 + tmp13
tmp15 = tmp14.to(tl.float32)
    tmp16 = tl.sigmoid(tmp15)
    tmp17 = 1.0
    tmp18 = tmp17 - tmp16
    tmp19 = tmp15 * tmp18
    tmp20 = tmp19 + tmp17
    tmp21 = tmp16 * tmp20  # silu'(y) = sigmoid(y) * (1 + y * (1 - sigmoid(y)))
tl.store(out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp15, None)
tl.store(out_ptr1 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp21, None)
def get_args():
arg_0 = rand_strided((128, 1584, 7, 7), (77616, 49, 7, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((1, 1584, 1, 1), (1584, 1, 1584, 1584), device='cuda:0', dtype=torch.float32)
arg_2 = rand_strided((1, 1584, 1, 1), (1584, 1, 1584, 1584), device='cuda:0', dtype=torch.float32)
arg_3 = rand_strided((1584,), (1,), device='cuda:0', dtype=torch.float32)
arg_4 = rand_strided((1584,), (1,), device='cuda:0', dtype=torch.float32)
arg_5 = rand_strided((128, 1584, 7, 7), (77616, 49, 7, 1), device='cuda:0', dtype=torch.float16)
arg_6 = rand_strided((128, 1584, 7, 7), (77616, 49, 7, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__native_batch_norm_legit_functional_add_clone_fill_mul_sigmoid_sub_203.run(*args, 9934848, grid=grid(9934848), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__native_batch_norm_legit_functional_add_clone_fill_mul_sigmoid_sub_203.benchmark_all_configs(*args, 9934848, grid=grid(9934848))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
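# Sketch of the kernel above (helper name assumed): besides the normalized
# fp16 output, it precomputes the SiLU derivative
# sigmoid(y) * (1 + y * (1 - sigmoid(y))) that the backward pass will reuse
# (hence the clone/fill/mul/sigmoid/sub ops in the original ATen list).
def _eager_sketch_bn_silu_grad_203(x, mean, var_sum, weight, bias, eps=1e-05):
    # x: (128, 1584, 7, 7) fp16; mean, var_sum: (1, 1584, 1, 1) fp32
    rstd = torch.rsqrt(var_sum / 6272.0 + eps)
    y = ((x.float() - mean) * rstd * weight[None, :, None, None]
         + bias[None, :, None, None]).half()
    s = torch.sigmoid(y)
    return y, s * (1 + y * (1 - s))  # y and d(silu)/dy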
# kernel path: /tmp/torchinductor_shunting/4x/c4xvll4q3rptmkfezospulueatrj4cvvts5fa6rf327imnsbrund.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_333
triton_poi_fused__to_copy_204 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4096], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_204(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 3564
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((396, 1, 3, 3), (9, 9, 3, 1), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((396, 1, 3, 3), (9, 9, 3, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_204.run(*args, 3564, grid=grid(3564), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_204.benchmark_all_configs(*args, 3564, grid=grid(3564))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/oo/coohdf37vqteghs4tdccuqlheosk6dnovjd7xhliv6jnuxrt5q6k.py
# Original ATen: aten.split_with_sizes
# aten.split_with_sizes => getitem_356
triton_poi_fused_split_with_sizes_205 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_split_with_sizes_205(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 2483712
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex % 19404
x1 = (xindex // 19404)
x2 = xindex
tmp0 = tl.load(in_ptr0 + (x0 + (77616*x1)), xmask).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
tmp2 = tl.sigmoid(tmp1)
tmp3 = tmp1 * tmp2
tmp4 = tmp3.to(tl.float32)
tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp4, xmask)
def get_args():
arg_0 = rand_strided((128, 1584, 7, 7), (77616, 49, 7, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 396, 7, 7), (19404, 49, 7, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_split_with_sizes_205.run(*args, 2483712, grid=grid(2483712), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_split_with_sizes_205.benchmark_all_configs(*args, 2483712, grid=grid(2483712))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/q2/cq2qbiau5hauewftu26n75rmwbfaqfklvxrkizh35mvxm2pcsrku.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_334
triton_poi_fused__to_copy_206 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[16384], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_206(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 9900
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((396, 1, 5, 5), (25, 25, 5, 1), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((396, 1, 5, 5), (25, 25, 5, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_206.run(*args, 9900, grid=grid(9900), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_206.benchmark_all_configs(*args, 9900, grid=grid(9900))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/zt/cztezqwtd34ik4tsx6erhha7cnfeljbbbwidrvnml2yi26gx6sbr.py
# Original ATen: aten.split_with_sizes
# aten.split_with_sizes => getitem_361
triton_poi_fused_split_with_sizes_207 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_split_with_sizes_207(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 2483712
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex % 19404
x1 = (xindex // 19404)
x2 = xindex
tmp0 = tl.load(in_ptr0 + (19404 + x0 + (77616*x1)), xmask).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
tmp2 = tl.sigmoid(tmp1)
tmp3 = tmp1 * tmp2
tmp4 = tmp3.to(tl.float32)
tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp4, xmask)
def get_args():
arg_0 = rand_strided((128, 1584, 7, 7), (77616, 49, 7, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 396, 7, 7), (19404, 49, 7, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_split_with_sizes_207.run(*args, 2483712, grid=grid(2483712), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_split_with_sizes_207.benchmark_all_configs(*args, 2483712, grid=grid(2483712))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/a5/ca5stuyejdo4icolodntofflni5tgdpo5ewbossduq7phpjybihc.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_335
triton_poi_fused__to_copy_208 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[32768], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_208(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 19404
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((396, 1, 7, 7), (49, 49, 7, 1), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((396, 1, 7, 7), (49, 49, 7, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_208.run(*args, 19404, grid=grid(19404), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_208.benchmark_all_configs(*args, 19404, grid=grid(19404))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
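# The _to_copy kernels in this stretch (208, 210, 215, 216, 218, 219, 224,
# 228) are straight dtype-conversion copies: Inductor materializes fp16
# versions of the fp32 parameters (here a 396x1x7x7 depthwise conv weight)
# for mixed-precision execution. The eager equivalent is just
#     weight_fp16 = weight_fp32.to(torch.float16)
# and the kernels differ only in the number of elements copied.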
# kernel path: /tmp/torchinductor_shunting/rz/crzmv3373ssmmao7rhii34dz6qyf2kesyjmask7sgbcayzblrpb7.py
# Original ATen: aten.split_with_sizes
# aten.split_with_sizes => getitem_366
triton_poi_fused_split_with_sizes_209 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_split_with_sizes_209(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 2483712
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex % 19404
x1 = (xindex // 19404)
x2 = xindex
tmp0 = tl.load(in_ptr0 + (38808 + x0 + (77616*x1)), xmask).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
tmp2 = tl.sigmoid(tmp1)
tmp3 = tmp1 * tmp2
tmp4 = tmp3.to(tl.float32)
tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp4, xmask)
def get_args():
arg_0 = rand_strided((128, 1584, 7, 7), (77616, 49, 7, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 396, 7, 7), (19404, 49, 7, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_split_with_sizes_209.run(*args, 2483712, grid=grid(2483712), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_split_with_sizes_209.benchmark_all_configs(*args, 2483712, grid=grid(2483712))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/ua/cuahjxhhqgvs2pfawkys4ycn4cv26634v6swah3d6itp3ay5rix6.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_336
triton_poi_fused__to_copy_210 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[32768], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_210(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 32076
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((396, 1, 9, 9), (81, 81, 9, 1), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((396, 1, 9, 9), (81, 81, 9, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_210.run(*args, 32076, grid=grid(32076), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_210.benchmark_all_configs(*args, 32076, grid=grid(32076))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/v7/cv7bjv2cw2xs5mbq2x7bddhtciqwe66f4efi6r3hekyvffle4el4.py
# Original ATen: aten.split_with_sizes
# aten.split_with_sizes => getitem_371
triton_poi_fused_split_with_sizes_211 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_split_with_sizes_211(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 2483712
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex % 19404
x1 = (xindex // 19404)
x2 = xindex
tmp0 = tl.load(in_ptr0 + (58212 + x0 + (77616*x1)), xmask).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
tmp2 = tl.sigmoid(tmp1)
tmp3 = tmp1 * tmp2
tmp4 = tmp3.to(tl.float32)
tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp4, xmask)
def get_args():
arg_0 = rand_strided((128, 1584, 7, 7), (77616, 49, 7, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 396, 7, 7), (19404, 49, 7, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_split_with_sizes_211.run(*args, 2483712, grid=grid(2483712), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_split_with_sizes_211.benchmark_all_configs(*args, 2483712, grid=grid(2483712))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/n2/cn2whdfczqvp7tj52zmht7aewohn4yzq7pmnqwutp7hyu2nidelg.py
# Original ATen: aten.cat
# aten.cat => cat_35
triton_poi_fused_cat_212 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_cat_212(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 2483712
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x2 = xindex
x0 = xindex % 19404
x1 = (xindex // 19404)
tmp0 = tl.load(in_ptr0 + (x2), xmask).to(tl.float32)
tl.store(out_ptr0 + (x0 + (77616*x1) + tl.zeros([XBLOCK], tl.int32)), tmp0, xmask)
def get_args():
arg_0 = rand_strided((128, 396, 7, 7), (19404, 49, 7, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 396, 7, 7), (77616, 49, 7, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_cat_212.run(*args, 2483712, grid=grid(2483712), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_cat_212.benchmark_all_configs(*args, 2483712, grid=grid(2483712))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
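# aten.cat is lowered as a strided copy per input: the (128, 396, 7, 7)
# chunk is written into a 396-channel view of the (128, 1584, 7, 7)
# destination (note arg_1's batch stride of 77616); the caller passes views
# at different channel offsets, so this body is shared across chunks.
# Kernel 213 below is byte-identical except for its divisible_by_16 hint:
# a view offset by 19404 fp16 elements (38808 bytes) is not 16-byte aligned,
# which is presumably why a second variant without the aligned-output
# assumption was compiled. Eager sketch of what cat_35 assembles
# (helper name ours):
import torch

def _cat_reference(chunks):
    # chunks: four (N, 396, 7, 7) tensors -> (N, 1584, 7, 7)
    return torch.cat(chunks, dim=1)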
# kernel path: /tmp/torchinductor_shunting/rs/crsyf734jiflnmzza27lk4h5tk3zflzcv6g64yl5p4sitpmqg5lb.py
# Original ATen: aten.cat
# aten.cat => cat_35
triton_poi_fused_cat_213 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_cat_213(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 2483712
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x2 = xindex
x0 = xindex % 19404
x1 = (xindex // 19404)
tmp0 = tl.load(in_ptr0 + (x2), xmask).to(tl.float32)
tl.store(out_ptr0 + (x0 + (77616*x1) + tl.zeros([XBLOCK], tl.int32)), tmp0, xmask)
def get_args():
arg_0 = rand_strided((128, 396, 7, 7), (19404, 49, 7, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 396, 7, 7), (77616, 49, 7, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_cat_213.run(*args, 2483712, grid=grid(2483712), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_cat_213.benchmark_all_configs(*args, 2483712, grid=grid(2483712))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/5w/c5w3sqhxbgvomykhq2gsbk2tldu7xewurtz7sp2eii33b3skhwth.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.mean, aten.silu
# aten._native_batch_norm_legit_functional => add_257, add_260, convert_element_type_337, convert_element_type_338, mul_396, mul_402, rsqrt_49, sub_49, var_mean_49
# aten.mean => mean_13
# aten.silu => convert_element_type_339, convert_element_type_340, mul_403, sigmoid_53
triton_per_fused__native_batch_norm_legit_functional_mean_silu_214 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import persistent_reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@persistent_reduction(
size_hints=[262144, 64],
reduction_hint=ReductionHint.INNER,
filename=__file__,
meta={'signature': {0: '*fp16', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp16', 6: '*fp16', 7: 'i32', 8: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7), equal_to_1=())]}
)
@triton.jit
def triton_per_fused__native_batch_norm_legit_functional_mean_silu_214(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, out_ptr0, out_ptr2, xnumel, rnumel, XBLOCK : tl.constexpr):
xnumel = 202752
rnumel = 49
RBLOCK: tl.constexpr = 64
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
xmask = xindex < xnumel
rindex = tl.arange(0, RBLOCK)[None, :]
rmask = rindex < rnumel
r2 = rindex
x3 = xindex
x0 = xindex % 1584
tmp0 = tl.load(in_ptr0 + (r2 + (49*x3)), rmask, other=0).to(tl.float32)
tmp2 = tl.load(in_ptr1 + (x0), None)
tmp4 = tl.load(in_ptr2 + (x0), None)
tmp11 = tl.load(in_ptr3 + (x0), None)
tmp13 = tl.load(in_ptr4 + (x0), None)
tmp1 = tmp0.to(tl.float32)
tmp3 = tmp1 - tmp2
tmp5 = 6272.0
tmp6 = tmp4 / tmp5
tmp7 = 1e-05
tmp8 = tmp6 + tmp7
tmp9 = tl.math.rsqrt(tmp8)
tmp10 = tmp3 * tmp9
tmp12 = tmp10 * tmp11
tmp14 = tmp12 + tmp13
tmp15 = tmp14.to(tl.float32)
tmp16 = tmp15.to(tl.float32)
tmp17 = tl.sigmoid(tmp16)
tmp18 = tmp16 * tmp17
tmp19 = tmp18.to(tl.float32)
tmp20 = tmp19.to(tl.float32)
tmp22 = tl.where(rmask, tmp20, 0)
tmp23 = tl.sum(tmp22, 1)[:, None]
tmp24 = 49.0
tmp25 = tmp23 / tmp24
tmp26 = tmp25.to(tl.float32)
tl.store(out_ptr0 + (r2 + (49*x3) + tl.zeros([XBLOCK, RBLOCK], tl.int32)), tmp15, rmask)
tl.store(out_ptr2 + (x3 + tl.zeros([XBLOCK, 1], tl.int32)), tmp26, None)
def get_args():
arg_0 = rand_strided((128, 1584, 7, 7), (77616, 49, 7, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((1, 1584, 1, 1), (1584, 1, 1584, 1584), device='cuda:0', dtype=torch.float32)
arg_2 = rand_strided((1, 1584, 1, 1), (1584, 1, 1584, 1584), device='cuda:0', dtype=torch.float32)
arg_3 = rand_strided((1584,), (1,), device='cuda:0', dtype=torch.float32)
arg_4 = rand_strided((1584,), (1,), device='cuda:0', dtype=torch.float32)
arg_5 = rand_strided((128, 1584, 7, 7), (77616, 49, 7, 1), device='cuda:0', dtype=torch.float16)
arg_6 = rand_strided((128, 1584, 1, 1), (1584, 1, 1, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_per_fused__native_batch_norm_legit_functional_mean_silu_214.run(*args, 202752, 49, grid=grid(202752), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_per_fused__native_batch_norm_legit_functional_mean_silu_214.benchmark_all_configs(*args, 202752, 49, grid=grid(202752))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
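# This persistent reduction fuses three ops over the (128, 1584, 7, 7)
# activation: apply batch norm from precomputed per-channel stats (in_ptr1 is
# the mean, in_ptr2 the sum of squared deviations, divided by 6272 = 128*7*7
# in the kernel), apply SiLU, and average the 49 spatial positions per
# (batch, channel) for the squeeze-and-excite block that follows. The fp16 BN
# output (pre-SiLU) is also kept for reuse. A hedged eager sketch (names and
# argument conventions ours):
import torch

def _bn_silu_mean_reference(x, mean, var_sum, weight, bias, eps=1e-5):
    # x: (N, C, H, W) fp16; mean: (1, C, 1, 1); var_sum: per-channel sum of
    # squared deviations; weight/bias: (C,) affine parameters.
    n = x.shape[0] * x.shape[2] * x.shape[3]            # 6272 here
    inv_std = torch.rsqrt(var_sum.view(1, -1, 1, 1) / n + eps)
    y = (x.float() - mean.view(1, -1, 1, 1)) * inv_std
    y = y * weight.view(1, -1, 1, 1) + bias.view(1, -1, 1, 1)
    y16 = y.to(x.dtype)                                 # stored to out_ptr0
    s32 = y16.float()
    pooled = (s32 * torch.sigmoid(s32)).mean(dim=(2, 3))  # SiLU, then mean
    return y16, pooled.to(x.dtype)                      # out_ptr0, out_ptr2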
# kernel path: /tmp/torchinductor_shunting/73/c73f3yeyp4owv2uj77f6pdjw5heox2q6yo4kfkrqenvhw4yqhomk.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_342
triton_poi_fused__to_copy_215 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[262144], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_215(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 209088
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((132, 1584, 1, 1), (1584, 1, 1, 1), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((132, 1584, 1, 1), (1584, 1, 1, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_215.run(*args, 209088, grid=grid(209088), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_215.benchmark_all_configs(*args, 209088, grid=grid(209088))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/ca/ccagnt2eck6mlzrforr72k455qyt2o7bmlcxrdko6fbwlxsyixff.py
# Original ATen: aten._to_copy, aten.convolution
# aten._to_copy => convert_element_type_341
# aten.convolution => convolution_132
triton_poi_fused__to_copy_convolution_216 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[256], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_convolution_216(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 132
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((132,), (1,), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((132,), (1,), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_convolution_216.run(*args, 132, grid=grid(132), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_convolution_216.benchmark_all_configs(*args, 132, grid=grid(132))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/es/ces42l67kcrpgvq2z54f4gf5my7z5wnxr2mun5sm744xvtl7qmr3.py
# Original ATen: aten._to_copy, aten.convolution, aten.silu
# aten._to_copy => convert_element_type_341
# aten.convolution => convolution_132
# aten.silu => convert_element_type_343, convert_element_type_344, mul_404, sigmoid_54
triton_poi_fused__to_copy_convolution_silu_217 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[32768], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_convolution_silu_217(in_out_ptr0, in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 16896
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x2 = xindex
x0 = xindex % 132
tmp0 = tl.load(in_out_ptr0 + (x2), xmask).to(tl.float32)
tmp1 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
tmp2 = tmp0 + tmp1
tmp3 = tmp2.to(tl.float32)
tmp4 = tl.sigmoid(tmp3)
tmp5 = tmp3 * tmp4
tmp6 = tmp5.to(tl.float32)
tl.store(in_out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp2, xmask)
tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp6, xmask)
def get_args():
arg_0 = rand_strided((128, 132, 1, 1), (132, 1, 1, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((132,), (1,), device='cuda:0', dtype=torch.float16)
arg_2 = rand_strided((128, 132, 1, 1), (132, 1, 1, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1, arg_2,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_convolution_silu_217.run(*args, 16896, grid=grid(16896), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_convolution_silu_217.benchmark_all_configs(*args, 16896, grid=grid(16896))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
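# Kernels 215-217 implement the "squeeze" half of a squeeze-and-excite
# block: cast the 132x1584 1x1 reduce-conv weight and its bias to fp16,
# run the convolution itself externally, then fuse the bias add with SiLU
# here. in_out_ptr0 is updated in place with conv+bias (kept for the
# backward pass) while out_ptr0 receives SiLU(conv+bias). A sketch under
# the assumption that the extern conv ran without bias (helper name ours):
import torch

def _se_squeeze_reference(pooled, weight, bias):
    # pooled: (N, 1584, 1, 1) fp16; weight: (132, 1584, 1, 1); bias: (132,)
    y = torch.nn.functional.conv2d(pooled, weight, bias)
    y32 = y.float()
    return y, (y32 * torch.sigmoid(y32)).to(y.dtype)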
# kernel path: /tmp/torchinductor_shunting/xv/cxvmi6ffjgu4jnqxjbleamncx3jw3a5x6bteybr2qp7fyy4to7ad.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_346
triton_poi_fused__to_copy_218 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[262144], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_218(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 209088
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((1584, 132, 1, 1), (132, 1, 1, 1), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((1584, 132, 1, 1), (132, 1, 1, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_218.run(*args, 209088, grid=grid(209088), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_218.benchmark_all_configs(*args, 209088, grid=grid(209088))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/xn/cxnwubuxft4fv4egkcmiwmgz7afk4w2atiiqg6oyapkchhvtktlp.py
# Original ATen: aten._to_copy, aten.convolution
# aten._to_copy => convert_element_type_345
# aten.convolution => convolution_133
triton_poi_fused__to_copy_convolution_219 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[2048], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_convolution_219(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 1584
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((1584,), (1,), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((1584,), (1,), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_convolution_219.run(*args, 1584, grid=grid(1584), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_convolution_219.benchmark_all_configs(*args, 1584, grid=grid(1584))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/zi/cziku3wkydhqpprjhx3spm364d6fncgicrpenyhycjc6uhdvng7k.py
# Original ATen: aten._to_copy, aten.convolution
# aten._to_copy => convert_element_type_345
# aten.convolution => convolution_133
triton_poi_fused__to_copy_convolution_220 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[262144], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_convolution_220(in_out_ptr0, in_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 202752
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x2 = xindex
x0 = xindex % 1584
tmp0 = tl.load(in_out_ptr0 + (x2), None).to(tl.float32)
tmp1 = tl.load(in_ptr0 + (x0), None).to(tl.float32)
tmp2 = tmp0 + tmp1
tl.store(in_out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp2, None)
def get_args():
arg_0 = rand_strided((128, 1584, 1, 1), (1584, 1, 1, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((1584,), (1,), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_convolution_220.run(*args, 202752, grid=grid(202752), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_convolution_220.benchmark_all_configs(*args, 202752, grid=grid(202752))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/qo/cqo4la4ss4kx6zb5vv53wk2l4cvgi273m7sogbmwaw3v62ktkznm.py
# Original ATen: aten.mul, aten.sigmoid, aten.silu
# aten.mul => mul_405
# aten.sigmoid => sigmoid_55
# aten.silu => convert_element_type_339, convert_element_type_340, mul_403, sigmoid_53
triton_poi_fused_mul_sigmoid_silu_221 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[16777216], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3), equal_to_1=())]})
@triton.jit
def triton_poi_fused_mul_sigmoid_silu_221(in_ptr0, in_ptr1, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 9934848
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x2 = xindex
x1 = (xindex // 49)
tmp0 = tl.load(in_ptr0 + (x2), None).to(tl.float32)
tmp5 = tl.load(in_ptr1 + (x1), None).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
tmp2 = tl.sigmoid(tmp1)
tmp3 = tmp1 * tmp2
tmp4 = tmp3.to(tl.float32)
tmp6 = tl.sigmoid(tmp5)
tmp7 = tmp4 * tmp6
tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp7, None)
def get_args():
arg_0 = rand_strided((128, 1584, 7, 7), (77616, 49, 7, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 1584, 1, 1), (1584, 1, 1, 1), device='cuda:0', dtype=torch.float16)
arg_2 = rand_strided((128, 1584, 7, 7), (77616, 49, 7, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1, arg_2,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_mul_sigmoid_silu_221.run(*args, 9934848, grid=grid(9934848), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_mul_sigmoid_silu_221.benchmark_all_configs(*args, 9934848, grid=grid(9934848))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
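# The "excite" half of the squeeze-and-excite block: kernels 218-220 cast the
# 1584x132 expand-conv weight and bias and add the bias in place; this kernel
# then applies sigmoid to the per-channel gate (in_ptr1, (N, 1584, 1, 1)) and
# multiplies it into SiLU(x) at every spatial position (x1 = index // 49
# selects the (batch, channel) pair). Note the SiLU of the BN output is
# recomputed here (the same sigmoid_53/mul_403 nodes as in kernel 214) rather
# than materialized, trading a little arithmetic for a large buffer write.
# Eager sketch (helper name ours):
import torch

def _se_scale_reference(x, gate):
    x32 = x.float()
    silu_x = (x32 * torch.sigmoid(x32)).to(x.dtype)   # recomputed SiLU
    return silu_x * torch.sigmoid(gate)               # broadcast over H, W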
# kernel path: /tmp/torchinductor_shunting/g3/cg37tw75k3qhu23v54dt4szux7ntig2ituyeprsnltkm3nfiarqp.py
# Original ATen: aten.split_with_sizes
# aten.split_with_sizes => getitem_374
triton_poi_fused_split_with_sizes_222 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[8388608], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_split_with_sizes_222(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 4967424
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex % 38808
x1 = (xindex // 38808)
x2 = xindex
tmp0 = tl.load(in_ptr0 + (x0 + (77616*x1)), xmask).to(tl.float32)
tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp0, xmask)
def get_args():
arg_0 = rand_strided((128, 1584, 7, 7), (77616, 49, 7, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 792, 7, 7), (38808, 49, 7, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_split_with_sizes_222.run(*args, 4967424, grid=grid(4967424), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_split_with_sizes_222.benchmark_all_configs(*args, 4967424, grid=grid(4967424))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/qk/cqk2scd5kzvelb4wc76gqtuwya5pxfoqhhpq26ga2dggkjyf6t72.py
# Original ATen: aten.split_with_sizes
# aten.split_with_sizes => getitem_375
triton_poi_fused_split_with_sizes_223 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[8388608], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_split_with_sizes_223(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 4967424
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex % 38808
x1 = (xindex // 38808)
x2 = xindex
tmp0 = tl.load(in_ptr0 + (38808 + x0 + (77616*x1)), xmask).to(tl.float32)
tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp0, xmask)
def get_args():
arg_0 = rand_strided((128, 1584, 7, 7), (77616, 49, 7, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 792, 7, 7), (38808, 49, 7, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_split_with_sizes_223.run(*args, 4967424, grid=grid(4967424), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_split_with_sizes_223.benchmark_all_configs(*args, 4967424, grid=grid(4967424))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
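# Kernels 222 and 223 are the plain two-way split of the 1584 channels into
# 792-channel halves (input offsets 0 and 38808 = 792*49), apparently feeding
# two parallel 792->132 1x1 projection convolutions: see the 132x792 weight
# cast in _to_copy_224 below and the 264-channel concatenation after it.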
# kernel path: /tmp/torchinductor_shunting/5z/c5zsdbhynt6xdafkaydaa3qemqk6v4xjjheiwafzrfldvlj3ihqh.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_347
triton_poi_fused__to_copy_224 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[131072], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_224(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 104544
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((132, 792, 1, 1), (792, 1, 1, 1), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((132, 792, 1, 1), (792, 1, 1, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_224.run(*args, 104544, grid=grid(104544), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_224.benchmark_all_configs(*args, 104544, grid=grid(104544))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/ca/ccakgu22ynocwwbf5iwb4um7n6tcg3nlbhmehr4k36wdy4nrskss.py
# Original ATen: aten.cat
# aten.cat => cat_36
triton_poi_fused_cat_225 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[1048576], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_cat_225(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 827904
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x2 = xindex
x0 = xindex % 6468
x1 = (xindex // 6468)
tmp0 = tl.load(in_ptr0 + (x2), xmask).to(tl.float32)
tl.store(out_ptr0 + (x0 + (12936*x1) + tl.zeros([XBLOCK], tl.int32)), tmp0, xmask)
def get_args():
arg_0 = rand_strided((128, 132, 7, 7), (6468, 49, 7, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 132, 7, 7), (12936, 49, 7, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_cat_225.run(*args, 827904, grid=grid(827904), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_cat_225.benchmark_all_configs(*args, 827904, grid=grid(827904))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/pp/cpprz7zopeigvjazcl7gems4pi2vh33qfi655qdlyuddxr7rvwr3.py
# Original ATen: aten.cat
# aten.cat => cat_36
triton_poi_fused_cat_226 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[1048576], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_cat_226(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 827904
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x2 = xindex
x0 = xindex % 6468
x1 = (xindex // 6468)
tmp0 = tl.load(in_ptr0 + (x2), xmask).to(tl.float32)
tl.store(out_ptr0 + (x0 + (12936*x1) + tl.zeros([XBLOCK], tl.int32)), tmp0, xmask)
def get_args():
arg_0 = rand_strided((128, 132, 7, 7), (6468, 49, 7, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 132, 7, 7), (12936, 49, 7, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_cat_226.run(*args, 827904, grid=grid(827904), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_cat_226.benchmark_all_configs(*args, 827904, grid=grid(827904))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/ws/cwsi7gb6vfnxgobitrcoooobibx3jugdjfpqbmhity4caskyzyyn.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.add
# aten._native_batch_norm_legit_functional => add_262, add_265, convert_element_type_349, convert_element_type_350, mul_406, mul_412, rsqrt_50, sub_50, var_mean_50
# aten.add => add_266
triton_poi_fused__native_batch_norm_legit_functional_add_227 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[2097152], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp16', 6: '*fp16', 7: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7), equal_to_1=())]})
@triton.jit
def triton_poi_fused__native_batch_norm_legit_functional_add_227(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 1655808
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x3 = xindex
x1 = (xindex // 49) % 264
tmp0 = tl.load(in_ptr0 + (x3), xmask).to(tl.float32)
tmp2 = tl.load(in_ptr1 + (x1), xmask)
tmp4 = tl.load(in_ptr2 + (x1), xmask)
tmp11 = tl.load(in_ptr3 + (x1), xmask)
tmp13 = tl.load(in_ptr4 + (x1), xmask)
tmp16 = tl.load(in_ptr5 + (x3), xmask).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
tmp3 = tmp1 - tmp2
tmp5 = 6272.0
tmp6 = tmp4 / tmp5
tmp7 = 1e-05
tmp8 = tmp6 + tmp7
tmp9 = tl.math.rsqrt(tmp8)
tmp10 = tmp3 * tmp9
tmp12 = tmp10 * tmp11
tmp14 = tmp12 + tmp13
tmp15 = tmp14.to(tl.float32)
tmp17 = tmp15 + tmp16
tl.store(out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp17, xmask)
def get_args():
arg_0 = rand_strided((128, 264, 7, 7), (12936, 49, 7, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((1, 264, 1, 1), (264, 1, 264, 264), device='cuda:0', dtype=torch.float32)
arg_2 = rand_strided((1, 264, 1, 1), (264, 1, 264, 264), device='cuda:0', dtype=torch.float32)
arg_3 = rand_strided((264,), (1,), device='cuda:0', dtype=torch.float32)
arg_4 = rand_strided((264,), (1,), device='cuda:0', dtype=torch.float32)
arg_5 = rand_strided((128, 264, 7, 7), (12936, 49, 7, 1), device='cuda:0', dtype=torch.float16)
arg_6 = rand_strided((128, 264, 7, 7), (12936, 49, 7, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__native_batch_norm_legit_functional_add_227.run(*args, 1655808, grid=grid(1655808), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__native_batch_norm_legit_functional_add_227.benchmark_all_configs(*args, 1655808, grid=grid(1655808))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
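# Batch norm on the concatenated 264-channel projection output, fused with
# the residual add (in_ptr5 is the skip connection from the previous block,
# added after the affine transform). Eager sketch (helper name ours):
import torch

def _bn_add_reference(x, mean, var_sum, weight, bias, residual, eps=1e-5):
    n = x.shape[0] * x.shape[2] * x.shape[3]            # 6272 here
    inv_std = torch.rsqrt(var_sum.view(1, -1, 1, 1) / n + eps)
    y = (x.float() - mean.view(1, -1, 1, 1)) * inv_std
    y = y * weight.view(1, -1, 1, 1) + bias.view(1, -1, 1, 1)
    return y.to(x.dtype) + residual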
# kernel path: /tmp/torchinductor_shunting/7p/c7pjks5p4ph6nsw372btix32wdljqepkkdc72dgxj6ycvs5rdkcd.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_397
triton_poi_fused__to_copy_228 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[524288], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_228(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 405504
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), None)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, None)
def get_args():
arg_0 = rand_strided((1536, 264, 1, 1), (264, 1, 1, 1), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((1536, 264, 1, 1), (264, 1, 1, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_228.run(*args, 405504, grid=grid(405504), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_228.benchmark_all_configs(*args, 405504, grid=grid(405504))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/om/com4odhqbzh6wdohnvxyqvr7ulmhf2d2ytjjg22jyjf67zj3ocbo.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => add_300, add_301, add_302, convert_element_type_398, mul_464, mul_465, mul_466, mul_467, mul_468, rsqrt_57, squeeze_172, var_mean_57
triton_red_fused__native_batch_norm_legit_functional_229 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@reduction(
size_hints=[2048, 8192],
reduction_hint=ReductionHint.INNER,
filename=__file__,
meta={'signature': {0: '*fp32', 1: '*fp16', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: 'i32', 9: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_229(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, out_ptr0, out_ptr1, out_ptr2, out_ptr3, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
xnumel = 1536
rnumel = 6272
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
xmask = xindex < xnumel
rbase = tl.arange(0, RBLOCK)[None, :]
x0 = xindex
_tmp2 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
for roffset in range(0, rnumel, RBLOCK):
rindex = roffset + rbase
rmask = rindex < rnumel
r1 = rindex % 49
r2 = (rindex // 49)
tmp0 = tl.load(in_ptr0 + (r1 + (49*x0) + (75264*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
_tmp2 = tl.where(rmask & xmask, _tmp2 + tmp1, _tmp2)
tmp2 = tl.sum(_tmp2, 1)[:, None]
tmp7 = tl.load(in_ptr1 + (x0), xmask)
tmp3 = 6272.0
tmp4 = tmp2 / tmp3
tmp5 = 0.1
tmp6 = tmp4 * tmp5
tmp8 = 0.9
tmp9 = tmp7 * tmp8
tmp10 = tmp6 + tmp9
tl.store(in_out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp4, xmask)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp10, xmask)
_tmp15 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
for roffset in range(0, rnumel, RBLOCK):
rindex = roffset + rbase
rmask = rindex < rnumel
r1 = rindex % 49
r2 = (rindex // 49)
tmp11 = tl.load(in_ptr0 + (r1 + (49*x0) + (75264*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp12 = tmp11.to(tl.float32)
tmp13 = tmp12 - tmp4
tmp14 = tmp13 * tmp13
_tmp15 = tl.where(rmask & xmask, _tmp15 + tmp14, _tmp15)
tmp15 = tl.sum(_tmp15, 1)[:, None]
tl.store(out_ptr1 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp15, xmask)
tmp25 = tl.load(in_ptr2 + (x0), xmask)
tmp16 = 6272.0
tmp17 = tmp15 / tmp16
tmp18 = 1e-05
tmp19 = tmp17 + tmp18
tmp20 = tl.math.rsqrt(tmp19)
tmp21 = 1.0001594642002871
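# 1.0001594642002871 = 6272/6271 = n/(n-1): Bessel correction, so the running_var update below stores the unbiased variance estimate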
tmp22 = tmp17 * tmp21
tmp23 = 0.1
tmp24 = tmp22 * tmp23
tmp26 = 0.9
tmp27 = tmp25 * tmp26
tmp28 = tmp24 + tmp27
tl.store(out_ptr2 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp20, xmask)
tl.store(out_ptr3 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp28, xmask)
def get_args():
arg_0 = rand_strided((1, 1536, 1, 1), (1536, 1, 1536, 1536), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((128, 1536, 7, 7), (75264, 49, 7, 1), device='cuda:0', dtype=torch.float16)
arg_2 = rand_strided((1536,), (1,), device='cuda:0', dtype=torch.float32)
arg_3 = rand_strided((1536,), (1,), device='cuda:0', dtype=torch.float32)
arg_4 = rand_strided((1536,), (1,), device='cuda:0', dtype=torch.float32)
arg_5 = rand_strided((1, 1536, 1, 1), (1536, 1, 1536, 1536), device='cuda:0', dtype=torch.float32)
arg_6 = rand_strided((1536,), (1,), device='cuda:0', dtype=torch.float32)
arg_7 = rand_strided((1536,), (1,), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_red_fused__native_batch_norm_legit_functional_229.run(*args, 1536, 6272, grid=grid(1536), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_red_fused__native_batch_norm_legit_functional_229.benchmark_all_configs(*args, 1536, 6272, grid=grid(1536))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
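# Two-pass batch norm statistics for the 1536-channel head conv: the first
# loop accumulates the per-channel mean, the second the sum of squared
# deviations; running stats are then updated with momentum 0.1. The kernel
# emits the mean (in_out_ptr0), the raw deviation sum (out_ptr1, consumed by
# the fused BN kernels), rsqrt(var + eps) (out_ptr2), and the two
# running-stat updates. A hedged eager sketch (names ours):
import torch

def _bn_stats_reference(x, running_mean, running_var, momentum=0.1, eps=1e-5):
    n = x.shape[0] * x.shape[2] * x.shape[3]                  # 6272 here
    mean = x.float().mean(dim=(0, 2, 3))                      # pass 1
    var_sum = ((x.float() - mean.view(1, -1, 1, 1)) ** 2).sum(dim=(0, 2, 3))
    var = var_sum / n                                         # biased, pass 2
    inv_std = torch.rsqrt(var + eps)
    new_running_mean = momentum * mean + (1 - momentum) * running_mean
    new_running_var = momentum * var * (n / (n - 1)) + (1 - momentum) * running_var
    return mean, var_sum, inv_std, new_running_mean, new_running_var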
# kernel path: /tmp/torchinductor_shunting/wa/cwaeb6f5uogsexjap772fuwlvl2vebz5e54hbg5l3px6d7azdjsm.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.mean, aten.relu, aten.threshold_backward, aten.view
# aten._native_batch_norm_legit_functional => add_300, add_303, convert_element_type_398, convert_element_type_399, mul_463, mul_469, rsqrt_57, sub_57, var_mean_57
# aten.mean => mean_16
# aten.relu => relu_6
# aten.threshold_backward => le
# aten.view => view
triton_per_fused__native_batch_norm_legit_functional_mean_relu_threshold_backward_view_230 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import persistent_reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@persistent_reduction(
size_hints=[262144, 64],
reduction_hint=ReductionHint.INNER,
filename=__file__,
meta={'signature': {0: '*fp16', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*i1', 6: '*fp16', 7: 'i32', 8: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7), equal_to_1=())]}
)
@triton.jit
def triton_per_fused__native_batch_norm_legit_functional_mean_relu_threshold_backward_view_230(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, out_ptr1, out_ptr3, xnumel, rnumel, XBLOCK : tl.constexpr):
xnumel = 196608
rnumel = 49
RBLOCK: tl.constexpr = 64
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
xmask = xindex < xnumel
rindex = tl.arange(0, RBLOCK)[None, :]
rmask = rindex < rnumel
r2 = rindex
x3 = xindex
x0 = xindex % 1536
tmp0 = tl.load(in_ptr0 + (r2 + (49*x3)), rmask, other=0).to(tl.float32)
tmp2 = tl.load(in_ptr1 + (x0), None)
tmp4 = tl.load(in_ptr2 + (x0), None)
tmp11 = tl.load(in_ptr3 + (x0), None)
tmp13 = tl.load(in_ptr4 + (x0), None)
tmp1 = tmp0.to(tl.float32)
tmp3 = tmp1 - tmp2
tmp5 = 6272.0
tmp6 = tmp4 / tmp5
tmp7 = 1e-05
tmp8 = tmp6 + tmp7
tmp9 = tl.math.rsqrt(tmp8)
tmp10 = tmp3 * tmp9
tmp12 = tmp10 * tmp11
tmp14 = tmp12 + tmp13
tmp15 = tmp14.to(tl.float32)
tmp16 = tl.where(0 != 0, 0, tl.where(0 > tmp15, 0, tmp15))
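    # the nested tl.where above computes max(tmp15, 0): the outer condition
    # 0 != 0 is always False, so this is inductor's generated relu idiom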
tmp17 = 0.0
tmp18 = tmp16 <= tmp17
tmp19 = tmp16.to(tl.float32)
tmp21 = tl.where(rmask, tmp19, 0)
tmp22 = tl.sum(tmp21, 1)[:, None]
tmp23 = 49.0
tmp24 = tmp22 / tmp23
tmp25 = tmp24.to(tl.float32)
tl.store(out_ptr1 + (r2 + (49*x3) + tl.zeros([XBLOCK, RBLOCK], tl.int32)), tmp18, rmask)
tl.store(out_ptr3 + (x3 + tl.zeros([XBLOCK, 1], tl.int32)), tmp25, None)
def get_args():
arg_0 = rand_strided((128, 1536, 7, 7), (75264, 49, 7, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((1, 1536, 1, 1), (1536, 1, 1536, 1536), device='cuda:0', dtype=torch.float32)
arg_2 = rand_strided((1, 1536, 1, 1), (1536, 1, 1536, 1536), device='cuda:0', dtype=torch.float32)
arg_3 = rand_strided((1536,), (1,), device='cuda:0', dtype=torch.float32)
arg_4 = rand_strided((1536,), (1,), device='cuda:0', dtype=torch.float32)
arg_5 = rand_strided((128, 1536, 7, 7), (75264, 49, 7, 1), device='cuda:0', dtype=torch.bool)
arg_6 = rand_strided((128, 1536), (1536, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_per_fused__native_batch_norm_legit_functional_mean_relu_threshold_backward_view_230.run(*args, 196608, 49, grid=grid(196608), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_per_fused__native_batch_norm_legit_functional_mean_relu_threshold_backward_view_230.benchmark_all_configs(*args, 196608, 49, grid=grid(196608))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
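# Reference sketch (not generated output): an eager-mode equivalent of the
# fused kernel above, assuming x is the (128, 1536, 7, 7) fp16 activation and
# mean/var_sum/weight/bias match the pointers loaded in the kernel:
def _bn_relu_mean_sketch(x, mean, var_sum, weight, bias):
    # mean, var_sum: (1, 1536, 1, 1) fp32; weight, bias: (1536,) fp32
    inv_std = (var_sum / 6272.0 + 1e-05).rsqrt()
    y = (x.float() - mean) * inv_std * weight.view(1, -1, 1, 1) + bias.view(1, -1, 1, 1)
    y = torch.relu(y.half())
    le_mask = y <= 0                      # out_ptr1: mask saved for threshold_backward
    pooled = y.float().mean(dim=(2, 3))   # out_ptr3: 7x7 global average pool -> (128, 1536)
    return le_mask, pooled.half()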
# kernel path: /tmp/torchinductor_shunting/hq/chqge2y77s52vahqshrpnxrmtqywhsxmxfiwmel7jsbqfdldligh.py
# Original ATen: aten._to_copy, aten.t
# aten._to_copy => convert_element_type_401
# aten.t => permute_1
triton_poi_fused__to_copy_t_231 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[2097152], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: '*fp16', 3: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_t_231(in_ptr0, out_ptr0, out_ptr1, xnumel, XBLOCK : tl.constexpr):
xnumel = 1536000
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), None)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, None)
tl.store(out_ptr1 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, None)
def get_args():
arg_0 = rand_strided((1000, 1536), (1536, 1), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((1000, 1536), (1536, 1), device='cuda:0', dtype=torch.float16)
arg_2 = rand_strided((1000, 1536), (1536, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1, arg_2,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_t_231.run(*args, 1536000, grid=grid(1536000), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_t_231.benchmark_all_configs(*args, 1536000, grid=grid(1536000))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
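# Reference note (not generated output): the kernel above only casts the fp32
# classifier weight (1000, 1536) to fp16, writing the same values into two
# buffers; the "t" in the name reflects that one copy is consumed transposed
# (via strides) by the downstream matmul. Roughly: w16 = w.to(torch.float16).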
# kernel path: /tmp/torchinductor_shunting/kk/ckkhmi7qwvpeybxqkogp5ah6kylzhjoxai5hqgfzedjta2pj3hyg.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_400
triton_poi_fused__to_copy_232 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[1024], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_232(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 1000
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((1000,), (1,), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((1000,), (1,), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_232.run(*args, 1000, grid=grid(1000), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_232.benchmark_all_configs(*args, 1000, grid=grid(1000))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
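# Reference note (not generated output): same cast pattern as above, applied
# to the 1000-element classifier bias: b16 = b.to(torch.float16).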
# kernel path: /tmp/torchinductor_shunting/gb/cgbszhl6qhqzdxycxrv6h7svhilojvsjjhnimvf6vwfj77spvald.py
# Original ATen: aten.add
# aten.add => add
triton_poi_fused_add_233 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[1], filename=__file__, meta={'signature': {0: '*i64', 1: '*i64', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1), equal_to_1=())]})
@triton.jit
def triton_poi_fused_add_233(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 1
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
tmp0 = tl.load(in_ptr0 + (0))
tmp1 = tl.broadcast_to(tmp0, [XBLOCK])
tmp2 = 1
tmp3 = tmp1 + tmp2
tl.store(out_ptr0 + (0 + tl.zeros([XBLOCK], tl.int32)), tmp3, None)
def get_args():
arg_0 = rand_strided((), (), device='cuda:0', dtype=torch.int64)
arg_1 = rand_strided((), (), device='cuda:0', dtype=torch.int64)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_add_233.run(*args, 1, grid=grid(1), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_add_233.benchmark_all_configs(*args, 1, grid=grid(1))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
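# Reference note (not generated output): this scalar kernel adds 1 to a 0-d
# int64 tensor. Given the surrounding batch-norm kernels, it presumably
# implements the num_batches_tracked += 1 update performed once per training
# step.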
async_compile.wait(globals())
del async_compile
def call(args):
    primals_1, primals_2, primals_3, primals_4, primals_5, primals_6, primals_7, primals_8, primals_9, primals_10, primals_11, primals_12, primals_13, primals_14, primals_15, primals_16, primals_17, primals_18, primals_19, primals_20, primals_21, primals_22, primals_23, primals_24, primals_25, primals_26, primals_27, primals_28, primals_29, primals_30, primals_31, primals_32, primals_33, primals_34, primals_35, primals_36, primals_37, primals_38, primals_39, primals_40, primals_41, primals_42, primals_43, primals_44, primals_45, primals_46, primals_47, primals_48, primals_49, primals_50, primals_51, primals_52, primals_53, primals_54, primals_55, primals_56, primals_57, primals_58, primals_59, primals_60, primals_61, primals_62, primals_63, primals_64, primals_65, primals_66, primals_67, primals_68, primals_69, primals_70, primals_71, primals_72, primals_73, primals_74, primals_75, primals_76, primals_77, primals_78, primals_79, primals_80, primals_81, primals_82, primals_83, primals_84, primals_85, primals_86, primals_87, primals_88, primals_89, primals_90, primals_91, primals_92, primals_93, primals_94, primals_95, primals_96, primals_97, primals_98, primals_99, primals_100, primals_101, primals_102, primals_103, primals_104, primals_105, primals_106, primals_107, primals_108, primals_109, primals_110, primals_111, primals_112, primals_113, primals_114, primals_115, primals_116, primals_117, primals_118, primals_119, primals_120, primals_121, primals_122, primals_123, primals_124, primals_125, primals_126, primals_127, primals_128, primals_129, primals_130, primals_131, primals_132, primals_133, primals_134, primals_135, primals_136, primals_137, primals_138, primals_139, primals_140, primals_141, primals_142, primals_143, primals_144, primals_145, primals_146, primals_147, primals_148, primals_149, primals_150, primals_151, primals_152, primals_153, primals_154, primals_155, primals_156, primals_157, primals_158, primals_159, primals_160, primals_161, primals_162, primals_163, primals_164, primals_165, primals_166, primals_167, primals_168, primals_169, primals_170, primals_171, primals_172, primals_173, primals_174, primals_175, primals_176, primals_177, primals_178, primals_179, primals_180, primals_181, primals_182, primals_183, primals_184, primals_185, primals_186, primals_187, primals_188, primals_189, primals_190, primals_191, primals_192, primals_193, primals_194, primals_195, primals_196, primals_197, primals_198, primals_199, primals_200, primals_201, primals_202, primals_203, primals_204, primals_205, primals_206, primals_207, primals_208, primals_209, primals_210, primals_211, primals_212, primals_213, primals_214, primals_215, primals_216, primals_217, primals_218, primals_219, primals_220, primals_221, primals_222, primals_223, primals_224, primals_225, primals_226, primals_227, primals_228, primals_229, primals_230, primals_231, primals_232, primals_233, primals_234, primals_235, primals_236, primals_237, primals_238, primals_239, primals_240, primals_241, primals_242, primals_243, primals_244, primals_245, primals_246, primals_247, primals_248, primals_249, primals_250, primals_251, primals_252, primals_253, primals_254, primals_255, primals_256, primals_257, primals_258, primals_259, primals_260, primals_261, primals_262, primals_263, primals_264, primals_265, primals_266, primals_267, primals_268, primals_269, primals_270, primals_271, primals_272, primals_273, primals_274, primals_275, primals_276, primals_277, primals_278, primals_279, primals_280, primals_281, \
primals_282, primals_283, primals_284, primals_285, primals_286, primals_287, primals_288, primals_289, primals_290, primals_291, primals_292, primals_293, primals_294, primals_295, primals_296, primals_297, primals_298, primals_299, primals_300, primals_301, primals_302, primals_303, primals_304, primals_305, primals_306, primals_307, primals_308, primals_309, primals_310, primals_311, primals_312, primals_313, primals_314, primals_315, primals_316, primals_317, primals_318, primals_319, primals_320, primals_321, primals_322, primals_323, primals_324, primals_325, primals_326, primals_327, primals_328, primals_329, primals_330, primals_331, primals_332, primals_333, primals_334, primals_335, primals_336, primals_337, primals_338, primals_339, primals_340, primals_341, primals_342, primals_343, primals_344, primals_345, primals_346, primals_347, primals_348, primals_349, primals_350, primals_351, primals_352, primals_353, primals_354, primals_355, primals_356, primals_357, primals_358, primals_359, primals_360, primals_361, primals_362, primals_363, primals_364, primals_365, primals_366, primals_367, primals_368, primals_369, primals_370, primals_371, primals_372, primals_373, primals_374, primals_375, primals_376, primals_377, primals_378, primals_379, primals_380, primals_381, primals_382, primals_383, primals_384, primals_385, primals_386, primals_387, primals_388, primals_389, primals_390, primals_391, primals_392, primals_393, primals_394, primals_395, primals_396, primals_397, primals_398, primals_399, primals_400, primals_401, primals_402, primals_403, primals_404, primals_405, primals_406, primals_407, primals_408, primals_409, primals_410, primals_411, primals_412, primals_413, primals_414, primals_415, primals_416, primals_417, primals_418, primals_419, primals_420, primals_421, primals_422, primals_423, primals_424, primals_425, primals_426, primals_427, primals_428, primals_429, primals_430, primals_431, primals_432, primals_433, primals_434, primals_435, primals_436, primals_437, primals_438, primals_439, primals_440, primals_441, primals_442, primals_443, primals_444, primals_445, primals_446, primals_447, primals_448, primals_449, primals_450, primals_451, primals_452, primals_453, primals_454, primals_455, primals_456, primals_457, primals_458, primals_459, primals_460, primals_461, primals_462, primals_463, primals_464, primals_465, primals_466, primals_467, primals_468, primals_469, primals_470, primals_471, primals_472, primals_473, primals_474, primals_475, primals_476, primals_477, primals_478, primals_479, primals_480 = args
args.clear()
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0) # no-op to ensure context
buf0 = empty_strided((32, 3, 3, 3), (27, 9, 3, 1), device='cuda', dtype=torch.float16)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_0.run(primals_117, buf0, 864, grid=grid(864), stream=stream0)
del primals_117
buf1 = empty_strided((128, 3, 224, 224), (150528, 50176, 224, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_1.run(primals_480, buf1, 19267584, grid=grid(19267584), stream=stream0)
del primals_480
buf2 = extern_kernels.convolution(buf1, buf0, stride=(2, 2), padding=(1, 1), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None)
assert_size_stride(buf2, (128, 32, 112, 112), (401408, 12544, 112, 1))
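        # Stem convolution: 3 -> 32 channels at stride 2, taking the 224x224
        # input down to 112x112 (shapes per the assert above).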
buf3 = empty_strided((1, 32, 1, 1, 14), (448, 1, 448, 448, 32), device='cuda', dtype=torch.float32)
triton_red_fused__native_batch_norm_legit_functional_2.run(buf2, buf3, 448, 114688, grid=grid(448), stream=stream0)
buf4 = empty_strided((1, 32, 1, 1), (32, 1, 32, 32), device='cuda', dtype=torch.float32)
buf5 = buf4; del buf4 # reuse
buf9 = empty_strided((32, ), (1, ), device='cuda', dtype=torch.float32)
triton_per_fused__native_batch_norm_legit_functional_3.run(buf5, buf3, primals_307, buf9, 32, 14, grid=grid(32), stream=stream0)
del primals_307
buf6 = buf3; del buf3 # reuse
triton_red_fused__native_batch_norm_legit_functional_4.run(buf2, buf5, buf6, 448, 114688, grid=grid(448), stream=stream0)
buf7 = empty_strided((1, 32, 1, 1), (32, 1, 32, 32), device='cuda', dtype=torch.float32)
buf8 = empty_strided((32, ), (1, ), device='cuda', dtype=torch.float32)
buf10 = empty_strided((32, ), (1, ), device='cuda', dtype=torch.float32)
triton_per_fused__native_batch_norm_legit_functional_5.run(buf6, primals_308, buf7, buf8, buf10, 32, 14, grid=grid(32), stream=stream0)
del primals_308
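        # buf3..buf10 appear to compute buf2's batch-norm statistics in two
        # reduction passes (per-chunk sums -> mean in buf5, then squared
        # deviations -> variance in buf7), updating the running stats
        # (buf9/buf10) alongside.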
buf11 = empty_strided((128, 32, 112, 112), (401408, 12544, 112, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__native_batch_norm_legit_functional_relu_6.run(buf2, buf5, buf7, primals_1, primals_2, buf11, 51380224, grid=grid(51380224), stream=stream0)
del primals_2
buf12 = empty_strided((32, 1, 3, 3), (9, 9, 3, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_7.run(primals_118, buf12, 288, grid=grid(288), stream=stream0)
del primals_118
buf13 = extern_kernels.convolution(buf11, buf12, stride=(1, 1), padding=(1, 1), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=32, bias=None)
assert_size_stride(buf13, (128, 32, 112, 112), (401408, 12544, 112, 1))
buf14 = buf6; del buf6 # reuse
triton_red_fused__native_batch_norm_legit_functional_2.run(buf13, buf14, 448, 114688, grid=grid(448), stream=stream0)
buf15 = buf7; del buf7 # reuse
buf16 = buf15; del buf15 # reuse
buf20 = empty_strided((32, ), (1, ), device='cuda', dtype=torch.float32)
triton_per_fused__native_batch_norm_legit_functional_3.run(buf16, buf14, primals_310, buf20, 32, 14, grid=grid(32), stream=stream0)
del primals_310
buf17 = buf14; del buf14 # reuse
triton_red_fused__native_batch_norm_legit_functional_4.run(buf13, buf16, buf17, 448, 114688, grid=grid(448), stream=stream0)
buf18 = empty_strided((1, 32, 1, 1), (32, 1, 32, 32), device='cuda', dtype=torch.float32)
buf19 = empty_strided((32, ), (1, ), device='cuda', dtype=torch.float32)
buf21 = empty_strided((32, ), (1, ), device='cuda', dtype=torch.float32)
triton_per_fused__native_batch_norm_legit_functional_5.run(buf17, primals_311, buf18, buf19, buf21, 32, 14, grid=grid(32), stream=stream0)
del primals_311
buf22 = empty_strided((128, 32, 112, 112), (401408, 12544, 112, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__native_batch_norm_legit_functional_relu_6.run(buf13, buf16, buf18, primals_3, primals_4, buf22, 51380224, grid=grid(51380224), stream=stream0)
del primals_4
buf23 = empty_strided((32, 32, 1, 1), (32, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_8.run(primals_119, buf23, 1024, grid=grid(1024), stream=stream0)
del primals_119
buf24 = extern_kernels.convolution(buf22, buf23, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None)
assert_size_stride(buf24, (128, 32, 112, 112), (401408, 12544, 112, 1))
buf25 = buf17; del buf17 # reuse
triton_red_fused__native_batch_norm_legit_functional_2.run(buf24, buf25, 448, 114688, grid=grid(448), stream=stream0)
buf26 = buf18; del buf18 # reuse
buf27 = buf26; del buf26 # reuse
buf31 = empty_strided((32, ), (1, ), device='cuda', dtype=torch.float32)
triton_per_fused__native_batch_norm_legit_functional_3.run(buf27, buf25, primals_313, buf31, 32, 14, grid=grid(32), stream=stream0)
del primals_313
buf28 = buf25; del buf25 # reuse
triton_red_fused__native_batch_norm_legit_functional_4.run(buf24, buf27, buf28, 448, 114688, grid=grid(448), stream=stream0)
buf29 = empty_strided((1, 32, 1, 1), (32, 1, 32, 32), device='cuda', dtype=torch.float32)
buf30 = empty_strided((32, ), (1, ), device='cuda', dtype=torch.float32)
buf32 = empty_strided((32, ), (1, ), device='cuda', dtype=torch.float32)
triton_per_fused__native_batch_norm_legit_functional_5.run(buf28, primals_314, buf29, buf30, buf32, 32, 14, grid=grid(32), stream=stream0)
del buf28
del primals_314
buf33 = empty_strided((128, 32, 112, 112), (401408, 12544, 112, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__native_batch_norm_legit_functional_add_9.run(buf24, buf27, buf29, primals_5, primals_6, buf11, buf33, 51380224, grid=grid(51380224), stream=stream0)
del buf29
del primals_6
buf34 = empty_strided((128, 16, 112, 112), (200704, 12544, 112, 1), device='cuda', dtype=torch.float16)
triton_poi_fused_split_with_sizes_10.run(buf33, buf34, 25690112, grid=grid(25690112), stream=stream0)
buf35 = empty_strided((128, 16, 112, 112), (200704, 12544, 112, 1), device='cuda', dtype=torch.float16)
triton_poi_fused_split_with_sizes_11.run(buf33, buf35, 25690112, grid=grid(25690112), stream=stream0)
del buf33
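        # buf34/buf35 split buf33's 32 channels into two 16-channel halves,
        # each expanded by its own 1x1 convolution below (a grouped pointwise
        # expansion, MixNet-style).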
buf36 = empty_strided((96, 16, 1, 1), (16, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_12.run(primals_120, buf36, 1536, grid=grid(1536), stream=stream0)
del primals_120
buf37 = extern_kernels.convolution(buf34, buf36, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None)
assert_size_stride(buf37, (128, 96, 112, 112), (1204224, 12544, 112, 1))
buf38 = empty_strided((96, 16, 1, 1), (16, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_12.run(primals_121, buf38, 1536, grid=grid(1536), stream=stream0)
del primals_121
buf39 = extern_kernels.convolution(buf35, buf38, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None)
assert_size_stride(buf39, (128, 96, 112, 112), (1204224, 12544, 112, 1))
buf42 = empty_strided((128, 192, 112, 112), (2408448, 12544, 112, 1), device='cuda', dtype=torch.float16)
buf40 = as_strided(buf42, (128, 96, 112, 112), (2408448, 12544, 112, 1)) # alias
triton_poi_fused_cat_13.run(buf37, buf40, 154140672, grid=grid(154140672), stream=stream0)
del buf37
buf41 = as_strided(buf42, (128, 96, 112, 112), (2408448, 12544, 112, 1), 1204224) # alias
triton_poi_fused_cat_13.run(buf39, buf41, 154140672, grid=grid(154140672), stream=stream0)
del buf39
buf43 = empty_strided((1, 192, 1, 1, 13), (2496, 1, 2496, 2496, 192), device='cuda', dtype=torch.float32)
triton_red_fused__native_batch_norm_legit_functional_14.run(buf42, buf43, 2496, 123511, grid=grid(2496), stream=stream0)
buf44 = empty_strided((1, 192, 1, 1), (192, 1, 192, 192), device='cuda', dtype=torch.float32)
buf45 = buf44; del buf44 # reuse
buf49 = empty_strided((192, ), (1, ), device='cuda', dtype=torch.float32)
triton_per_fused__native_batch_norm_legit_functional_15.run(buf45, buf43, primals_316, buf49, 192, 13, grid=grid(192), stream=stream0)
del primals_316
buf46 = buf43; del buf43 # reuse
triton_red_fused__native_batch_norm_legit_functional_16.run(buf42, buf45, buf46, 2496, 123511, grid=grid(2496), stream=stream0)
buf47 = empty_strided((1, 192, 1, 1), (192, 1, 192, 192), device='cuda', dtype=torch.float32)
buf48 = empty_strided((192, ), (1, ), device='cuda', dtype=torch.float32)
buf50 = empty_strided((192, ), (1, ), device='cuda', dtype=torch.float32)
triton_per_fused__native_batch_norm_legit_functional_17.run(buf46, primals_317, buf47, buf48, buf50, 192, 13, grid=grid(192), stream=stream0)
del buf46
del primals_317
buf51 = empty_strided((128, 192, 112, 112), (2408448, 12544, 112, 1), device='cuda', dtype=torch.float16)
buf1155 = empty_strided((128, 192, 112, 112), (2408448, 12544, 112, 1), device='cuda', dtype=torch.bool)
triton_poi_fused__native_batch_norm_legit_functional_relu_threshold_backward_18.run(buf42, buf45, buf47, primals_7, primals_8, buf51, buf1155, 308281344, grid=grid(308281344), stream=stream0)
del primals_8
buf52 = empty_strided((64, 1, 3, 3), (9, 9, 3, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_19.run(primals_122, buf52, 576, grid=grid(576), stream=stream0)
del primals_122
buf53 = extern_kernels.convolution(as_strided(buf51, (128, 64, 112, 112), (2408448, 12544, 112, 1)), buf52, stride=(2, 2), padding=(1, 1), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=64, bias=None)
assert_size_stride(buf53, (128, 64, 56, 56), (200704, 3136, 56, 1))
buf54 = empty_strided((64, 1, 5, 5), (25, 25, 5, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_20.run(primals_123, buf54, 1600, grid=grid(1600), stream=stream0)
del primals_123
buf55 = extern_kernels.convolution(as_strided(buf51, (128, 64, 112, 112), (2408448, 12544, 112, 1), 802816), buf54, stride=(2, 2), padding=(2, 2), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=64, bias=None)
assert_size_stride(buf55, (128, 64, 56, 56), (200704, 3136, 56, 1))
buf56 = empty_strided((64, 1, 7, 7), (49, 49, 7, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_21.run(primals_124, buf56, 3136, grid=grid(3136), stream=stream0)
del primals_124
buf57 = extern_kernels.convolution(as_strided(buf51, (128, 64, 112, 112), (2408448, 12544, 112, 1), 1605632), buf56, stride=(2, 2), padding=(3, 3), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=64, bias=None)
assert_size_stride(buf57, (128, 64, 56, 56), (200704, 3136, 56, 1))
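        # buf53/buf55/buf57: depthwise convolutions over three 64-channel
        # slices of buf51 with kernel sizes 3, 5, and 7 (mixed depthwise
        # kernels), each downsampling 112x112 -> 56x56 at stride 2.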
buf61 = empty_strided((128, 192, 56, 56), (602112, 3136, 56, 1), device='cuda', dtype=torch.float16)
buf58 = as_strided(buf61, (128, 64, 56, 56), (602112, 3136, 56, 1)) # alias
triton_poi_fused_cat_22.run(buf53, buf58, 25690112, grid=grid(25690112), stream=stream0)
del buf53
buf59 = as_strided(buf61, (128, 64, 56, 56), (602112, 3136, 56, 1), 200704) # alias
triton_poi_fused_cat_22.run(buf55, buf59, 25690112, grid=grid(25690112), stream=stream0)
del buf55
buf60 = as_strided(buf61, (128, 64, 56, 56), (602112, 3136, 56, 1), 401408) # alias
triton_poi_fused_cat_22.run(buf57, buf60, 25690112, grid=grid(25690112), stream=stream0)
del buf57
buf62 = empty_strided((1, 192, 1, 1, 4), (768, 1, 768, 768, 192), device='cuda', dtype=torch.float32)
triton_red_fused__native_batch_norm_legit_functional_23.run(buf61, buf62, 768, 100352, grid=grid(768), stream=stream0)
buf63 = buf47; del buf47 # reuse
buf64 = buf63; del buf63 # reuse
buf68 = empty_strided((192, ), (1, ), device='cuda', dtype=torch.float32)
triton_per_fused__native_batch_norm_legit_functional_24.run(buf64, buf62, primals_319, buf68, 192, 4, grid=grid(192), stream=stream0)
del primals_319
buf65 = buf62; del buf62 # reuse
triton_red_fused__native_batch_norm_legit_functional_25.run(buf61, buf64, buf65, 768, 100352, grid=grid(768), stream=stream0)
buf66 = empty_strided((1, 192, 1, 1), (192, 1, 192, 192), device='cuda', dtype=torch.float32)
buf67 = empty_strided((192, ), (1, ), device='cuda', dtype=torch.float32)
buf69 = empty_strided((192, ), (1, ), device='cuda', dtype=torch.float32)
triton_per_fused__native_batch_norm_legit_functional_26.run(buf65, primals_320, buf66, buf67, buf69, 192, 4, grid=grid(192), stream=stream0)
del buf65
del primals_320
buf70 = empty_strided((128, 192, 56, 56), (602112, 3136, 56, 1), device='cuda', dtype=torch.float16)
buf1154 = empty_strided((128, 192, 56, 56), (602112, 3136, 56, 1), device='cuda', dtype=torch.bool)
triton_poi_fused__native_batch_norm_legit_functional_relu_threshold_backward_27.run(buf61, buf64, buf66, primals_9, primals_10, buf70, buf1154, 77070336, grid=grid(77070336), stream=stream0)
del buf66
del primals_10
buf71 = empty_strided((20, 96, 1, 1), (96, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_28.run(primals_125, buf71, 1920, grid=grid(1920), stream=stream0)
del primals_125
buf72 = extern_kernels.convolution(as_strided(buf70, (128, 96, 56, 56), (602112, 3136, 56, 1)), buf71, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None)
assert_size_stride(buf72, (128, 20, 56, 56), (62720, 3136, 56, 1))
buf73 = empty_strided((20, 96, 1, 1), (96, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_28.run(primals_126, buf73, 1920, grid=grid(1920), stream=stream0)
del primals_126
buf74 = extern_kernels.convolution(as_strided(buf70, (128, 96, 56, 56), (602112, 3136, 56, 1), 301056), buf73, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None)
assert_size_stride(buf74, (128, 20, 56, 56), (62720, 3136, 56, 1))
buf77 = empty_strided((128, 40, 56, 56), (125440, 3136, 56, 1), device='cuda', dtype=torch.float16)
buf75 = as_strided(buf77, (128, 20, 56, 56), (125440, 3136, 56, 1)) # alias
triton_poi_fused_cat_29.run(buf72, buf75, 8028160, grid=grid(8028160), stream=stream0)
del buf72
buf76 = as_strided(buf77, (128, 20, 56, 56), (125440, 3136, 56, 1), 62720) # alias
triton_poi_fused_cat_29.run(buf74, buf76, 8028160, grid=grid(8028160), stream=stream0)
del buf74
buf78 = empty_strided((1, 40, 1, 1, 13), (520, 1, 520, 520, 40), device='cuda', dtype=torch.float32)
triton_red_fused__native_batch_norm_legit_functional_30.run(buf77, buf78, 520, 30878, grid=grid(520), stream=stream0)
buf79 = empty_strided((1, 40, 1, 1), (40, 1, 40, 40), device='cuda', dtype=torch.float32)
buf80 = buf79; del buf79 # reuse
buf84 = empty_strided((40, ), (1, ), device='cuda', dtype=torch.float32)
triton_per_fused__native_batch_norm_legit_functional_31.run(buf80, buf78, primals_322, buf84, 40, 13, grid=grid(40), stream=stream0)
del primals_322
buf81 = buf78; del buf78 # reuse
triton_red_fused__native_batch_norm_legit_functional_32.run(buf77, buf80, buf81, 520, 30878, grid=grid(520), stream=stream0)
buf82 = empty_strided((1, 40, 1, 1), (40, 1, 40, 40), device='cuda', dtype=torch.float32)
buf83 = empty_strided((40, ), (1, ), device='cuda', dtype=torch.float32)
buf85 = empty_strided((40, ), (1, ), device='cuda', dtype=torch.float32)
triton_per_fused__native_batch_norm_legit_functional_33.run(buf81, primals_323, buf82, buf83, buf85, 40, 13, grid=grid(40), stream=stream0)
del primals_323
buf86 = empty_strided((128, 40, 56, 56), (125440, 3136, 56, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__native_batch_norm_legit_functional_34.run(buf77, buf80, buf82, primals_11, primals_12, buf86, 16056320, grid=grid(16056320), stream=stream0)
del primals_12
buf87 = empty_strided((60, 20, 1, 1), (20, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_35.run(primals_127, buf87, 1200, grid=grid(1200), stream=stream0)
del primals_127
buf88 = extern_kernels.convolution(as_strided(buf86, (128, 20, 56, 56), (125440, 3136, 56, 1)), buf87, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None)
assert_size_stride(buf88, (128, 60, 56, 56), (188160, 3136, 56, 1))
buf89 = empty_strided((60, 20, 1, 1), (20, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_35.run(primals_128, buf89, 1200, grid=grid(1200), stream=stream0)
del primals_128
buf90 = extern_kernels.convolution(as_strided(buf86, (128, 20, 56, 56), (125440, 3136, 56, 1), 62720), buf89, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None)
assert_size_stride(buf90, (128, 60, 56, 56), (188160, 3136, 56, 1))
buf93 = empty_strided((128, 120, 56, 56), (376320, 3136, 56, 1), device='cuda', dtype=torch.float16)
buf91 = as_strided(buf93, (128, 60, 56, 56), (376320, 3136, 56, 1)) # alias
triton_poi_fused_cat_36.run(buf88, buf91, 24084480, grid=grid(24084480), stream=stream0)
buf92 = as_strided(buf93, (128, 60, 56, 56), (376320, 3136, 56, 1), 188160) # alias
triton_poi_fused_cat_36.run(buf90, buf92, 24084480, grid=grid(24084480), stream=stream0)
buf94 = empty_strided((1, 120, 1, 1, 4), (480, 1, 480, 480, 120), device='cuda', dtype=torch.float32)
triton_red_fused__native_batch_norm_legit_functional_37.run(buf93, buf94, 480, 100352, grid=grid(480), stream=stream0)
buf95 = empty_strided((1, 120, 1, 1), (120, 1, 120, 120), device='cuda', dtype=torch.float32)
buf96 = buf95; del buf95 # reuse
buf100 = empty_strided((120, ), (1, ), device='cuda', dtype=torch.float32)
triton_per_fused__native_batch_norm_legit_functional_38.run(buf96, buf94, primals_325, buf100, 120, 4, grid=grid(120), stream=stream0)
del primals_325
buf97 = buf94; del buf94 # reuse
triton_red_fused__native_batch_norm_legit_functional_39.run(buf93, buf96, buf97, 480, 100352, grid=grid(480), stream=stream0)
buf98 = empty_strided((1, 120, 1, 1), (120, 1, 120, 120), device='cuda', dtype=torch.float32)
buf99 = empty_strided((120, ), (1, ), device='cuda', dtype=torch.float32)
buf101 = empty_strided((120, ), (1, ), device='cuda', dtype=torch.float32)
triton_per_fused__native_batch_norm_legit_functional_40.run(buf97, primals_326, buf98, buf99, buf101, 120, 4, grid=grid(120), stream=stream0)
del primals_326
buf102 = empty_strided((128, 120, 56, 56), (376320, 3136, 56, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__native_batch_norm_legit_functional_relu_41.run(buf93, buf96, buf98, primals_13, primals_14, buf102, 48168960, grid=grid(48168960), stream=stream0)
del primals_14
buf103 = empty_strided((120, 1, 3, 3), (9, 9, 3, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_42.run(primals_129, buf103, 1080, grid=grid(1080), stream=stream0)
del primals_129
buf104 = extern_kernels.convolution(buf102, buf103, stride=(1, 1), padding=(1, 1), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=120, bias=None)
assert_size_stride(buf104, (128, 120, 56, 56), (376320, 3136, 56, 1))
buf105 = buf97; del buf97 # reuse
triton_red_fused__native_batch_norm_legit_functional_37.run(buf104, buf105, 480, 100352, grid=grid(480), stream=stream0)
buf106 = buf98; del buf98 # reuse
buf107 = buf106; del buf106 # reuse
buf111 = empty_strided((120, ), (1, ), device='cuda', dtype=torch.float32)
triton_per_fused__native_batch_norm_legit_functional_38.run(buf107, buf105, primals_328, buf111, 120, 4, grid=grid(120), stream=stream0)
del primals_328
buf108 = buf105; del buf105 # reuse
triton_red_fused__native_batch_norm_legit_functional_39.run(buf104, buf107, buf108, 480, 100352, grid=grid(480), stream=stream0)
buf109 = empty_strided((1, 120, 1, 1), (120, 1, 120, 120), device='cuda', dtype=torch.float32)
buf110 = empty_strided((120, ), (1, ), device='cuda', dtype=torch.float32)
buf112 = empty_strided((120, ), (1, ), device='cuda', dtype=torch.float32)
triton_per_fused__native_batch_norm_legit_functional_40.run(buf108, primals_329, buf109, buf110, buf112, 120, 4, grid=grid(120), stream=stream0)
del primals_329
buf113 = empty_strided((128, 120, 56, 56), (376320, 3136, 56, 1), device='cuda', dtype=torch.float16)
buf1153 = empty_strided((128, 120, 56, 56), (376320, 3136, 56, 1), device='cuda', dtype=torch.bool)
triton_poi_fused__native_batch_norm_legit_functional_relu_threshold_backward_43.run(buf104, buf107, buf109, primals_15, primals_16, buf113, buf1153, 48168960, grid=grid(48168960), stream=stream0)
del buf109
del primals_16
buf114 = empty_strided((20, 60, 1, 1), (60, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_44.run(primals_130, buf114, 1200, grid=grid(1200), stream=stream0)
del primals_130
buf115 = extern_kernels.convolution(as_strided(buf113, (128, 60, 56, 56), (376320, 3136, 56, 1)), buf114, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None)
assert_size_stride(buf115, (128, 20, 56, 56), (62720, 3136, 56, 1))
buf116 = empty_strided((20, 60, 1, 1), (60, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_44.run(primals_131, buf116, 1200, grid=grid(1200), stream=stream0)
del primals_131
buf117 = extern_kernels.convolution(as_strided(buf113, (128, 60, 56, 56), (376320, 3136, 56, 1), 188160), buf116, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None)
assert_size_stride(buf117, (128, 20, 56, 56), (62720, 3136, 56, 1))
buf120 = empty_strided((128, 40, 56, 56), (125440, 3136, 56, 1), device='cuda', dtype=torch.float16)
buf118 = as_strided(buf120, (128, 20, 56, 56), (125440, 3136, 56, 1)) # alias
triton_poi_fused_cat_29.run(buf115, buf118, 8028160, grid=grid(8028160), stream=stream0)
del buf115
buf119 = as_strided(buf120, (128, 20, 56, 56), (125440, 3136, 56, 1), 62720) # alias
triton_poi_fused_cat_29.run(buf117, buf119, 8028160, grid=grid(8028160), stream=stream0)
del buf117
buf121 = buf81; del buf81 # reuse
triton_red_fused__native_batch_norm_legit_functional_30.run(buf120, buf121, 520, 30878, grid=grid(520), stream=stream0)
buf122 = buf82; del buf82 # reuse
buf123 = buf122; del buf122 # reuse
buf127 = empty_strided((40, ), (1, ), device='cuda', dtype=torch.float32)
triton_per_fused__native_batch_norm_legit_functional_31.run(buf123, buf121, primals_331, buf127, 40, 13, grid=grid(40), stream=stream0)
del primals_331
buf124 = buf121; del buf121 # reuse
triton_red_fused__native_batch_norm_legit_functional_32.run(buf120, buf123, buf124, 520, 30878, grid=grid(520), stream=stream0)
buf125 = empty_strided((1, 40, 1, 1), (40, 1, 40, 40), device='cuda', dtype=torch.float32)
buf126 = empty_strided((40, ), (1, ), device='cuda', dtype=torch.float32)
buf128 = empty_strided((40, ), (1, ), device='cuda', dtype=torch.float32)
triton_per_fused__native_batch_norm_legit_functional_33.run(buf124, primals_332, buf125, buf126, buf128, 40, 13, grid=grid(40), stream=stream0)
del buf124
del primals_332
buf129 = empty_strided((128, 40, 56, 56), (125440, 3136, 56, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__native_batch_norm_legit_functional_add_45.run(buf120, buf123, buf125, primals_17, primals_18, buf86, buf129, 16056320, grid=grid(16056320), stream=stream0)
del buf125
del primals_18
buf130 = empty_strided((240, 40, 1, 1), (40, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_46.run(primals_132, buf130, 9600, grid=grid(9600), stream=stream0)
del primals_132
buf131 = extern_kernels.convolution(buf129, buf130, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None)
assert_size_stride(buf131, (128, 240, 56, 56), (752640, 3136, 56, 1))
buf132 = empty_strided((1, 240, 1, 1), (240, 1, 240, 240), device='cuda', dtype=torch.float32)
buf133 = buf132; del buf132 # reuse
buf136 = empty_strided((240, ), (1, ), device='cuda', dtype=torch.float32)
buf134 = empty_strided((1, 240, 1, 1), (240, 1, 240, 240), device='cuda', dtype=torch.float32)
buf135 = empty_strided((240, ), (1, ), device='cuda', dtype=torch.float32)
buf137 = empty_strided((240, ), (1, ), device='cuda', dtype=torch.float32)
triton_red_fused__native_batch_norm_legit_functional_47.run(buf133, buf131, primals_334, primals_335, buf136, buf134, buf135, buf137, 240, 401408, grid=grid(240), stream=stream0)
del primals_334
del primals_335
buf138 = empty_strided((128, 240, 56, 56), (752640, 3136, 56, 1), device='cuda', dtype=torch.float16)
buf1152 = empty_strided((128, 240, 56, 56), (752640, 3136, 56, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__native_batch_norm_legit_functional_add_clone_fill_mul_sigmoid_sub_48.run(buf131, buf133, buf134, primals_19, primals_20, buf138, buf1152, 96337920, grid=grid(96337920), stream=stream0)
del primals_20
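        # buf138 appears to be the batch-normalized activation passed through
        # SiLU (the mul_sigmoid in the kernel name), with buf1152 materialized
        # for reuse in the backward pass.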
buf139 = empty_strided((60, 1, 3, 3), (9, 9, 3, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_49.run(primals_133, buf139, 540, grid=grid(540), stream=stream0)
del primals_133
buf140 = buf90; del buf90 # reuse
triton_poi_fused_split_with_sizes_50.run(buf138, buf140, 24084480, grid=grid(24084480), stream=stream0)
buf141 = extern_kernels.convolution(buf140, buf139, stride=(2, 2), padding=(1, 1), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=60, bias=None)
assert_size_stride(buf141, (128, 60, 28, 28), (47040, 784, 28, 1))
buf142 = empty_strided((60, 1, 5, 5), (25, 25, 5, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_51.run(primals_134, buf142, 1500, grid=grid(1500), stream=stream0)
del primals_134
buf143 = buf88; del buf88 # reuse
triton_poi_fused_split_with_sizes_52.run(buf138, buf143, 24084480, grid=grid(24084480), stream=stream0)
buf144 = extern_kernels.convolution(buf143, buf142, stride=(2, 2), padding=(2, 2), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=60, bias=None)
assert_size_stride(buf144, (128, 60, 28, 28), (47040, 784, 28, 1))
buf145 = empty_strided((60, 1, 7, 7), (49, 49, 7, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_53.run(primals_135, buf145, 2940, grid=grid(2940), stream=stream0)
del primals_135
buf146 = empty_strided((128, 60, 56, 56), (188160, 3136, 56, 1), device='cuda', dtype=torch.float16)
triton_poi_fused_split_with_sizes_54.run(buf138, buf146, 24084480, grid=grid(24084480), stream=stream0)
buf147 = extern_kernels.convolution(buf146, buf145, stride=(2, 2), padding=(3, 3), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=60, bias=None)
assert_size_stride(buf147, (128, 60, 28, 28), (47040, 784, 28, 1))
buf148 = empty_strided((60, 1, 9, 9), (81, 81, 9, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_55.run(primals_136, buf148, 4860, grid=grid(4860), stream=stream0)
del primals_136
buf149 = empty_strided((128, 60, 56, 56), (188160, 3136, 56, 1), device='cuda', dtype=torch.float16)
triton_poi_fused_split_with_sizes_56.run(buf138, buf149, 24084480, grid=grid(24084480), stream=stream0)
del buf138
buf150 = extern_kernels.convolution(buf149, buf148, stride=(2, 2), padding=(4, 4), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=60, bias=None)
assert_size_stride(buf150, (128, 60, 28, 28), (47040, 784, 28, 1))
buf155 = empty_strided((128, 240, 28, 28), (188160, 784, 28, 1), device='cuda', dtype=torch.float16)
buf151 = as_strided(buf155, (128, 60, 28, 28), (188160, 784, 28, 1)) # alias
triton_poi_fused_cat_57.run(buf141, buf151, 6021120, grid=grid(6021120), stream=stream0)
buf152 = as_strided(buf155, (128, 60, 28, 28), (188160, 784, 28, 1), 47040) # alias
triton_poi_fused_cat_57.run(buf144, buf152, 6021120, grid=grid(6021120), stream=stream0)
buf153 = as_strided(buf155, (128, 60, 28, 28), (188160, 784, 28, 1), 94080) # alias
triton_poi_fused_cat_57.run(buf147, buf153, 6021120, grid=grid(6021120), stream=stream0)
buf154 = as_strided(buf155, (128, 60, 28, 28), (188160, 784, 28, 1), 141120) # alias
triton_poi_fused_cat_57.run(buf150, buf154, 6021120, grid=grid(6021120), stream=stream0)
buf156 = buf134; del buf134 # reuse
buf157 = buf156; del buf156 # reuse
buf160 = empty_strided((240, ), (1, ), device='cuda', dtype=torch.float32)
buf158 = empty_strided((1, 240, 1, 1), (240, 1, 240, 240), device='cuda', dtype=torch.float32)
buf159 = empty_strided((240, ), (1, ), device='cuda', dtype=torch.float32)
buf161 = empty_strided((240, ), (1, ), device='cuda', dtype=torch.float32)
triton_red_fused__native_batch_norm_legit_functional_58.run(buf157, buf155, primals_337, primals_338, buf160, buf158, buf159, buf161, 240, 100352, grid=grid(240), stream=stream0)
del primals_337
del primals_338
buf162 = empty_strided((128, 240, 28, 28), (188160, 784, 28, 1), device='cuda', dtype=torch.float16)
buf164 = empty_strided((128, 240, 1, 1), (240, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_per_fused__native_batch_norm_legit_functional_mean_silu_59.run(buf155, buf157, buf158, primals_21, primals_22, buf162, buf164, 30720, 784, grid=grid(30720), stream=stream0)
del buf158
del primals_22
buf165 = empty_strided((20, 240, 1, 1), (240, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_60.run(primals_137, buf165, 4800, grid=grid(4800), stream=stream0)
del primals_137
buf166 = empty_strided((20, ), (1, ), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_convolution_61.run(primals_138, buf166, 20, grid=grid(20), stream=stream0)
del primals_138
buf167 = extern_kernels.convolution(buf164, buf165, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None)
assert_size_stride(buf167, (128, 20, 1, 1), (20, 1, 1, 1))
buf168 = buf167; del buf167 # reuse
buf169 = empty_strided((128, 20, 1, 1), (20, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_convolution_silu_62.run(buf168, buf166, buf169, 2560, grid=grid(2560), stream=stream0)
del buf166
buf170 = empty_strided((240, 20, 1, 1), (20, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_63.run(primals_139, buf170, 4800, grid=grid(4800), stream=stream0)
del primals_139
buf171 = empty_strided((240, ), (1, ), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_convolution_64.run(primals_140, buf171, 240, grid=grid(240), stream=stream0)
del primals_140
buf172 = extern_kernels.convolution(buf169, buf170, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None)
assert_size_stride(buf172, (128, 240, 1, 1), (240, 1, 1, 1))
buf173 = buf172; del buf172 # reuse
triton_poi_fused__to_copy_convolution_65.run(buf173, buf171, 30720, grid=grid(30720), stream=stream0)
del buf171
buf174 = empty_strided((128, 240, 28, 28), (188160, 784, 28, 1), device='cuda', dtype=torch.float16)
triton_poi_fused_mul_sigmoid_silu_66.run(buf162, buf173, buf174, 24084480, grid=grid(24084480), stream=stream0)
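        # buf164..buf174 form a squeeze-and-excitation block: global average
        # pool (buf164), a 240 -> 20 reduction conv plus SiLU (buf169), a
        # 20 -> 240 expansion conv (buf173), and a sigmoid gate multiplied
        # back onto the activation (buf174).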
buf175 = empty_strided((56, 240, 1, 1), (240, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_67.run(primals_141, buf175, 13440, grid=grid(13440), stream=stream0)
del primals_141
buf176 = extern_kernels.convolution(buf174, buf175, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None)
assert_size_stride(buf176, (128, 56, 28, 28), (43904, 784, 28, 1))
buf177 = empty_strided((1, 56, 1, 1, 13), (728, 1, 728, 728, 56), device='cuda', dtype=torch.float32)
triton_red_fused__native_batch_norm_legit_functional_68.run(buf176, buf177, 728, 7720, grid=grid(728), stream=stream0)
buf178 = empty_strided((1, 56, 1, 1), (56, 1, 56, 56), device='cuda', dtype=torch.float32)
buf179 = buf178; del buf178 # reuse
buf183 = empty_strided((56, ), (1, ), device='cuda', dtype=torch.float32)
triton_per_fused__native_batch_norm_legit_functional_69.run(buf179, buf177, primals_340, buf183, 56, 13, grid=grid(56), stream=stream0)
del primals_340
buf180 = buf177; del buf177 # reuse
triton_red_fused__native_batch_norm_legit_functional_70.run(buf176, buf179, buf180, 728, 7720, grid=grid(728), stream=stream0)
buf181 = empty_strided((1, 56, 1, 1), (56, 1, 56, 56), device='cuda', dtype=torch.float32)
buf182 = empty_strided((56, ), (1, ), device='cuda', dtype=torch.float32)
buf184 = empty_strided((56, ), (1, ), device='cuda', dtype=torch.float32)
triton_per_fused__native_batch_norm_legit_functional_71.run(buf180, primals_341, buf181, buf182, buf184, 56, 13, grid=grid(56), stream=stream0)
del primals_341
buf185 = empty_strided((128, 56, 28, 28), (43904, 784, 28, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__native_batch_norm_legit_functional_72.run(buf176, buf179, buf181, primals_23, primals_24, buf185, 5619712, grid=grid(5619712), stream=stream0)
del primals_24
buf186 = empty_strided((168, 28, 1, 1), (28, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_73.run(primals_142, buf186, 4704, grid=grid(4704), stream=stream0)
del primals_142
buf187 = extern_kernels.convolution(as_strided(buf185, (128, 28, 28, 28), (43904, 784, 28, 1)), buf186, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None)
assert_size_stride(buf187, (128, 168, 28, 28), (131712, 784, 28, 1))
buf188 = empty_strided((168, 28, 1, 1), (28, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_73.run(primals_143, buf188, 4704, grid=grid(4704), stream=stream0)
del primals_143
buf189 = extern_kernels.convolution(as_strided(buf185, (128, 28, 28, 28), (43904, 784, 28, 1), 21952), buf188, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None)
assert_size_stride(buf189, (128, 168, 28, 28), (131712, 784, 28, 1))
buf192 = empty_strided((128, 336, 28, 28), (263424, 784, 28, 1), device='cuda', dtype=torch.float16)
buf190 = as_strided(buf192, (128, 168, 28, 28), (263424, 784, 28, 1)) # alias
triton_poi_fused_cat_74.run(buf187, buf190, 16859136, grid=grid(16859136), stream=stream0)
buf191 = as_strided(buf192, (128, 168, 28, 28), (263424, 784, 28, 1), 131712) # alias
triton_poi_fused_cat_74.run(buf189, buf191, 16859136, grid=grid(16859136), stream=stream0)
buf193 = empty_strided((1, 336, 1, 1), (336, 1, 336, 336), device='cuda', dtype=torch.float32)
buf194 = buf193; del buf193 # reuse
buf197 = empty_strided((336, ), (1, ), device='cuda', dtype=torch.float32)
buf195 = empty_strided((1, 336, 1, 1), (336, 1, 336, 336), device='cuda', dtype=torch.float32)
buf196 = empty_strided((336, ), (1, ), device='cuda', dtype=torch.float32)
buf198 = empty_strided((336, ), (1, ), device='cuda', dtype=torch.float32)
triton_red_fused__native_batch_norm_legit_functional_75.run(buf194, buf192, primals_343, primals_344, buf197, buf195, buf196, buf198, 336, 100352, grid=grid(336), stream=stream0)
del primals_343
del primals_344
buf199 = empty_strided((128, 336, 28, 28), (263424, 784, 28, 1), device='cuda', dtype=torch.float16)
buf1150 = empty_strided((128, 336, 28, 28), (263424, 784, 28, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__native_batch_norm_legit_functional_add_clone_fill_mul_sigmoid_sub_76.run(buf192, buf194, buf195, primals_25, primals_26, buf199, buf1150, 33718272, grid=grid(33718272), stream=stream0)
del primals_26
buf200 = empty_strided((168, 1, 3, 3), (9, 9, 3, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_77.run(primals_144, buf200, 1512, grid=grid(1512), stream=stream0)
del primals_144
buf201 = buf189; del buf189 # reuse
triton_poi_fused_split_with_sizes_78.run(buf199, buf201, 16859136, grid=grid(16859136), stream=stream0)
buf202 = extern_kernels.convolution(buf201, buf200, stride=(1, 1), padding=(1, 1), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=168, bias=None)
assert_size_stride(buf202, (128, 168, 28, 28), (131712, 784, 28, 1))
buf203 = empty_strided((168, 1, 5, 5), (25, 25, 5, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_79.run(primals_145, buf203, 4200, grid=grid(4200), stream=stream0)
del primals_145
buf204 = buf187; del buf187 # reuse
triton_poi_fused_split_with_sizes_80.run(buf199, buf204, 16859136, grid=grid(16859136), stream=stream0)
buf205 = extern_kernels.convolution(buf204, buf203, stride=(1, 1), padding=(2, 2), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=168, bias=None)
assert_size_stride(buf205, (128, 168, 28, 28), (131712, 784, 28, 1))
buf208 = buf199; del buf199 # reuse
buf206 = as_strided(buf208, (128, 168, 28, 28), (263424, 784, 28, 1)) # alias
triton_poi_fused_cat_74.run(buf202, buf206, 16859136, grid=grid(16859136), stream=stream0)
buf207 = as_strided(buf208, (128, 168, 28, 28), (263424, 784, 28, 1), 131712) # alias
triton_poi_fused_cat_74.run(buf205, buf207, 16859136, grid=grid(16859136), stream=stream0)
buf209 = buf195; del buf195 # reuse
buf210 = buf209; del buf209 # reuse
buf213 = empty_strided((336, ), (1, ), device='cuda', dtype=torch.float32)
buf211 = empty_strided((1, 336, 1, 1), (336, 1, 336, 336), device='cuda', dtype=torch.float32)
buf212 = empty_strided((336, ), (1, ), device='cuda', dtype=torch.float32)
buf214 = empty_strided((336, ), (1, ), device='cuda', dtype=torch.float32)
triton_red_fused__native_batch_norm_legit_functional_75.run(buf210, buf208, primals_346, primals_347, buf213, buf211, buf212, buf214, 336, 100352, grid=grid(336), stream=stream0)
del primals_346
del primals_347
buf215 = empty_strided((128, 336, 28, 28), (263424, 784, 28, 1), device='cuda', dtype=torch.float16)
buf217 = empty_strided((128, 336, 1, 1), (336, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_per_fused__native_batch_norm_legit_functional_mean_silu_81.run(buf208, buf210, buf211, primals_27, primals_28, buf215, buf217, 43008, 784, grid=grid(43008), stream=stream0)
del primals_28
buf218 = empty_strided((28, 336, 1, 1), (336, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_82.run(primals_146, buf218, 9408, grid=grid(9408), stream=stream0)
del primals_146
buf219 = empty_strided((28, ), (1, ), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_convolution_83.run(primals_147, buf219, 28, grid=grid(28), stream=stream0)
del primals_147
buf220 = extern_kernels.convolution(buf217, buf218, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None)
assert_size_stride(buf220, (128, 28, 1, 1), (28, 1, 1, 1))
buf221 = buf220; del buf220 # reuse
buf222 = empty_strided((128, 28, 1, 1), (28, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_convolution_silu_84.run(buf221, buf219, buf222, 3584, grid=grid(3584), stream=stream0)
buf223 = empty_strided((336, 28, 1, 1), (28, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_85.run(primals_148, buf223, 9408, grid=grid(9408), stream=stream0)
del primals_148
buf224 = empty_strided((336, ), (1, ), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_convolution_86.run(primals_149, buf224, 336, grid=grid(336), stream=stream0)
del primals_149
buf225 = extern_kernels.convolution(buf222, buf223, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None)
assert_size_stride(buf225, (128, 336, 1, 1), (336, 1, 1, 1))
buf226 = buf225; del buf225 # reuse
triton_poi_fused__to_copy_convolution_87.run(buf226, buf224, 43008, grid=grid(43008), stream=stream0)
buf227 = empty_strided((128, 336, 28, 28), (263424, 784, 28, 1), device='cuda', dtype=torch.float16)
triton_poi_fused_mul_sigmoid_silu_88.run(buf215, buf226, buf227, 33718272, grid=grid(33718272), stream=stream0)
buf228 = buf205; del buf205 # reuse
triton_poi_fused_split_with_sizes_89.run(buf227, buf228, 16859136, grid=grid(16859136), stream=stream0)
buf229 = buf202; del buf202 # reuse
triton_poi_fused_split_with_sizes_90.run(buf227, buf229, 16859136, grid=grid(16859136), stream=stream0)
buf230 = empty_strided((28, 168, 1, 1), (168, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_91.run(primals_150, buf230, 4704, grid=grid(4704), stream=stream0)
del primals_150
buf231 = extern_kernels.convolution(buf228, buf230, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None)
assert_size_stride(buf231, (128, 28, 28, 28), (21952, 784, 28, 1))
buf232 = empty_strided((28, 168, 1, 1), (168, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_91.run(primals_151, buf232, 4704, grid=grid(4704), stream=stream0)
del primals_151
buf233 = extern_kernels.convolution(buf229, buf232, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None)
assert_size_stride(buf233, (128, 28, 28, 28), (21952, 784, 28, 1))
buf236 = empty_strided((128, 56, 28, 28), (43904, 784, 28, 1), device='cuda', dtype=torch.float16)
buf234 = as_strided(buf236, (128, 28, 28, 28), (43904, 784, 28, 1)) # alias
triton_poi_fused_cat_92.run(buf231, buf234, 2809856, grid=grid(2809856), stream=stream0)
del buf231
buf235 = as_strided(buf236, (128, 28, 28, 28), (43904, 784, 28, 1), 21952) # alias
triton_poi_fused_cat_92.run(buf233, buf235, 2809856, grid=grid(2809856), stream=stream0)
del buf233
buf237 = buf180; del buf180 # reuse
triton_red_fused__native_batch_norm_legit_functional_68.run(buf236, buf237, 728, 7720, grid=grid(728), stream=stream0)
buf238 = buf181; del buf181 # reuse
buf239 = buf238; del buf238 # reuse
buf243 = empty_strided((56, ), (1, ), device='cuda', dtype=torch.float32)
triton_per_fused__native_batch_norm_legit_functional_69.run(buf239, buf237, primals_349, buf243, 56, 13, grid=grid(56), stream=stream0)
del primals_349
buf240 = buf237; del buf237 # reuse
triton_red_fused__native_batch_norm_legit_functional_70.run(buf236, buf239, buf240, 728, 7720, grid=grid(728), stream=stream0)
buf241 = empty_strided((1, 56, 1, 1), (56, 1, 56, 56), device='cuda', dtype=torch.float32)
buf242 = empty_strided((56, ), (1, ), device='cuda', dtype=torch.float32)
buf244 = empty_strided((56, ), (1, ), device='cuda', dtype=torch.float32)
triton_per_fused__native_batch_norm_legit_functional_71.run(buf240, primals_350, buf241, buf242, buf244, 56, 13, grid=grid(56), stream=stream0)
del primals_350
buf245 = empty_strided((128, 56, 28, 28), (43904, 784, 28, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__native_batch_norm_legit_functional_add_93.run(buf236, buf239, buf241, primals_29, primals_30, buf185, buf245, 5619712, grid=grid(5619712), stream=stream0)
del primals_30
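# Next block of the stage (a MixNet-style pattern, hedged reading): grouped
# pointwise expansion, where each 28-channel half of buf245 runs through its
# own 1x1 conv to 168 channels and the halves are concatenated into the
# 336-channel buf252.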
buf246 = empty_strided((168, 28, 1, 1), (28, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_73.run(primals_152, buf246, 4704, grid=grid(4704), stream=stream0)
del primals_152
buf247 = extern_kernels.convolution(as_strided(buf245, (128, 28, 28, 28), (43904, 784, 28, 1)), buf246, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None)
assert_size_stride(buf247, (128, 168, 28, 28), (131712, 784, 28, 1))
buf248 = empty_strided((168, 28, 1, 1), (28, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_73.run(primals_153, buf248, 4704, grid=grid(4704), stream=stream0)
del primals_153
buf249 = extern_kernels.convolution(as_strided(buf245, (128, 28, 28, 28), (43904, 784, 28, 1), 21952), buf248, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None)
assert_size_stride(buf249, (128, 168, 28, 28), (131712, 784, 28, 1))
buf252 = buf227; del buf227 # reuse
buf250 = as_strided(buf252, (128, 168, 28, 28), (263424, 784, 28, 1)) # alias
triton_poi_fused_cat_74.run(buf247, buf250, 16859136, grid=grid(16859136), stream=stream0)
buf251 = as_strided(buf252, (128, 168, 28, 28), (263424, 784, 28, 1), 131712) # alias
triton_poi_fused_cat_74.run(buf249, buf251, 16859136, grid=grid(16859136), stream=stream0)
buf253 = buf211; del buf211 # reuse
buf254 = buf253; del buf253 # reuse
buf257 = empty_strided((336, ), (1, ), device='cuda', dtype=torch.float32)
buf255 = empty_strided((1, 336, 1, 1), (336, 1, 336, 336), device='cuda', dtype=torch.float32)
buf256 = empty_strided((336, ), (1, ), device='cuda', dtype=torch.float32)
buf258 = empty_strided((336, ), (1, ), device='cuda', dtype=torch.float32)
triton_red_fused__native_batch_norm_legit_functional_75.run(buf254, buf252, primals_352, primals_353, buf257, buf255, buf256, buf258, 336, 100352, grid=grid(336), stream=stream0)
del primals_352
del primals_353
buf259 = empty_strided((128, 336, 28, 28), (263424, 784, 28, 1), device='cuda', dtype=torch.float16)
buf1148 = empty_strided((128, 336, 28, 28), (263424, 784, 28, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__native_batch_norm_legit_functional_add_clone_fill_mul_sigmoid_sub_76.run(buf252, buf254, buf255, primals_31, primals_32, buf259, buf1148, 33718272, grid=grid(33718272), stream=stream0)
del primals_32
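# Mixed depthwise convolution: the 336 channels are split in two, with a 3x3
# depthwise conv over the first 168 channels and a 5x5 over the second (both
# groups=168), then re-concatenated. The buf1148 clone written above appears
# to be stashed for the backward pass.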
buf260 = empty_strided((168, 1, 3, 3), (9, 9, 3, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_77.run(primals_154, buf260, 1512, grid=grid(1512), stream=stream0)
del primals_154
buf261 = buf249; del buf249 # reuse
triton_poi_fused_split_with_sizes_78.run(buf259, buf261, 16859136, grid=grid(16859136), stream=stream0)
buf262 = extern_kernels.convolution(buf261, buf260, stride=(1, 1), padding=(1, 1), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=168, bias=None)
assert_size_stride(buf262, (128, 168, 28, 28), (131712, 784, 28, 1))
buf263 = empty_strided((168, 1, 5, 5), (25, 25, 5, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_79.run(primals_155, buf263, 4200, grid=grid(4200), stream=stream0)
del primals_155
buf264 = buf247; del buf247 # reuse
triton_poi_fused_split_with_sizes_80.run(buf259, buf264, 16859136, grid=grid(16859136), stream=stream0)
buf265 = extern_kernels.convolution(buf264, buf263, stride=(1, 1), padding=(2, 2), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=168, bias=None)
assert_size_stride(buf265, (128, 168, 28, 28), (131712, 784, 28, 1))
buf268 = buf259; del buf259 # reuse
buf266 = as_strided(buf268, (128, 168, 28, 28), (263424, 784, 28, 1)) # alias
triton_poi_fused_cat_74.run(buf262, buf266, 16859136, grid=grid(16859136), stream=stream0)
buf267 = as_strided(buf268, (128, 168, 28, 28), (263424, 784, 28, 1), 131712) # alias
triton_poi_fused_cat_74.run(buf265, buf267, 16859136, grid=grid(16859136), stream=stream0)
buf269 = buf255; del buf255 # reuse
buf270 = buf269; del buf269 # reuse
buf273 = empty_strided((336, ), (1, ), device='cuda', dtype=torch.float32)
buf271 = empty_strided((1, 336, 1, 1), (336, 1, 336, 336), device='cuda', dtype=torch.float32)
buf272 = empty_strided((336, ), (1, ), device='cuda', dtype=torch.float32)
buf274 = empty_strided((336, ), (1, ), device='cuda', dtype=torch.float32)
triton_red_fused__native_batch_norm_legit_functional_75.run(buf270, buf268, primals_355, primals_356, buf273, buf271, buf272, buf274, 336, 100352, grid=grid(336), stream=stream0)
del primals_355
del primals_356
buf275 = empty_strided((128, 336, 28, 28), (263424, 784, 28, 1), device='cuda', dtype=torch.float16)
buf277 = empty_strided((128, 336, 1, 1), (336, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_per_fused__native_batch_norm_legit_functional_mean_silu_81.run(buf268, buf270, buf271, primals_33, primals_34, buf275, buf277, 43008, 784, grid=grid(43008), stream=stream0)
del primals_34
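# Squeeze-and-excitation: buf277 is the global average pool of the activated
# features; a 336 -> 28 1x1 conv + bias + SiLU (buf282) followed by a
# 28 -> 336 1x1 conv + bias (buf286) produces the excitation used by the
# gating kernel further down. Roughly, in eager terms (hypothetical names):
#   s = x.mean(dim=(2, 3), keepdim=True)
#   x = x * torch.sigmoid(conv_expand(F.silu(conv_reduce(s))))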
buf278 = empty_strided((28, 336, 1, 1), (336, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_82.run(primals_156, buf278, 9408, grid=grid(9408), stream=stream0)
del primals_156
buf279 = buf219; del buf219 # reuse
triton_poi_fused__to_copy_convolution_83.run(primals_157, buf279, 28, grid=grid(28), stream=stream0)
del primals_157
buf280 = extern_kernels.convolution(buf277, buf278, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None)
assert_size_stride(buf280, (128, 28, 1, 1), (28, 1, 1, 1))
buf281 = buf280; del buf280 # reuse
buf282 = empty_strided((128, 28, 1, 1), (28, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_convolution_silu_84.run(buf281, buf279, buf282, 3584, grid=grid(3584), stream=stream0)
buf283 = empty_strided((336, 28, 1, 1), (28, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_85.run(primals_158, buf283, 9408, grid=grid(9408), stream=stream0)
del primals_158
buf284 = buf224; del buf224 # reuse
triton_poi_fused__to_copy_convolution_86.run(primals_159, buf284, 336, grid=grid(336), stream=stream0)
del primals_159
buf285 = extern_kernels.convolution(buf282, buf283, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None)
assert_size_stride(buf285, (128, 336, 1, 1), (336, 1, 1, 1))
buf286 = buf285; del buf285 # reuse
triton_poi_fused__to_copy_convolution_87.run(buf286, buf284, 43008, grid=grid(43008), stream=stream0)
buf287 = empty_strided((128, 336, 28, 28), (263424, 784, 28, 1), device='cuda', dtype=torch.float16)
triton_poi_fused_mul_sigmoid_silu_88.run(buf275, buf286, buf287, 33718272, grid=grid(33718272), stream=stream0)
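# The projection below closes out this block (residual into buf305); the next
# 28x28 block (primals_162 onward) repeats the same expand / mixed-depthwise /
# squeeze-excite / project sequence, ending in buf365.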
buf288 = buf265; del buf265 # reuse
triton_poi_fused_split_with_sizes_89.run(buf287, buf288, 16859136, grid=grid(16859136), stream=stream0)
buf289 = buf262; del buf262 # reuse
triton_poi_fused_split_with_sizes_90.run(buf287, buf289, 16859136, grid=grid(16859136), stream=stream0)
buf290 = empty_strided((28, 168, 1, 1), (168, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_91.run(primals_160, buf290, 4704, grid=grid(4704), stream=stream0)
del primals_160
buf291 = extern_kernels.convolution(buf288, buf290, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None)
assert_size_stride(buf291, (128, 28, 28, 28), (21952, 784, 28, 1))
buf292 = empty_strided((28, 168, 1, 1), (168, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_91.run(primals_161, buf292, 4704, grid=grid(4704), stream=stream0)
del primals_161
buf293 = extern_kernels.convolution(buf289, buf292, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None)
assert_size_stride(buf293, (128, 28, 28, 28), (21952, 784, 28, 1))
buf296 = empty_strided((128, 56, 28, 28), (43904, 784, 28, 1), device='cuda', dtype=torch.float16)
buf294 = as_strided(buf296, (128, 28, 28, 28), (43904, 784, 28, 1)) # alias
triton_poi_fused_cat_92.run(buf291, buf294, 2809856, grid=grid(2809856), stream=stream0)
del buf291
buf295 = as_strided(buf296, (128, 28, 28, 28), (43904, 784, 28, 1), 21952) # alias
triton_poi_fused_cat_92.run(buf293, buf295, 2809856, grid=grid(2809856), stream=stream0)
del buf293
buf297 = buf240; del buf240 # reuse
triton_red_fused__native_batch_norm_legit_functional_68.run(buf296, buf297, 728, 7720, grid=grid(728), stream=stream0)
buf298 = buf241; del buf241 # reuse
buf299 = buf298; del buf298 # reuse
buf303 = empty_strided((56, ), (1, ), device='cuda', dtype=torch.float32)
triton_per_fused__native_batch_norm_legit_functional_69.run(buf299, buf297, primals_358, buf303, 56, 13, grid=grid(56), stream=stream0)
del primals_358
buf300 = buf297; del buf297 # reuse
triton_red_fused__native_batch_norm_legit_functional_70.run(buf296, buf299, buf300, 728, 7720, grid=grid(728), stream=stream0)
buf301 = empty_strided((1, 56, 1, 1), (56, 1, 56, 56), device='cuda', dtype=torch.float32)
buf302 = empty_strided((56, ), (1, ), device='cuda', dtype=torch.float32)
buf304 = empty_strided((56, ), (1, ), device='cuda', dtype=torch.float32)
triton_per_fused__native_batch_norm_legit_functional_71.run(buf300, primals_359, buf301, buf302, buf304, 56, 13, grid=grid(56), stream=stream0)
del primals_359
buf305 = empty_strided((128, 56, 28, 28), (43904, 784, 28, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__native_batch_norm_legit_functional_add_93.run(buf296, buf299, buf301, primals_35, primals_36, buf245, buf305, 5619712, grid=grid(5619712), stream=stream0)
del primals_36
buf306 = empty_strided((168, 28, 1, 1), (28, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_73.run(primals_162, buf306, 4704, grid=grid(4704), stream=stream0)
del primals_162
buf307 = extern_kernels.convolution(as_strided(buf305, (128, 28, 28, 28), (43904, 784, 28, 1)), buf306, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None)
assert_size_stride(buf307, (128, 168, 28, 28), (131712, 784, 28, 1))
buf308 = empty_strided((168, 28, 1, 1), (28, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_73.run(primals_163, buf308, 4704, grid=grid(4704), stream=stream0)
del primals_163
buf309 = extern_kernels.convolution(as_strided(buf305, (128, 28, 28, 28), (43904, 784, 28, 1), 21952), buf308, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None)
assert_size_stride(buf309, (128, 168, 28, 28), (131712, 784, 28, 1))
buf312 = buf287; del buf287 # reuse
buf310 = as_strided(buf312, (128, 168, 28, 28), (263424, 784, 28, 1)) # alias
triton_poi_fused_cat_74.run(buf307, buf310, 16859136, grid=grid(16859136), stream=stream0)
buf311 = as_strided(buf312, (128, 168, 28, 28), (263424, 784, 28, 1), 131712) # alias
triton_poi_fused_cat_74.run(buf309, buf311, 16859136, grid=grid(16859136), stream=stream0)
buf313 = buf271; del buf271 # reuse
buf314 = buf313; del buf313 # reuse
buf317 = empty_strided((336, ), (1, ), device='cuda', dtype=torch.float32)
buf315 = empty_strided((1, 336, 1, 1), (336, 1, 336, 336), device='cuda', dtype=torch.float32)
buf316 = empty_strided((336, ), (1, ), device='cuda', dtype=torch.float32)
buf318 = empty_strided((336, ), (1, ), device='cuda', dtype=torch.float32)
triton_red_fused__native_batch_norm_legit_functional_75.run(buf314, buf312, primals_361, primals_362, buf317, buf315, buf316, buf318, 336, 100352, grid=grid(336), stream=stream0)
del primals_361
del primals_362
buf319 = empty_strided((128, 336, 28, 28), (263424, 784, 28, 1), device='cuda', dtype=torch.float16)
buf1146 = empty_strided((128, 336, 28, 28), (263424, 784, 28, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__native_batch_norm_legit_functional_add_clone_fill_mul_sigmoid_sub_76.run(buf312, buf314, buf315, primals_37, primals_38, buf319, buf1146, 33718272, grid=grid(33718272), stream=stream0)
del primals_38
buf320 = empty_strided((168, 1, 3, 3), (9, 9, 3, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_77.run(primals_164, buf320, 1512, grid=grid(1512), stream=stream0)
del primals_164
buf321 = buf309; del buf309 # reuse
triton_poi_fused_split_with_sizes_78.run(buf319, buf321, 16859136, grid=grid(16859136), stream=stream0)
buf322 = extern_kernels.convolution(buf321, buf320, stride=(1, 1), padding=(1, 1), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=168, bias=None)
assert_size_stride(buf322, (128, 168, 28, 28), (131712, 784, 28, 1))
buf323 = empty_strided((168, 1, 5, 5), (25, 25, 5, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_79.run(primals_165, buf323, 4200, grid=grid(4200), stream=stream0)
del primals_165
buf324 = buf307; del buf307 # reuse
triton_poi_fused_split_with_sizes_80.run(buf319, buf324, 16859136, grid=grid(16859136), stream=stream0)
buf325 = extern_kernels.convolution(buf324, buf323, stride=(1, 1), padding=(2, 2), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=168, bias=None)
assert_size_stride(buf325, (128, 168, 28, 28), (131712, 784, 28, 1))
buf328 = buf319; del buf319 # reuse
buf326 = as_strided(buf328, (128, 168, 28, 28), (263424, 784, 28, 1)) # alias
triton_poi_fused_cat_74.run(buf322, buf326, 16859136, grid=grid(16859136), stream=stream0)
buf327 = as_strided(buf328, (128, 168, 28, 28), (263424, 784, 28, 1), 131712) # alias
triton_poi_fused_cat_74.run(buf325, buf327, 16859136, grid=grid(16859136), stream=stream0)
buf329 = buf315; del buf315 # reuse
buf330 = buf329; del buf329 # reuse
buf333 = empty_strided((336, ), (1, ), device='cuda', dtype=torch.float32)
buf331 = empty_strided((1, 336, 1, 1), (336, 1, 336, 336), device='cuda', dtype=torch.float32)
buf332 = empty_strided((336, ), (1, ), device='cuda', dtype=torch.float32)
buf334 = empty_strided((336, ), (1, ), device='cuda', dtype=torch.float32)
triton_red_fused__native_batch_norm_legit_functional_75.run(buf330, buf328, primals_364, primals_365, buf333, buf331, buf332, buf334, 336, 100352, grid=grid(336), stream=stream0)
del primals_364
del primals_365
buf335 = empty_strided((128, 336, 28, 28), (263424, 784, 28, 1), device='cuda', dtype=torch.float16)
buf337 = empty_strided((128, 336, 1, 1), (336, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_per_fused__native_batch_norm_legit_functional_mean_silu_81.run(buf328, buf330, buf331, primals_39, primals_40, buf335, buf337, 43008, 784, grid=grid(43008), stream=stream0)
del primals_40
buf338 = empty_strided((28, 336, 1, 1), (336, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_82.run(primals_166, buf338, 9408, grid=grid(9408), stream=stream0)
del primals_166
buf339 = buf279; del buf279 # reuse
triton_poi_fused__to_copy_convolution_83.run(primals_167, buf339, 28, grid=grid(28), stream=stream0)
del primals_167
buf340 = extern_kernels.convolution(buf337, buf338, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None)
assert_size_stride(buf340, (128, 28, 1, 1), (28, 1, 1, 1))
buf341 = buf340; del buf340 # reuse
buf342 = empty_strided((128, 28, 1, 1), (28, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_convolution_silu_84.run(buf341, buf339, buf342, 3584, grid=grid(3584), stream=stream0)
del buf339
buf343 = empty_strided((336, 28, 1, 1), (28, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_85.run(primals_168, buf343, 9408, grid=grid(9408), stream=stream0)
del primals_168
buf344 = buf284; del buf284 # reuse
triton_poi_fused__to_copy_convolution_86.run(primals_169, buf344, 336, grid=grid(336), stream=stream0)
del primals_169
buf345 = extern_kernels.convolution(buf342, buf343, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None)
assert_size_stride(buf345, (128, 336, 1, 1), (336, 1, 1, 1))
buf346 = buf345; del buf345 # reuse
triton_poi_fused__to_copy_convolution_87.run(buf346, buf344, 43008, grid=grid(43008), stream=stream0)
buf347 = empty_strided((128, 336, 28, 28), (263424, 784, 28, 1), device='cuda', dtype=torch.float16)
triton_poi_fused_mul_sigmoid_silu_88.run(buf335, buf346, buf347, 33718272, grid=grid(33718272), stream=stream0)
buf348 = buf325; del buf325 # reuse
triton_poi_fused_split_with_sizes_89.run(buf347, buf348, 16859136, grid=grid(16859136), stream=stream0)
buf349 = buf322; del buf322 # reuse
triton_poi_fused_split_with_sizes_90.run(buf347, buf349, 16859136, grid=grid(16859136), stream=stream0)
buf350 = empty_strided((28, 168, 1, 1), (168, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_91.run(primals_170, buf350, 4704, grid=grid(4704), stream=stream0)
del primals_170
buf351 = extern_kernels.convolution(buf348, buf350, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None)
assert_size_stride(buf351, (128, 28, 28, 28), (21952, 784, 28, 1))
buf352 = empty_strided((28, 168, 1, 1), (168, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_91.run(primals_171, buf352, 4704, grid=grid(4704), stream=stream0)
del primals_171
buf353 = extern_kernels.convolution(buf349, buf352, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None)
assert_size_stride(buf353, (128, 28, 28, 28), (21952, 784, 28, 1))
buf356 = empty_strided((128, 56, 28, 28), (43904, 784, 28, 1), device='cuda', dtype=torch.float16)
buf354 = as_strided(buf356, (128, 28, 28, 28), (43904, 784, 28, 1)) # alias
triton_poi_fused_cat_92.run(buf351, buf354, 2809856, grid=grid(2809856), stream=stream0)
del buf351
buf355 = as_strided(buf356, (128, 28, 28, 28), (43904, 784, 28, 1), 21952) # alias
triton_poi_fused_cat_92.run(buf353, buf355, 2809856, grid=grid(2809856), stream=stream0)
del buf353
buf357 = buf300; del buf300 # reuse
triton_red_fused__native_batch_norm_legit_functional_68.run(buf356, buf357, 728, 7720, grid=grid(728), stream=stream0)
buf358 = buf301; del buf301 # reuse
buf359 = buf358; del buf358 # reuse
buf363 = empty_strided((56, ), (1, ), device='cuda', dtype=torch.float32)
triton_per_fused__native_batch_norm_legit_functional_69.run(buf359, buf357, primals_367, buf363, 56, 13, grid=grid(56), stream=stream0)
del primals_367
buf360 = buf357; del buf357 # reuse
triton_red_fused__native_batch_norm_legit_functional_70.run(buf356, buf359, buf360, 728, 7720, grid=grid(728), stream=stream0)
buf361 = empty_strided((1, 56, 1, 1), (56, 1, 56, 56), device='cuda', dtype=torch.float32)
buf362 = empty_strided((56, ), (1, ), device='cuda', dtype=torch.float32)
buf364 = empty_strided((56, ), (1, ), device='cuda', dtype=torch.float32)
triton_per_fused__native_batch_norm_legit_functional_71.run(buf360, primals_368, buf361, buf362, buf364, 56, 13, grid=grid(56), stream=stream0)
del buf360
del primals_368
buf365 = empty_strided((128, 56, 28, 28), (43904, 784, 28, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__native_batch_norm_legit_functional_add_93.run(buf356, buf359, buf361, primals_41, primals_42, buf305, buf365, 5619712, grid=grid(5619712), stream=stream0)
del buf361
del primals_42
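# Stage transition: a full (non-split) 1x1 conv expands the 56-channel trunk
# to 336 channels ahead of the stride-2 downsample below.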
buf366 = empty_strided((336, 56, 1, 1), (56, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_94.run(primals_172, buf366, 18816, grid=grid(18816), stream=stream0)
del primals_172
buf367 = extern_kernels.convolution(buf365, buf366, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None)
assert_size_stride(buf367, (128, 336, 28, 28), (263424, 784, 28, 1))
buf368 = buf331; del buf331 # reuse
buf369 = buf368; del buf368 # reuse
buf372 = empty_strided((336, ), (1, ), device='cuda', dtype=torch.float32)
buf370 = empty_strided((1, 336, 1, 1), (336, 1, 336, 336), device='cuda', dtype=torch.float32)
buf371 = empty_strided((336, ), (1, ), device='cuda', dtype=torch.float32)
buf373 = empty_strided((336, ), (1, ), device='cuda', dtype=torch.float32)
triton_red_fused__native_batch_norm_legit_functional_75.run(buf369, buf367, primals_370, primals_371, buf372, buf370, buf371, buf373, 336, 100352, grid=grid(336), stream=stream0)
del primals_370
del primals_371
buf374 = buf347; del buf347 # reuse
buf1144 = empty_strided((128, 336, 28, 28), (263424, 784, 28, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__native_batch_norm_legit_functional_add_clone_fill_mul_sigmoid_sub_76.run(buf367, buf369, buf370, primals_43, primals_44, buf374, buf1144, 33718272, grid=grid(33718272), stream=stream0)
del primals_44
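# Stride-2 mixed depthwise downsample: the 336 channels are split into three
# 112-channel groups processed by 3x3, 5x5 and 7x7 depthwise convs with
# stride=(2, 2), taking the feature map from 28x28 to 14x14. Roughly
# (hypothetical names):
#   xs = x.split(112, dim=1)
#   out = torch.cat([dw3(xs[0]), dw5(xs[1]), dw7(xs[2])], dim=1)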
buf375 = empty_strided((112, 1, 3, 3), (9, 9, 3, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_95.run(primals_173, buf375, 1008, grid=grid(1008), stream=stream0)
del primals_173
buf376 = empty_strided((128, 112, 28, 28), (87808, 784, 28, 1), device='cuda', dtype=torch.float16)
triton_poi_fused_split_with_sizes_96.run(buf374, buf376, 11239424, grid=grid(11239424), stream=stream0)
buf377 = extern_kernels.convolution(buf376, buf375, stride=(2, 2), padding=(1, 1), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=112, bias=None)
assert_size_stride(buf377, (128, 112, 14, 14), (21952, 196, 14, 1))
buf378 = empty_strided((112, 1, 5, 5), (25, 25, 5, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_97.run(primals_174, buf378, 2800, grid=grid(2800), stream=stream0)
del primals_174
buf379 = empty_strided((128, 112, 28, 28), (87808, 784, 28, 1), device='cuda', dtype=torch.float16)
triton_poi_fused_split_with_sizes_98.run(buf374, buf379, 11239424, grid=grid(11239424), stream=stream0)
buf380 = extern_kernels.convolution(buf379, buf378, stride=(2, 2), padding=(2, 2), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=112, bias=None)
assert_size_stride(buf380, (128, 112, 14, 14), (21952, 196, 14, 1))
buf381 = empty_strided((112, 1, 7, 7), (49, 49, 7, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_99.run(primals_175, buf381, 5488, grid=grid(5488), stream=stream0)
del primals_175
buf382 = empty_strided((128, 112, 28, 28), (87808, 784, 28, 1), device='cuda', dtype=torch.float16)
triton_poi_fused_split_with_sizes_100.run(buf374, buf382, 11239424, grid=grid(11239424), stream=stream0)
del buf374
buf383 = extern_kernels.convolution(buf382, buf381, stride=(2, 2), padding=(3, 3), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=112, bias=None)
assert_size_stride(buf383, (128, 112, 14, 14), (21952, 196, 14, 1))
buf387 = empty_strided((128, 336, 14, 14), (65856, 196, 14, 1), device='cuda', dtype=torch.float16)
buf384 = as_strided(buf387, (128, 112, 14, 14), (65856, 196, 14, 1)) # alias
triton_poi_fused_cat_101.run(buf377, buf384, 2809856, grid=grid(2809856), stream=stream0)
del buf377
buf385 = as_strided(buf387, (128, 112, 14, 14), (65856, 196, 14, 1), 21952) # alias
triton_poi_fused_cat_101.run(buf380, buf385, 2809856, grid=grid(2809856), stream=stream0)
del buf380
buf386 = as_strided(buf387, (128, 112, 14, 14), (65856, 196, 14, 1), 43904) # alias
triton_poi_fused_cat_101.run(buf383, buf386, 2809856, grid=grid(2809856), stream=stream0)
del buf383
buf388 = buf370; del buf370 # reuse
buf389 = buf388; del buf388 # reuse
buf392 = empty_strided((336, ), (1, ), device='cuda', dtype=torch.float32)
buf390 = empty_strided((1, 336, 1, 1), (336, 1, 336, 336), device='cuda', dtype=torch.float32)
buf391 = empty_strided((336, ), (1, ), device='cuda', dtype=torch.float32)
buf393 = empty_strided((336, ), (1, ), device='cuda', dtype=torch.float32)
triton_red_fused__native_batch_norm_legit_functional_102.run(buf389, buf387, primals_373, primals_374, buf392, buf390, buf391, buf393, 336, 25088, grid=grid(336), stream=stream0)
del primals_373
del primals_374
buf394 = empty_strided((128, 336, 14, 14), (65856, 196, 14, 1), device='cuda', dtype=torch.float16)
buf396 = empty_strided((128, 336, 1, 1), (336, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_per_fused__native_batch_norm_legit_functional_mean_silu_103.run(buf387, buf389, buf390, primals_45, primals_46, buf394, buf396, 43008, 196, grid=grid(43008), stream=stream0)
del buf390
del primals_46
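# Squeeze-excite for the downsample block, with a tighter reduction
# (336 -> 14 -> 336) than the 336 -> 28 -> 336 used in the earlier blocks.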
buf397 = empty_strided((14, 336, 1, 1), (336, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_104.run(primals_176, buf397, 4704, grid=grid(4704), stream=stream0)
del primals_176
buf398 = empty_strided((14, ), (1, ), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_convolution_105.run(primals_177, buf398, 14, grid=grid(14), stream=stream0)
del primals_177
buf399 = extern_kernels.convolution(buf396, buf397, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None)
assert_size_stride(buf399, (128, 14, 1, 1), (14, 1, 1, 1))
buf400 = buf399; del buf399 # reuse
buf401 = empty_strided((128, 14, 1, 1), (14, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_convolution_silu_106.run(buf400, buf398, buf401, 1792, grid=grid(1792), stream=stream0)
del buf398
buf402 = empty_strided((336, 14, 1, 1), (14, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_107.run(primals_178, buf402, 4704, grid=grid(4704), stream=stream0)
del primals_178
buf403 = buf344; del buf344 # reuse
triton_poi_fused__to_copy_convolution_86.run(primals_179, buf403, 336, grid=grid(336), stream=stream0)
del primals_179
buf404 = extern_kernels.convolution(buf401, buf402, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None)
assert_size_stride(buf404, (128, 336, 1, 1), (336, 1, 1, 1))
buf405 = buf404; del buf404 # reuse
triton_poi_fused__to_copy_convolution_87.run(buf405, buf403, 43008, grid=grid(43008), stream=stream0)
del buf403
buf406 = empty_strided((128, 336, 14, 14), (65856, 196, 14, 1), device='cuda', dtype=torch.float16)
triton_poi_fused_mul_sigmoid_silu_108.run(buf394, buf405, buf406, 8429568, grid=grid(8429568), stream=stream0)
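# Pointwise projection 336 -> 104 with plain batch norm; no residual is added
# here, consistent with the spatial resolution having just changed.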
buf407 = empty_strided((104, 336, 1, 1), (336, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_109.run(primals_180, buf407, 34944, grid=grid(34944), stream=stream0)
del primals_180
buf408 = extern_kernels.convolution(buf406, buf407, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None)
assert_size_stride(buf408, (128, 104, 14, 14), (20384, 196, 14, 1))
buf409 = empty_strided((1, 104, 1, 1, 4), (416, 1, 416, 416, 104), device='cuda', dtype=torch.float32)
triton_red_fused__native_batch_norm_legit_functional_110.run(buf408, buf409, 416, 6272, grid=grid(416), stream=stream0)
buf410 = empty_strided((1, 104, 1, 1), (104, 1, 104, 104), device='cuda', dtype=torch.float32)
buf411 = buf410; del buf410 # reuse
buf415 = empty_strided((104, ), (1, ), device='cuda', dtype=torch.float32)
triton_per_fused__native_batch_norm_legit_functional_111.run(buf411, buf409, primals_376, buf415, 104, 4, grid=grid(104), stream=stream0)
del primals_376
buf412 = buf409; del buf409 # reuse
triton_red_fused__native_batch_norm_legit_functional_112.run(buf408, buf411, buf412, 416, 6272, grid=grid(416), stream=stream0)
buf413 = empty_strided((1, 104, 1, 1), (104, 1, 104, 104), device='cuda', dtype=torch.float32)
buf414 = empty_strided((104, ), (1, ), device='cuda', dtype=torch.float32)
buf416 = empty_strided((104, ), (1, ), device='cuda', dtype=torch.float32)
triton_per_fused__native_batch_norm_legit_functional_113.run(buf412, primals_377, buf413, buf414, buf416, 104, 4, grid=grid(104), stream=stream0)
del primals_377
buf417 = empty_strided((128, 104, 14, 14), (20384, 196, 14, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__native_batch_norm_legit_functional_114.run(buf408, buf411, buf413, primals_47, primals_48, buf417, 2609152, grid=grid(2609152), stream=stream0)
del primals_48
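# New 14x14 stage: grouped pointwise expansion of the 104-channel trunk via
# two 52 -> 312 1x1 convs whose outputs are concatenated into 624 channels.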
buf418 = empty_strided((312, 52, 1, 1), (52, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_115.run(primals_181, buf418, 16224, grid=grid(16224), stream=stream0)
del primals_181
buf419 = extern_kernels.convolution(as_strided(buf417, (128, 52, 14, 14), (20384, 196, 14, 1)), buf418, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None)
assert_size_stride(buf419, (128, 312, 14, 14), (61152, 196, 14, 1))
buf420 = empty_strided((312, 52, 1, 1), (52, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_115.run(primals_182, buf420, 16224, grid=grid(16224), stream=stream0)
del primals_182
buf421 = extern_kernels.convolution(as_strided(buf417, (128, 52, 14, 14), (20384, 196, 14, 1), 10192), buf420, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None)
assert_size_stride(buf421, (128, 312, 14, 14), (61152, 196, 14, 1))
buf424 = empty_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda', dtype=torch.float16)
buf422 = as_strided(buf424, (128, 312, 14, 14), (122304, 196, 14, 1)) # alias
triton_poi_fused_cat_116.run(buf419, buf422, 7827456, grid=grid(7827456), stream=stream0)
buf423 = as_strided(buf424, (128, 312, 14, 14), (122304, 196, 14, 1), 61152) # alias
triton_poi_fused_cat_116.run(buf421, buf423, 7827456, grid=grid(7827456), stream=stream0)
buf425 = empty_strided((1, 624, 1, 1), (624, 1, 624, 624), device='cuda', dtype=torch.float32)
buf426 = buf425; del buf425 # reuse
buf429 = empty_strided((624, ), (1, ), device='cuda', dtype=torch.float32)
buf427 = empty_strided((1, 624, 1, 1), (624, 1, 624, 624), device='cuda', dtype=torch.float32)
buf428 = empty_strided((624, ), (1, ), device='cuda', dtype=torch.float32)
buf430 = empty_strided((624, ), (1, ), device='cuda', dtype=torch.float32)
triton_red_fused__native_batch_norm_legit_functional_117.run(buf426, buf424, primals_379, primals_380, buf429, buf427, buf428, buf430, 624, 25088, grid=grid(624), stream=stream0)
del primals_379
del primals_380
buf431 = empty_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda', dtype=torch.float16)
buf1142 = empty_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__native_batch_norm_legit_functional_add_clone_fill_mul_sigmoid_sub_118.run(buf424, buf426, buf427, primals_49, primals_50, buf431, buf1142, 15654912, grid=grid(15654912), stream=stream0)
del primals_50
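# Four-way mixed depthwise convolution: the 624 channels are split into four
# 156-channel groups with 3x3, 5x5, 7x7 and 9x9 depthwise kernels (all
# groups=156), then re-concatenated into buf448.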
buf432 = empty_strided((156, 1, 3, 3), (9, 9, 3, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_119.run(primals_183, buf432, 1404, grid=grid(1404), stream=stream0)
del primals_183
buf433 = empty_strided((128, 156, 14, 14), (30576, 196, 14, 1), device='cuda', dtype=torch.float16)
triton_poi_fused_split_with_sizes_120.run(buf431, buf433, 3913728, grid=grid(3913728), stream=stream0)
buf434 = extern_kernels.convolution(buf433, buf432, stride=(1, 1), padding=(1, 1), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=156, bias=None)
assert_size_stride(buf434, (128, 156, 14, 14), (30576, 196, 14, 1))
buf435 = empty_strided((156, 1, 5, 5), (25, 25, 5, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_121.run(primals_184, buf435, 3900, grid=grid(3900), stream=stream0)
del primals_184
buf436 = empty_strided((128, 156, 14, 14), (30576, 196, 14, 1), device='cuda', dtype=torch.float16)
triton_poi_fused_split_with_sizes_122.run(buf431, buf436, 3913728, grid=grid(3913728), stream=stream0)
buf437 = extern_kernels.convolution(buf436, buf435, stride=(1, 1), padding=(2, 2), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=156, bias=None)
assert_size_stride(buf437, (128, 156, 14, 14), (30576, 196, 14, 1))
buf438 = empty_strided((156, 1, 7, 7), (49, 49, 7, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_123.run(primals_185, buf438, 7644, grid=grid(7644), stream=stream0)
del primals_185
buf439 = empty_strided((128, 156, 14, 14), (30576, 196, 14, 1), device='cuda', dtype=torch.float16)
triton_poi_fused_split_with_sizes_124.run(buf431, buf439, 3913728, grid=grid(3913728), stream=stream0)
buf440 = extern_kernels.convolution(buf439, buf438, stride=(1, 1), padding=(3, 3), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=156, bias=None)
assert_size_stride(buf440, (128, 156, 14, 14), (30576, 196, 14, 1))
buf441 = empty_strided((156, 1, 9, 9), (81, 81, 9, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_125.run(primals_186, buf441, 12636, grid=grid(12636), stream=stream0)
del primals_186
buf442 = empty_strided((128, 156, 14, 14), (30576, 196, 14, 1), device='cuda', dtype=torch.float16)
triton_poi_fused_split_with_sizes_126.run(buf431, buf442, 3913728, grid=grid(3913728), stream=stream0)
buf443 = extern_kernels.convolution(buf442, buf441, stride=(1, 1), padding=(4, 4), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=156, bias=None)
assert_size_stride(buf443, (128, 156, 14, 14), (30576, 196, 14, 1))
buf448 = buf431; del buf431 # reuse
buf444 = as_strided(buf448, (128, 156, 14, 14), (122304, 196, 14, 1)) # alias
triton_poi_fused_cat_127.run(buf434, buf444, 3913728, grid=grid(3913728), stream=stream0)
buf445 = as_strided(buf448, (128, 156, 14, 14), (122304, 196, 14, 1), 30576) # alias
triton_poi_fused_cat_127.run(buf437, buf445, 3913728, grid=grid(3913728), stream=stream0)
buf446 = as_strided(buf448, (128, 156, 14, 14), (122304, 196, 14, 1), 61152) # alias
triton_poi_fused_cat_127.run(buf440, buf446, 3913728, grid=grid(3913728), stream=stream0)
buf447 = as_strided(buf448, (128, 156, 14, 14), (122304, 196, 14, 1), 91728) # alias
triton_poi_fused_cat_127.run(buf443, buf447, 3913728, grid=grid(3913728), stream=stream0)
buf449 = buf427; del buf427 # reuse
buf450 = buf449; del buf449 # reuse
buf453 = empty_strided((624, ), (1, ), device='cuda', dtype=torch.float32)
buf451 = empty_strided((1, 624, 1, 1), (624, 1, 624, 624), device='cuda', dtype=torch.float32)
buf452 = empty_strided((624, ), (1, ), device='cuda', dtype=torch.float32)
buf454 = empty_strided((624, ), (1, ), device='cuda', dtype=torch.float32)
triton_red_fused__native_batch_norm_legit_functional_117.run(buf450, buf448, primals_382, primals_383, buf453, buf451, buf452, buf454, 624, 25088, grid=grid(624), stream=stream0)
del primals_382
del primals_383
buf455 = empty_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda', dtype=torch.float16)
buf457 = empty_strided((128, 624, 1, 1), (624, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_per_fused__native_batch_norm_legit_functional_mean_silu_128.run(buf448, buf450, buf451, primals_51, primals_52, buf455, buf457, 79872, 196, grid=grid(79872), stream=stream0)
del primals_52
buf458 = empty_strided((26, 624, 1, 1), (624, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_129.run(primals_187, buf458, 16224, grid=grid(16224), stream=stream0)
del primals_187
buf459 = empty_strided((26, ), (1, ), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_convolution_130.run(primals_188, buf459, 26, grid=grid(26), stream=stream0)
del primals_188
buf460 = extern_kernels.convolution(buf457, buf458, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None)
assert_size_stride(buf460, (128, 26, 1, 1), (26, 1, 1, 1))
buf461 = buf460; del buf460 # reuse
buf462 = empty_strided((128, 26, 1, 1), (26, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_convolution_silu_131.run(buf461, buf459, buf462, 3328, grid=grid(3328), stream=stream0)
buf463 = empty_strided((624, 26, 1, 1), (26, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_132.run(primals_189, buf463, 16224, grid=grid(16224), stream=stream0)
del primals_189
buf464 = empty_strided((624, ), (1, ), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_convolution_133.run(primals_190, buf464, 624, grid=grid(624), stream=stream0)
del primals_190
buf465 = extern_kernels.convolution(buf462, buf463, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None)
assert_size_stride(buf465, (128, 624, 1, 1), (624, 1, 1, 1))
buf466 = buf465; del buf465 # reuse
triton_poi_fused__to_copy_convolution_134.run(buf466, buf464, 79872, grid=grid(79872), stream=stream0)
buf467 = empty_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda', dtype=torch.float16)
triton_poi_fused_mul_sigmoid_silu_135.run(buf455, buf466, buf467, 15654912, grid=grid(15654912), stream=stream0)
buf468 = buf421; del buf421 # reuse
triton_poi_fused_split_with_sizes_136.run(buf467, buf468, 7827456, grid=grid(7827456), stream=stream0)
buf469 = buf419; del buf419 # reuse
triton_poi_fused_split_with_sizes_137.run(buf467, buf469, 7827456, grid=grid(7827456), stream=stream0)
buf470 = empty_strided((52, 312, 1, 1), (312, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_138.run(primals_191, buf470, 16224, grid=grid(16224), stream=stream0)
del primals_191
buf471 = extern_kernels.convolution(buf468, buf470, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None)
assert_size_stride(buf471, (128, 52, 14, 14), (10192, 196, 14, 1))
buf472 = empty_strided((52, 312, 1, 1), (312, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_138.run(primals_192, buf472, 16224, grid=grid(16224), stream=stream0)
del primals_192
buf473 = extern_kernels.convolution(buf469, buf472, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None)
assert_size_stride(buf473, (128, 52, 14, 14), (10192, 196, 14, 1))
buf476 = empty_strided((128, 104, 14, 14), (20384, 196, 14, 1), device='cuda', dtype=torch.float16)
buf474 = as_strided(buf476, (128, 52, 14, 14), (20384, 196, 14, 1)) # alias
triton_poi_fused_cat_139.run(buf471, buf474, 1304576, grid=grid(1304576), stream=stream0)
del buf471
buf475 = as_strided(buf476, (128, 52, 14, 14), (20384, 196, 14, 1), 10192) # alias
triton_poi_fused_cat_139.run(buf473, buf475, 1304576, grid=grid(1304576), stream=stream0)
del buf473
buf477 = buf412; del buf412 # reuse
triton_red_fused__native_batch_norm_legit_functional_110.run(buf476, buf477, 416, 6272, grid=grid(416), stream=stream0)
buf478 = buf413; del buf413 # reuse
buf479 = buf478; del buf478 # reuse
buf483 = empty_strided((104, ), (1, ), device='cuda', dtype=torch.float32)
triton_per_fused__native_batch_norm_legit_functional_111.run(buf479, buf477, primals_385, buf483, 104, 4, grid=grid(104), stream=stream0)
del primals_385
buf480 = buf477; del buf477 # reuse
triton_red_fused__native_batch_norm_legit_functional_112.run(buf476, buf479, buf480, 416, 6272, grid=grid(416), stream=stream0)
buf481 = empty_strided((1, 104, 1, 1), (104, 1, 104, 104), device='cuda', dtype=torch.float32)
buf482 = empty_strided((104, ), (1, ), device='cuda', dtype=torch.float32)
buf484 = empty_strided((104, ), (1, ), device='cuda', dtype=torch.float32)
triton_per_fused__native_batch_norm_legit_functional_113.run(buf480, primals_386, buf481, buf482, buf484, 104, 4, grid=grid(104), stream=stream0)
del primals_386
buf485 = empty_strided((128, 104, 14, 14), (20384, 196, 14, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__native_batch_norm_legit_functional_add_140.run(buf476, buf479, buf481, primals_53, primals_54, buf417, buf485, 2609152, grid=grid(2609152), stream=stream0)
del primals_54
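# The 624-channel expand / four-way mixed-depthwise / squeeze-excite
# (624 -> 26 -> 624) / project pattern repeats for the remaining blocks of
# this stage, residual-adding into the 104-channel trunk
# (buf417 -> buf485 -> buf553 -> ...).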
buf486 = empty_strided((312, 52, 1, 1), (52, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_115.run(primals_193, buf486, 16224, grid=grid(16224), stream=stream0)
del primals_193
buf487 = extern_kernels.convolution(as_strided(buf485, (128, 52, 14, 14), (20384, 196, 14, 1)), buf486, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None)
assert_size_stride(buf487, (128, 312, 14, 14), (61152, 196, 14, 1))
buf488 = empty_strided((312, 52, 1, 1), (52, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_115.run(primals_194, buf488, 16224, grid=grid(16224), stream=stream0)
del primals_194
buf489 = extern_kernels.convolution(as_strided(buf485, (128, 52, 14, 14), (20384, 196, 14, 1), 10192), buf488, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None)
assert_size_stride(buf489, (128, 312, 14, 14), (61152, 196, 14, 1))
buf492 = buf467; del buf467 # reuse
buf490 = as_strided(buf492, (128, 312, 14, 14), (122304, 196, 14, 1)) # alias
triton_poi_fused_cat_116.run(buf487, buf490, 7827456, grid=grid(7827456), stream=stream0)
buf491 = as_strided(buf492, (128, 312, 14, 14), (122304, 196, 14, 1), 61152) # alias
triton_poi_fused_cat_116.run(buf489, buf491, 7827456, grid=grid(7827456), stream=stream0)
buf493 = buf451; del buf451 # reuse
buf494 = buf493; del buf493 # reuse
buf497 = empty_strided((624, ), (1, ), device='cuda', dtype=torch.float32)
buf495 = empty_strided((1, 624, 1, 1), (624, 1, 624, 624), device='cuda', dtype=torch.float32)
buf496 = empty_strided((624, ), (1, ), device='cuda', dtype=torch.float32)
buf498 = empty_strided((624, ), (1, ), device='cuda', dtype=torch.float32)
triton_red_fused__native_batch_norm_legit_functional_117.run(buf494, buf492, primals_388, primals_389, buf497, buf495, buf496, buf498, 624, 25088, grid=grid(624), stream=stream0)
del primals_388
del primals_389
buf499 = empty_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda', dtype=torch.float16)
buf1140 = empty_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__native_batch_norm_legit_functional_add_clone_fill_mul_sigmoid_sub_118.run(buf492, buf494, buf495, primals_55, primals_56, buf499, buf1140, 15654912, grid=grid(15654912), stream=stream0)
del primals_56
buf500 = empty_strided((156, 1, 3, 3), (9, 9, 3, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_119.run(primals_195, buf500, 1404, grid=grid(1404), stream=stream0)
del primals_195
buf501 = buf443; del buf443 # reuse
triton_poi_fused_split_with_sizes_120.run(buf499, buf501, 3913728, grid=grid(3913728), stream=stream0)
buf502 = extern_kernels.convolution(buf501, buf500, stride=(1, 1), padding=(1, 1), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=156, bias=None)
assert_size_stride(buf502, (128, 156, 14, 14), (30576, 196, 14, 1))
buf503 = empty_strided((156, 1, 5, 5), (25, 25, 5, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_121.run(primals_196, buf503, 3900, grid=grid(3900), stream=stream0)
del primals_196
buf504 = buf440; del buf440 # reuse
triton_poi_fused_split_with_sizes_122.run(buf499, buf504, 3913728, grid=grid(3913728), stream=stream0)
buf505 = extern_kernels.convolution(buf504, buf503, stride=(1, 1), padding=(2, 2), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=156, bias=None)
assert_size_stride(buf505, (128, 156, 14, 14), (30576, 196, 14, 1))
buf506 = empty_strided((156, 1, 7, 7), (49, 49, 7, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_123.run(primals_197, buf506, 7644, grid=grid(7644), stream=stream0)
del primals_197
buf507 = buf437; del buf437 # reuse
triton_poi_fused_split_with_sizes_124.run(buf499, buf507, 3913728, grid=grid(3913728), stream=stream0)
buf508 = extern_kernels.convolution(buf507, buf506, stride=(1, 1), padding=(3, 3), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=156, bias=None)
assert_size_stride(buf508, (128, 156, 14, 14), (30576, 196, 14, 1))
buf509 = empty_strided((156, 1, 9, 9), (81, 81, 9, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_125.run(primals_198, buf509, 12636, grid=grid(12636), stream=stream0)
del primals_198
buf510 = buf434; del buf434 # reuse
triton_poi_fused_split_with_sizes_126.run(buf499, buf510, 3913728, grid=grid(3913728), stream=stream0)
buf511 = extern_kernels.convolution(buf510, buf509, stride=(1, 1), padding=(4, 4), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=156, bias=None)
assert_size_stride(buf511, (128, 156, 14, 14), (30576, 196, 14, 1))
buf516 = buf499; del buf499 # reuse
buf512 = as_strided(buf516, (128, 156, 14, 14), (122304, 196, 14, 1)) # alias
triton_poi_fused_cat_127.run(buf502, buf512, 3913728, grid=grid(3913728), stream=stream0)
buf513 = as_strided(buf516, (128, 156, 14, 14), (122304, 196, 14, 1), 30576) # alias
triton_poi_fused_cat_127.run(buf505, buf513, 3913728, grid=grid(3913728), stream=stream0)
buf514 = as_strided(buf516, (128, 156, 14, 14), (122304, 196, 14, 1), 61152) # alias
triton_poi_fused_cat_127.run(buf508, buf514, 3913728, grid=grid(3913728), stream=stream0)
buf515 = as_strided(buf516, (128, 156, 14, 14), (122304, 196, 14, 1), 91728) # alias
triton_poi_fused_cat_127.run(buf511, buf515, 3913728, grid=grid(3913728), stream=stream0)
buf517 = buf495; del buf495 # reuse
buf518 = buf517; del buf517 # reuse
buf521 = empty_strided((624, ), (1, ), device='cuda', dtype=torch.float32)
buf519 = empty_strided((1, 624, 1, 1), (624, 1, 624, 624), device='cuda', dtype=torch.float32)
buf520 = empty_strided((624, ), (1, ), device='cuda', dtype=torch.float32)
buf522 = empty_strided((624, ), (1, ), device='cuda', dtype=torch.float32)
triton_red_fused__native_batch_norm_legit_functional_117.run(buf518, buf516, primals_391, primals_392, buf521, buf519, buf520, buf522, 624, 25088, grid=grid(624), stream=stream0)
del primals_391
del primals_392
buf523 = empty_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda', dtype=torch.float16)
buf525 = empty_strided((128, 624, 1, 1), (624, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_per_fused__native_batch_norm_legit_functional_mean_silu_128.run(buf516, buf518, buf519, primals_57, primals_58, buf523, buf525, 79872, 196, grid=grid(79872), stream=stream0)
del primals_58
buf526 = empty_strided((26, 624, 1, 1), (624, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_129.run(primals_199, buf526, 16224, grid=grid(16224), stream=stream0)
del primals_199
buf527 = buf459; del buf459 # reuse
triton_poi_fused__to_copy_convolution_130.run(primals_200, buf527, 26, grid=grid(26), stream=stream0)
del primals_200
buf528 = extern_kernels.convolution(buf525, buf526, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None)
assert_size_stride(buf528, (128, 26, 1, 1), (26, 1, 1, 1))
buf529 = buf528; del buf528 # reuse
buf530 = empty_strided((128, 26, 1, 1), (26, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_convolution_silu_131.run(buf529, buf527, buf530, 3328, grid=grid(3328), stream=stream0)
buf531 = empty_strided((624, 26, 1, 1), (26, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_132.run(primals_201, buf531, 16224, grid=grid(16224), stream=stream0)
del primals_201
buf532 = buf464; del buf464 # reuse
triton_poi_fused__to_copy_convolution_133.run(primals_202, buf532, 624, grid=grid(624), stream=stream0)
del primals_202
buf533 = extern_kernels.convolution(buf530, buf531, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None)
assert_size_stride(buf533, (128, 624, 1, 1), (624, 1, 1, 1))
buf534 = buf533; del buf533 # reuse
triton_poi_fused__to_copy_convolution_134.run(buf534, buf532, 79872, grid=grid(79872), stream=stream0)
buf535 = empty_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda', dtype=torch.float16)
triton_poi_fused_mul_sigmoid_silu_135.run(buf523, buf534, buf535, 15654912, grid=grid(15654912), stream=stream0)
buf536 = buf489; del buf489 # reuse
triton_poi_fused_split_with_sizes_136.run(buf535, buf536, 7827456, grid=grid(7827456), stream=stream0)
buf537 = buf487; del buf487 # reuse
triton_poi_fused_split_with_sizes_137.run(buf535, buf537, 7827456, grid=grid(7827456), stream=stream0)
buf538 = empty_strided((52, 312, 1, 1), (312, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_138.run(primals_203, buf538, 16224, grid=grid(16224), stream=stream0)
del primals_203
buf539 = extern_kernels.convolution(buf536, buf538, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None)
assert_size_stride(buf539, (128, 52, 14, 14), (10192, 196, 14, 1))
buf540 = empty_strided((52, 312, 1, 1), (312, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_138.run(primals_204, buf540, 16224, grid=grid(16224), stream=stream0)
del primals_204
buf541 = extern_kernels.convolution(buf537, buf540, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None)
assert_size_stride(buf541, (128, 52, 14, 14), (10192, 196, 14, 1))
buf544 = empty_strided((128, 104, 14, 14), (20384, 196, 14, 1), device='cuda', dtype=torch.float16)
buf542 = as_strided(buf544, (128, 52, 14, 14), (20384, 196, 14, 1)) # alias
triton_poi_fused_cat_139.run(buf539, buf542, 1304576, grid=grid(1304576), stream=stream0)
del buf539
buf543 = as_strided(buf544, (128, 52, 14, 14), (20384, 196, 14, 1), 10192) # alias
triton_poi_fused_cat_139.run(buf541, buf543, 1304576, grid=grid(1304576), stream=stream0)
del buf541
buf545 = buf480; del buf480 # reuse
triton_red_fused__native_batch_norm_legit_functional_110.run(buf544, buf545, 416, 6272, grid=grid(416), stream=stream0)
buf546 = buf481; del buf481 # reuse
buf547 = buf546; del buf546 # reuse
buf551 = empty_strided((104, ), (1, ), device='cuda', dtype=torch.float32)
triton_per_fused__native_batch_norm_legit_functional_111.run(buf547, buf545, primals_394, buf551, 104, 4, grid=grid(104), stream=stream0)
del primals_394
buf548 = buf545; del buf545 # reuse
triton_red_fused__native_batch_norm_legit_functional_112.run(buf544, buf547, buf548, 416, 6272, grid=grid(416), stream=stream0)
buf549 = empty_strided((1, 104, 1, 1), (104, 1, 104, 104), device='cuda', dtype=torch.float32)
buf550 = empty_strided((104, ), (1, ), device='cuda', dtype=torch.float32)
buf552 = empty_strided((104, ), (1, ), device='cuda', dtype=torch.float32)
triton_per_fused__native_batch_norm_legit_functional_113.run(buf548, primals_395, buf549, buf550, buf552, 104, 4, grid=grid(104), stream=stream0)
del primals_395
buf553 = empty_strided((128, 104, 14, 14), (20384, 196, 14, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__native_batch_norm_legit_functional_add_140.run(buf544, buf547, buf549, primals_59, primals_60, buf485, buf553, 2609152, grid=grid(2609152), stream=stream0)
del primals_60
buf554 = empty_strided((312, 52, 1, 1), (52, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_115.run(primals_205, buf554, 16224, grid=grid(16224), stream=stream0)
del primals_205
buf555 = extern_kernels.convolution(as_strided(buf553, (128, 52, 14, 14), (20384, 196, 14, 1)), buf554, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None)
assert_size_stride(buf555, (128, 312, 14, 14), (61152, 196, 14, 1))
buf556 = empty_strided((312, 52, 1, 1), (52, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_115.run(primals_206, buf556, 16224, grid=grid(16224), stream=stream0)
del primals_206
buf557 = extern_kernels.convolution(as_strided(buf553, (128, 52, 14, 14), (20384, 196, 14, 1), 10192), buf556, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None)
assert_size_stride(buf557, (128, 312, 14, 14), (61152, 196, 14, 1))
buf560 = buf535; del buf535 # reuse
buf558 = as_strided(buf560, (128, 312, 14, 14), (122304, 196, 14, 1)) # alias
triton_poi_fused_cat_116.run(buf555, buf558, 7827456, grid=grid(7827456), stream=stream0)
buf559 = as_strided(buf560, (128, 312, 14, 14), (122304, 196, 14, 1), 61152) # alias
triton_poi_fused_cat_116.run(buf557, buf559, 7827456, grid=grid(7827456), stream=stream0)
buf561 = buf519; del buf519 # reuse
buf562 = buf561; del buf561 # reuse
buf565 = empty_strided((624, ), (1, ), device='cuda', dtype=torch.float32)
buf563 = empty_strided((1, 624, 1, 1), (624, 1, 624, 624), device='cuda', dtype=torch.float32)
buf564 = empty_strided((624, ), (1, ), device='cuda', dtype=torch.float32)
buf566 = empty_strided((624, ), (1, ), device='cuda', dtype=torch.float32)
triton_red_fused__native_batch_norm_legit_functional_117.run(buf562, buf560, primals_397, primals_398, buf565, buf563, buf564, buf566, 624, 25088, grid=grid(624), stream=stream0)
del primals_397
del primals_398
buf567 = empty_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda', dtype=torch.float16)
buf1138 = empty_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__native_batch_norm_legit_functional_add_clone_fill_mul_sigmoid_sub_118.run(buf560, buf562, buf563, primals_61, primals_62, buf567, buf1138, 15654912, grid=grid(15654912), stream=stream0)
del primals_62
buf568 = empty_strided((156, 1, 3, 3), (9, 9, 3, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_119.run(primals_207, buf568, 1404, grid=grid(1404), stream=stream0)
del primals_207
buf569 = buf511; del buf511 # reuse
triton_poi_fused_split_with_sizes_120.run(buf567, buf569, 3913728, grid=grid(3913728), stream=stream0)
buf570 = extern_kernels.convolution(buf569, buf568, stride=(1, 1), padding=(1, 1), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=156, bias=None)
assert_size_stride(buf570, (128, 156, 14, 14), (30576, 196, 14, 1))
buf571 = empty_strided((156, 1, 5, 5), (25, 25, 5, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_121.run(primals_208, buf571, 3900, grid=grid(3900), stream=stream0)
del primals_208
buf572 = buf508; del buf508 # reuse
triton_poi_fused_split_with_sizes_122.run(buf567, buf572, 3913728, grid=grid(3913728), stream=stream0)
buf573 = extern_kernels.convolution(buf572, buf571, stride=(1, 1), padding=(2, 2), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=156, bias=None)
assert_size_stride(buf573, (128, 156, 14, 14), (30576, 196, 14, 1))
buf574 = empty_strided((156, 1, 7, 7), (49, 49, 7, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_123.run(primals_209, buf574, 7644, grid=grid(7644), stream=stream0)
del primals_209
buf575 = buf505; del buf505 # reuse
triton_poi_fused_split_with_sizes_124.run(buf567, buf575, 3913728, grid=grid(3913728), stream=stream0)
buf576 = extern_kernels.convolution(buf575, buf574, stride=(1, 1), padding=(3, 3), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=156, bias=None)
assert_size_stride(buf576, (128, 156, 14, 14), (30576, 196, 14, 1))
buf577 = empty_strided((156, 1, 9, 9), (81, 81, 9, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_125.run(primals_210, buf577, 12636, grid=grid(12636), stream=stream0)
del primals_210
buf578 = buf502; del buf502 # reuse
triton_poi_fused_split_with_sizes_126.run(buf567, buf578, 3913728, grid=grid(3913728), stream=stream0)
buf579 = extern_kernels.convolution(buf578, buf577, stride=(1, 1), padding=(4, 4), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=156, bias=None)
assert_size_stride(buf579, (128, 156, 14, 14), (30576, 196, 14, 1))
buf584 = buf567; del buf567 # reuse
buf580 = as_strided(buf584, (128, 156, 14, 14), (122304, 196, 14, 1)) # alias
triton_poi_fused_cat_127.run(buf570, buf580, 3913728, grid=grid(3913728), stream=stream0)
del buf570
buf581 = as_strided(buf584, (128, 156, 14, 14), (122304, 196, 14, 1), 30576) # alias
triton_poi_fused_cat_127.run(buf573, buf581, 3913728, grid=grid(3913728), stream=stream0)
del buf573
buf582 = as_strided(buf584, (128, 156, 14, 14), (122304, 196, 14, 1), 61152) # alias
triton_poi_fused_cat_127.run(buf576, buf582, 3913728, grid=grid(3913728), stream=stream0)
del buf576
buf583 = as_strided(buf584, (128, 156, 14, 14), (122304, 196, 14, 1), 91728) # alias
triton_poi_fused_cat_127.run(buf579, buf583, 3913728, grid=grid(3913728), stream=stream0)
del buf579
buf585 = buf563; del buf563 # reuse
buf586 = buf585; del buf585 # reuse
buf589 = empty_strided((624, ), (1, ), device='cuda', dtype=torch.float32)
buf587 = empty_strided((1, 624, 1, 1), (624, 1, 624, 624), device='cuda', dtype=torch.float32)
buf588 = empty_strided((624, ), (1, ), device='cuda', dtype=torch.float32)
buf590 = empty_strided((624, ), (1, ), device='cuda', dtype=torch.float32)
triton_red_fused__native_batch_norm_legit_functional_117.run(buf586, buf584, primals_400, primals_401, buf589, buf587, buf588, buf590, 624, 25088, grid=grid(624), stream=stream0)
del primals_400
del primals_401
buf591 = empty_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda', dtype=torch.float16)
buf593 = empty_strided((128, 624, 1, 1), (624, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_per_fused__native_batch_norm_legit_functional_mean_silu_128.run(buf584, buf586, buf587, primals_63, primals_64, buf591, buf593, 79872, 196, grid=grid(79872), stream=stream0)
del primals_64
buf594 = empty_strided((26, 624, 1, 1), (624, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_129.run(primals_211, buf594, 16224, grid=grid(16224), stream=stream0)
del primals_211
buf595 = buf527; del buf527 # reuse
triton_poi_fused__to_copy_convolution_130.run(primals_212, buf595, 26, grid=grid(26), stream=stream0)
del primals_212
buf596 = extern_kernels.convolution(buf593, buf594, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None)
assert_size_stride(buf596, (128, 26, 1, 1), (26, 1, 1, 1))
buf597 = buf596; del buf596 # reuse
buf598 = empty_strided((128, 26, 1, 1), (26, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_convolution_silu_131.run(buf597, buf595, buf598, 3328, grid=grid(3328), stream=stream0)
del buf595
buf599 = empty_strided((624, 26, 1, 1), (26, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_132.run(primals_213, buf599, 16224, grid=grid(16224), stream=stream0)
del primals_213
buf600 = buf532; del buf532 # reuse
triton_poi_fused__to_copy_convolution_133.run(primals_214, buf600, 624, grid=grid(624), stream=stream0)
del primals_214
buf601 = extern_kernels.convolution(buf598, buf599, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None)
assert_size_stride(buf601, (128, 624, 1, 1), (624, 1, 1, 1))
buf602 = buf601; del buf601 # reuse
triton_poi_fused__to_copy_convolution_134.run(buf602, buf600, 79872, grid=grid(79872), stream=stream0)
buf603 = empty_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda', dtype=torch.float16)
triton_poi_fused_mul_sigmoid_silu_135.run(buf591, buf602, buf603, 15654912, grid=grid(15654912), stream=stream0)
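# buf591..buf603 form a squeeze-and-excitation block: the fused ..._mean_silu_128
# kernel emits both the SiLU activation buf591 and its per-channel global
# average buf593 (squeeze), two 1x1 convolutions reduce 624 -> 26 and expand
# 26 -> 624 with a SiLU in between (excite), and ..._mul_sigmoid_silu_135 gates
# buf591 with sigmoid(buf602). A hedged eager-mode sketch (conv_reduce and
# conv_expand are hypothetical names for the buf594/buf599 weights):
#   s = x.mean(dim=(2, 3), keepdim=True)           # squeeze
#   s = F.silu(conv_reduce(s))                     # 624 -> 26
#   x = x * torch.sigmoid(conv_expand(s))          # 26 -> 624, excite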
buf604 = buf557; del buf557 # reuse
triton_poi_fused_split_with_sizes_136.run(buf603, buf604, 7827456, grid=grid(7827456), stream=stream0)
buf605 = buf555; del buf555 # reuse
triton_poi_fused_split_with_sizes_137.run(buf603, buf605, 7827456, grid=grid(7827456), stream=stream0)
buf606 = empty_strided((52, 312, 1, 1), (312, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_138.run(primals_215, buf606, 16224, grid=grid(16224), stream=stream0)
del primals_215
buf607 = extern_kernels.convolution(buf604, buf606, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None)
assert_size_stride(buf607, (128, 52, 14, 14), (10192, 196, 14, 1))
buf608 = empty_strided((52, 312, 1, 1), (312, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_138.run(primals_216, buf608, 16224, grid=grid(16224), stream=stream0)
del primals_216
buf609 = extern_kernels.convolution(buf605, buf608, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None)
assert_size_stride(buf609, (128, 52, 14, 14), (10192, 196, 14, 1))
buf612 = empty_strided((128, 104, 14, 14), (20384, 196, 14, 1), device='cuda', dtype=torch.float16)
buf610 = as_strided(buf612, (128, 52, 14, 14), (20384, 196, 14, 1)) # alias
triton_poi_fused_cat_139.run(buf607, buf610, 1304576, grid=grid(1304576), stream=stream0)
del buf607
buf611 = as_strided(buf612, (128, 52, 14, 14), (20384, 196, 14, 1), 10192) # alias
triton_poi_fused_cat_139.run(buf609, buf611, 1304576, grid=grid(1304576), stream=stream0)
del buf609
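# Pointwise projection back down: buf603 (624 channels) is split into two
# 312-channel halves, each reduced by its own 52-channel 1x1 conv
# (buf606/buf608), and the results are concatenated into the 104-channel
# buf612 -- effectively a 2-group pointwise convolution 624 -> 104.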
buf613 = buf548; del buf548 # reuse
triton_red_fused__native_batch_norm_legit_functional_110.run(buf612, buf613, 416, 6272, grid=grid(416), stream=stream0)
buf614 = buf549; del buf549 # reuse
buf615 = buf614; del buf614 # reuse
buf619 = empty_strided((104, ), (1, ), device='cuda', dtype=torch.float32)
triton_per_fused__native_batch_norm_legit_functional_111.run(buf615, buf613, primals_403, buf619, 104, 4, grid=grid(104), stream=stream0)
del primals_403
buf616 = buf613; del buf613 # reuse
triton_red_fused__native_batch_norm_legit_functional_112.run(buf612, buf615, buf616, 416, 6272, grid=grid(416), stream=stream0)
buf617 = empty_strided((1, 104, 1, 1), (104, 1, 104, 104), device='cuda', dtype=torch.float32)
buf618 = empty_strided((104, ), (1, ), device='cuda', dtype=torch.float32)
buf620 = empty_strided((104, ), (1, ), device='cuda', dtype=torch.float32)
triton_per_fused__native_batch_norm_legit_functional_113.run(buf616, primals_404, buf617, buf618, buf620, 104, 4, grid=grid(104), stream=stream0)
del buf616
del primals_404
buf621 = empty_strided((128, 104, 14, 14), (20384, 196, 14, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__native_batch_norm_legit_functional_add_140.run(buf612, buf615, buf617, primals_65, primals_66, buf553, buf621, 2609152, grid=grid(2609152), stream=stream0)
del buf617
del primals_66
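# The fused ..._add_140 kernel batch-normalizes buf612 and adds buf553, the
# saved output of the previous block with the same (128, 104, 14, 14) shape --
# the residual skip connection of this inverted-residual block.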
buf622 = empty_strided((624, 104, 1, 1), (104, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_141.run(primals_217, buf622, 64896, grid=grid(64896), stream=stream0)
del primals_217
buf623 = extern_kernels.convolution(buf621, buf622, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None)
assert_size_stride(buf623, (128, 624, 14, 14), (122304, 196, 14, 1))
buf624 = buf587; del buf587 # reuse
buf625 = buf624; del buf624 # reuse
buf628 = empty_strided((624, ), (1, ), device='cuda', dtype=torch.float32)
buf626 = empty_strided((1, 624, 1, 1), (624, 1, 624, 624), device='cuda', dtype=torch.float32)
buf627 = empty_strided((624, ), (1, ), device='cuda', dtype=torch.float32)
buf629 = empty_strided((624, ), (1, ), device='cuda', dtype=torch.float32)
triton_red_fused__native_batch_norm_legit_functional_117.run(buf625, buf623, primals_406, primals_407, buf628, buf626, buf627, buf629, 624, 25088, grid=grid(624), stream=stream0)
del primals_406
del primals_407
buf631 = buf603; del buf603 # reuse
buf1136 = empty_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__native_batch_norm_legit_functional_add_clone_fill_mul_sigmoid_silu_sub_142.run(buf623, buf625, buf626, primals_67, primals_68, buf631, buf1136, 15654912, grid=grid(15654912), stream=stream0)
del primals_68
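# Note the second output buf1136: the kernels whose names contain
# clone_fill_..._sub emit an extra fp16 tensor (buf1126..buf1136 in this
# region) that the forward code below never reads; these appear to be
# activations stashed for the backward graph.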
buf632 = empty_strided((624, 1, 3, 3), (9, 9, 3, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_143.run(primals_218, buf632, 5616, grid=grid(5616), stream=stream0)
del primals_218
buf633 = extern_kernels.convolution(buf631, buf632, stride=(1, 1), padding=(1, 1), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=624, bias=None)
assert_size_stride(buf633, (128, 624, 14, 14), (122304, 196, 14, 1))
buf634 = buf626; del buf626 # reuse
buf635 = buf634; del buf634 # reuse
buf638 = empty_strided((624, ), (1, ), device='cuda', dtype=torch.float32)
buf636 = empty_strided((1, 624, 1, 1), (624, 1, 624, 624), device='cuda', dtype=torch.float32)
buf637 = empty_strided((624, ), (1, ), device='cuda', dtype=torch.float32)
buf639 = empty_strided((624, ), (1, ), device='cuda', dtype=torch.float32)
triton_red_fused__native_batch_norm_legit_functional_117.run(buf635, buf633, primals_409, primals_410, buf638, buf636, buf637, buf639, 624, 25088, grid=grid(624), stream=stream0)
del primals_409
del primals_410
buf640 = empty_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda', dtype=torch.float16)
buf642 = empty_strided((128, 624, 1, 1), (624, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_per_fused__native_batch_norm_legit_functional_mean_silu_128.run(buf633, buf635, buf636, primals_69, primals_70, buf640, buf642, 79872, 196, grid=grid(79872), stream=stream0)
del buf636
del primals_70
buf643 = empty_strided((52, 624, 1, 1), (624, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_144.run(primals_219, buf643, 32448, grid=grid(32448), stream=stream0)
del primals_219
buf644 = empty_strided((52, ), (1, ), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_convolution_145.run(primals_220, buf644, 52, grid=grid(52), stream=stream0)
del primals_220
buf645 = extern_kernels.convolution(buf642, buf643, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None)
assert_size_stride(buf645, (128, 52, 1, 1), (52, 1, 1, 1))
buf646 = buf645; del buf645 # reuse
buf647 = empty_strided((128, 52, 1, 1), (52, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_convolution_silu_146.run(buf646, buf644, buf647, 6656, grid=grid(6656), stream=stream0)
del buf644
buf648 = empty_strided((624, 52, 1, 1), (52, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_147.run(primals_221, buf648, 32448, grid=grid(32448), stream=stream0)
del primals_221
buf649 = buf600; del buf600 # reuse
triton_poi_fused__to_copy_convolution_133.run(primals_222, buf649, 624, grid=grid(624), stream=stream0)
del primals_222
buf650 = extern_kernels.convolution(buf647, buf648, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None)
assert_size_stride(buf650, (128, 624, 1, 1), (624, 1, 1, 1))
buf651 = buf650; del buf650 # reuse
triton_poi_fused__to_copy_convolution_134.run(buf651, buf649, 79872, grid=grid(79872), stream=stream0)
del buf649
buf652 = empty_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda', dtype=torch.float16)
triton_poi_fused_mul_sigmoid_silu_135.run(buf640, buf651, buf652, 15654912, grid=grid(15654912), stream=stream0)
buf653 = empty_strided((160, 624, 1, 1), (624, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_148.run(primals_223, buf653, 99840, grid=grid(99840), stream=stream0)
del primals_223
buf654 = extern_kernels.convolution(buf652, buf653, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None)
assert_size_stride(buf654, (128, 160, 14, 14), (31360, 196, 14, 1))
buf655 = empty_strided((1, 160, 1, 1, 4), (640, 1, 640, 640, 160), device='cuda', dtype=torch.float32)
triton_red_fused__native_batch_norm_legit_functional_149.run(buf654, buf655, 640, 6272, grid=grid(640), stream=stream0)
buf656 = empty_strided((1, 160, 1, 1), (160, 1, 160, 160), device='cuda', dtype=torch.float32)
buf657 = buf656; del buf656 # reuse
buf661 = empty_strided((160, ), (1, ), device='cuda', dtype=torch.float32)
triton_per_fused__native_batch_norm_legit_functional_150.run(buf657, buf655, primals_412, buf661, 160, 4, grid=grid(160), stream=stream0)
del primals_412
buf658 = buf655; del buf655 # reuse
triton_red_fused__native_batch_norm_legit_functional_151.run(buf654, buf657, buf658, 640, 6272, grid=grid(640), stream=stream0)
buf659 = empty_strided((1, 160, 1, 1), (160, 1, 160, 160), device='cuda', dtype=torch.float32)
buf660 = empty_strided((160, ), (1, ), device='cuda', dtype=torch.float32)
buf662 = empty_strided((160, ), (1, ), device='cuda', dtype=torch.float32)
triton_per_fused__native_batch_norm_legit_functional_152.run(buf658, primals_413, buf659, buf660, buf662, 160, 4, grid=grid(160), stream=stream0)
del primals_413
buf663 = empty_strided((128, 160, 14, 14), (31360, 196, 14, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__native_batch_norm_legit_functional_153.run(buf654, buf657, buf659, primals_71, primals_72, buf663, 4014080, grid=grid(4014080), stream=stream0)
del primals_72
buf664 = empty_strided((240, 80, 1, 1), (80, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_154.run(primals_224, buf664, 19200, grid=grid(19200), stream=stream0)
del primals_224
buf665 = extern_kernels.convolution(as_strided(buf663, (128, 80, 14, 14), (31360, 196, 14, 1)), buf664, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None)
assert_size_stride(buf665, (128, 240, 14, 14), (47040, 196, 14, 1))
buf666 = empty_strided((240, 80, 1, 1), (80, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_154.run(primals_225, buf666, 19200, grid=grid(19200), stream=stream0)
del primals_225
buf667 = extern_kernels.convolution(as_strided(buf663, (128, 80, 14, 14), (31360, 196, 14, 1), 15680), buf666, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None)
assert_size_stride(buf667, (128, 240, 14, 14), (47040, 196, 14, 1))
buf670 = empty_strided((128, 480, 14, 14), (94080, 196, 14, 1), device='cuda', dtype=torch.float16)
buf668 = as_strided(buf670, (128, 240, 14, 14), (94080, 196, 14, 1)) # alias
triton_poi_fused_cat_155.run(buf665, buf668, 6021120, grid=grid(6021120), stream=stream0)
buf669 = as_strided(buf670, (128, 240, 14, 14), (94080, 196, 14, 1), 47040) # alias
triton_poi_fused_cat_155.run(buf667, buf669, 6021120, grid=grid(6021120), stream=stream0)
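# Mirror of the grouped reduction above: buf663 (160 channels) is viewed as two
# 80-channel halves via as_strided (offsets 0 and 15680 = 80 * 196), each
# expanded by its own 240-channel 1x1 conv, and concatenated into the
# 480-channel buf670 -- a 2-group pointwise expansion 160 -> 480.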
buf671 = as_strided(buf108, (1, 480, 1, 1), (480, 1, 480, 480)); del buf108 # reuse
buf672 = buf671; del buf671 # reuse
buf675 = empty_strided((480, ), (1, ), device='cuda', dtype=torch.float32)
buf673 = empty_strided((1, 480, 1, 1), (480, 1, 480, 480), device='cuda', dtype=torch.float32)
buf674 = empty_strided((480, ), (1, ), device='cuda', dtype=torch.float32)
buf676 = empty_strided((480, ), (1, ), device='cuda', dtype=torch.float32)
triton_red_fused__native_batch_norm_legit_functional_156.run(buf672, buf670, primals_415, primals_416, buf675, buf673, buf674, buf676, 480, 25088, grid=grid(480), stream=stream0)
del primals_415
del primals_416
buf677 = empty_strided((128, 480, 14, 14), (94080, 196, 14, 1), device='cuda', dtype=torch.float16)
buf1134 = empty_strided((128, 480, 14, 14), (94080, 196, 14, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__native_batch_norm_legit_functional_add_clone_fill_mul_sigmoid_sub_157.run(buf670, buf672, buf673, primals_73, primals_74, buf677, buf1134, 12042240, grid=grid(12042240), stream=stream0)
del primals_74
buf678 = empty_strided((120, 1, 3, 3), (9, 9, 3, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_42.run(primals_226, buf678, 1080, grid=grid(1080), stream=stream0)
del primals_226
buf679 = empty_strided((128, 120, 14, 14), (23520, 196, 14, 1), device='cuda', dtype=torch.float16)
triton_poi_fused_split_with_sizes_158.run(buf677, buf679, 3010560, grid=grid(3010560), stream=stream0)
buf680 = extern_kernels.convolution(buf679, buf678, stride=(1, 1), padding=(1, 1), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=120, bias=None)
assert_size_stride(buf680, (128, 120, 14, 14), (23520, 196, 14, 1))
buf681 = empty_strided((120, 1, 5, 5), (25, 25, 5, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_159.run(primals_227, buf681, 3000, grid=grid(3000), stream=stream0)
del primals_227
buf682 = empty_strided((128, 120, 14, 14), (23520, 196, 14, 1), device='cuda', dtype=torch.float16)
triton_poi_fused_split_with_sizes_160.run(buf677, buf682, 3010560, grid=grid(3010560), stream=stream0)
buf683 = extern_kernels.convolution(buf682, buf681, stride=(1, 1), padding=(2, 2), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=120, bias=None)
assert_size_stride(buf683, (128, 120, 14, 14), (23520, 196, 14, 1))
buf684 = empty_strided((120, 1, 7, 7), (49, 49, 7, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_161.run(primals_228, buf684, 5880, grid=grid(5880), stream=stream0)
del primals_228
buf685 = empty_strided((128, 120, 14, 14), (23520, 196, 14, 1), device='cuda', dtype=torch.float16)
triton_poi_fused_split_with_sizes_162.run(buf677, buf685, 3010560, grid=grid(3010560), stream=stream0)
buf686 = extern_kernels.convolution(buf685, buf684, stride=(1, 1), padding=(3, 3), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=120, bias=None)
assert_size_stride(buf686, (128, 120, 14, 14), (23520, 196, 14, 1))
buf687 = empty_strided((120, 1, 9, 9), (81, 81, 9, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_163.run(primals_229, buf687, 9720, grid=grid(9720), stream=stream0)
del primals_229
buf688 = empty_strided((128, 120, 14, 14), (23520, 196, 14, 1), device='cuda', dtype=torch.float16)
triton_poi_fused_split_with_sizes_164.run(buf677, buf688, 3010560, grid=grid(3010560), stream=stream0)
buf689 = extern_kernels.convolution(buf688, buf687, stride=(1, 1), padding=(4, 4), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=120, bias=None)
assert_size_stride(buf689, (128, 120, 14, 14), (23520, 196, 14, 1))
buf694 = buf677; del buf677 # reuse
buf690 = as_strided(buf694, (128, 120, 14, 14), (94080, 196, 14, 1)) # alias
triton_poi_fused_cat_165.run(buf680, buf690, 3010560, grid=grid(3010560), stream=stream0)
buf691 = as_strided(buf694, (128, 120, 14, 14), (94080, 196, 14, 1), 23520) # alias
triton_poi_fused_cat_165.run(buf683, buf691, 3010560, grid=grid(3010560), stream=stream0)
buf692 = as_strided(buf694, (128, 120, 14, 14), (94080, 196, 14, 1), 47040) # alias
triton_poi_fused_cat_165.run(buf686, buf692, 3010560, grid=grid(3010560), stream=stream0)
buf693 = as_strided(buf694, (128, 120, 14, 14), (94080, 196, 14, 1), 70560) # alias
triton_poi_fused_cat_165.run(buf689, buf693, 3010560, grid=grid(3010560), stream=stream0)
buf695 = buf673; del buf673 # reuse
buf696 = buf695; del buf695 # reuse
buf699 = empty_strided((480, ), (1, ), device='cuda', dtype=torch.float32)
buf697 = empty_strided((1, 480, 1, 1), (480, 1, 480, 480), device='cuda', dtype=torch.float32)
buf698 = empty_strided((480, ), (1, ), device='cuda', dtype=torch.float32)
buf700 = empty_strided((480, ), (1, ), device='cuda', dtype=torch.float32)
triton_red_fused__native_batch_norm_legit_functional_156.run(buf696, buf694, primals_418, primals_419, buf699, buf697, buf698, buf700, 480, 25088, grid=grid(480), stream=stream0)
del primals_418
del primals_419
buf701 = empty_strided((128, 480, 14, 14), (94080, 196, 14, 1), device='cuda', dtype=torch.float16)
buf703 = empty_strided((128, 480, 1, 1), (480, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_per_fused__native_batch_norm_legit_functional_mean_silu_166.run(buf694, buf696, buf697, primals_75, primals_76, buf701, buf703, 61440, 196, grid=grid(61440), stream=stream0)
del primals_76
buf704 = empty_strided((80, 480, 1, 1), (480, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_167.run(primals_230, buf704, 38400, grid=grid(38400), stream=stream0)
del primals_230
buf705 = empty_strided((80, ), (1, ), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_convolution_168.run(primals_231, buf705, 80, grid=grid(80), stream=stream0)
del primals_231
buf706 = extern_kernels.convolution(buf703, buf704, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None)
assert_size_stride(buf706, (128, 80, 1, 1), (80, 1, 1, 1))
buf707 = buf706; del buf706 # reuse
buf708 = empty_strided((128, 80, 1, 1), (80, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_convolution_silu_169.run(buf707, buf705, buf708, 10240, grid=grid(10240), stream=stream0)
buf709 = empty_strided((480, 80, 1, 1), (80, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_170.run(primals_232, buf709, 38400, grid=grid(38400), stream=stream0)
del primals_232
buf710 = empty_strided((480, ), (1, ), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_convolution_171.run(primals_233, buf710, 480, grid=grid(480), stream=stream0)
del primals_233
buf711 = extern_kernels.convolution(buf708, buf709, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None)
assert_size_stride(buf711, (128, 480, 1, 1), (480, 1, 1, 1))
buf712 = buf711; del buf711 # reuse
triton_poi_fused__to_copy_convolution_172.run(buf712, buf710, 61440, grid=grid(61440), stream=stream0)
buf713 = empty_strided((128, 480, 14, 14), (94080, 196, 14, 1), device='cuda', dtype=torch.float16)
triton_poi_fused_mul_sigmoid_silu_173.run(buf701, buf712, buf713, 12042240, grid=grid(12042240), stream=stream0)
buf714 = buf667; del buf667 # reuse
triton_poi_fused_split_with_sizes_174.run(buf713, buf714, 6021120, grid=grid(6021120), stream=stream0)
buf715 = buf665; del buf665 # reuse
triton_poi_fused_split_with_sizes_175.run(buf713, buf715, 6021120, grid=grid(6021120), stream=stream0)
buf716 = empty_strided((80, 240, 1, 1), (240, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_176.run(primals_234, buf716, 19200, grid=grid(19200), stream=stream0)
del primals_234
buf717 = extern_kernels.convolution(buf714, buf716, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None)
assert_size_stride(buf717, (128, 80, 14, 14), (15680, 196, 14, 1))
buf718 = empty_strided((80, 240, 1, 1), (240, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_176.run(primals_235, buf718, 19200, grid=grid(19200), stream=stream0)
del primals_235
buf719 = extern_kernels.convolution(buf715, buf718, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None)
assert_size_stride(buf719, (128, 80, 14, 14), (15680, 196, 14, 1))
buf722 = empty_strided((128, 160, 14, 14), (31360, 196, 14, 1), device='cuda', dtype=torch.float16)
buf720 = as_strided(buf722, (128, 80, 14, 14), (31360, 196, 14, 1)) # alias
triton_poi_fused_cat_177.run(buf717, buf720, 2007040, grid=grid(2007040), stream=stream0)
del buf717
buf721 = as_strided(buf722, (128, 80, 14, 14), (31360, 196, 14, 1), 15680) # alias
triton_poi_fused_cat_177.run(buf719, buf721, 2007040, grid=grid(2007040), stream=stream0)
del buf719
buf723 = buf658; del buf658 # reuse
triton_red_fused__native_batch_norm_legit_functional_149.run(buf722, buf723, 640, 6272, grid=grid(640), stream=stream0)
buf724 = buf659; del buf659 # reuse
buf725 = buf724; del buf724 # reuse
buf729 = empty_strided((160, ), (1, ), device='cuda', dtype=torch.float32)
triton_per_fused__native_batch_norm_legit_functional_150.run(buf725, buf723, primals_421, buf729, 160, 4, grid=grid(160), stream=stream0)
del primals_421
buf726 = buf723; del buf723 # reuse
triton_red_fused__native_batch_norm_legit_functional_151.run(buf722, buf725, buf726, 640, 6272, grid=grid(640), stream=stream0)
buf727 = empty_strided((1, 160, 1, 1), (160, 1, 160, 160), device='cuda', dtype=torch.float32)
buf728 = empty_strided((160, ), (1, ), device='cuda', dtype=torch.float32)
buf730 = empty_strided((160, ), (1, ), device='cuda', dtype=torch.float32)
triton_per_fused__native_batch_norm_legit_functional_152.run(buf726, primals_422, buf727, buf728, buf730, 160, 4, grid=grid(160), stream=stream0)
del primals_422
buf731 = empty_strided((128, 160, 14, 14), (31360, 196, 14, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__native_batch_norm_legit_functional_add_178.run(buf722, buf725, buf727, primals_77, primals_78, buf663, buf731, 4014080, grid=grid(4014080), stream=stream0)
del primals_78
buf732 = empty_strided((240, 80, 1, 1), (80, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_154.run(primals_236, buf732, 19200, grid=grid(19200), stream=stream0)
del primals_236
buf733 = extern_kernels.convolution(as_strided(buf731, (128, 80, 14, 14), (31360, 196, 14, 1)), buf732, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None)
assert_size_stride(buf733, (128, 240, 14, 14), (47040, 196, 14, 1))
buf734 = empty_strided((240, 80, 1, 1), (80, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_154.run(primals_237, buf734, 19200, grid=grid(19200), stream=stream0)
del primals_237
buf735 = extern_kernels.convolution(as_strided(buf731, (128, 80, 14, 14), (31360, 196, 14, 1), 15680), buf734, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None)
assert_size_stride(buf735, (128, 240, 14, 14), (47040, 196, 14, 1))
buf738 = buf713; del buf713 # reuse
buf736 = as_strided(buf738, (128, 240, 14, 14), (94080, 196, 14, 1)) # alias
triton_poi_fused_cat_155.run(buf733, buf736, 6021120, grid=grid(6021120), stream=stream0)
buf737 = as_strided(buf738, (128, 240, 14, 14), (94080, 196, 14, 1), 47040) # alias
triton_poi_fused_cat_155.run(buf735, buf737, 6021120, grid=grid(6021120), stream=stream0)
buf739 = buf697; del buf697 # reuse
buf740 = buf739; del buf739 # reuse
buf743 = empty_strided((480, ), (1, ), device='cuda', dtype=torch.float32)
buf741 = empty_strided((1, 480, 1, 1), (480, 1, 480, 480), device='cuda', dtype=torch.float32)
buf742 = empty_strided((480, ), (1, ), device='cuda', dtype=torch.float32)
buf744 = empty_strided((480, ), (1, ), device='cuda', dtype=torch.float32)
triton_red_fused__native_batch_norm_legit_functional_156.run(buf740, buf738, primals_424, primals_425, buf743, buf741, buf742, buf744, 480, 25088, grid=grid(480), stream=stream0)
del primals_424
del primals_425
buf745 = empty_strided((128, 480, 14, 14), (94080, 196, 14, 1), device='cuda', dtype=torch.float16)
buf1132 = empty_strided((128, 480, 14, 14), (94080, 196, 14, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__native_batch_norm_legit_functional_add_clone_fill_mul_sigmoid_sub_157.run(buf738, buf740, buf741, primals_79, primals_80, buf745, buf1132, 12042240, grid=grid(12042240), stream=stream0)
del primals_80
buf746 = empty_strided((120, 1, 3, 3), (9, 9, 3, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_42.run(primals_238, buf746, 1080, grid=grid(1080), stream=stream0)
del primals_238
buf747 = buf689; del buf689 # reuse
triton_poi_fused_split_with_sizes_158.run(buf745, buf747, 3010560, grid=grid(3010560), stream=stream0)
buf748 = extern_kernels.convolution(buf747, buf746, stride=(1, 1), padding=(1, 1), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=120, bias=None)
assert_size_stride(buf748, (128, 120, 14, 14), (23520, 196, 14, 1))
buf749 = empty_strided((120, 1, 5, 5), (25, 25, 5, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_159.run(primals_239, buf749, 3000, grid=grid(3000), stream=stream0)
del primals_239
buf750 = buf686; del buf686 # reuse
triton_poi_fused_split_with_sizes_160.run(buf745, buf750, 3010560, grid=grid(3010560), stream=stream0)
buf751 = extern_kernels.convolution(buf750, buf749, stride=(1, 1), padding=(2, 2), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=120, bias=None)
assert_size_stride(buf751, (128, 120, 14, 14), (23520, 196, 14, 1))
buf752 = empty_strided((120, 1, 7, 7), (49, 49, 7, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_161.run(primals_240, buf752, 5880, grid=grid(5880), stream=stream0)
del primals_240
buf753 = buf683; del buf683 # reuse
triton_poi_fused_split_with_sizes_162.run(buf745, buf753, 3010560, grid=grid(3010560), stream=stream0)
buf754 = extern_kernels.convolution(buf753, buf752, stride=(1, 1), padding=(3, 3), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=120, bias=None)
assert_size_stride(buf754, (128, 120, 14, 14), (23520, 196, 14, 1))
buf755 = empty_strided((120, 1, 9, 9), (81, 81, 9, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_163.run(primals_241, buf755, 9720, grid=grid(9720), stream=stream0)
del primals_241
buf756 = buf680; del buf680 # reuse
triton_poi_fused_split_with_sizes_164.run(buf745, buf756, 3010560, grid=grid(3010560), stream=stream0)
buf757 = extern_kernels.convolution(buf756, buf755, stride=(1, 1), padding=(4, 4), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=120, bias=None)
assert_size_stride(buf757, (128, 120, 14, 14), (23520, 196, 14, 1))
buf762 = buf745; del buf745 # reuse
buf758 = as_strided(buf762, (128, 120, 14, 14), (94080, 196, 14, 1)) # alias
triton_poi_fused_cat_165.run(buf748, buf758, 3010560, grid=grid(3010560), stream=stream0)
buf759 = as_strided(buf762, (128, 120, 14, 14), (94080, 196, 14, 1), 23520) # alias
triton_poi_fused_cat_165.run(buf751, buf759, 3010560, grid=grid(3010560), stream=stream0)
buf760 = as_strided(buf762, (128, 120, 14, 14), (94080, 196, 14, 1), 47040) # alias
triton_poi_fused_cat_165.run(buf754, buf760, 3010560, grid=grid(3010560), stream=stream0)
buf761 = as_strided(buf762, (128, 120, 14, 14), (94080, 196, 14, 1), 70560) # alias
triton_poi_fused_cat_165.run(buf757, buf761, 3010560, grid=grid(3010560), stream=stream0)
buf763 = buf741; del buf741 # reuse
buf764 = buf763; del buf763 # reuse
buf767 = empty_strided((480, ), (1, ), device='cuda', dtype=torch.float32)
buf765 = empty_strided((1, 480, 1, 1), (480, 1, 480, 480), device='cuda', dtype=torch.float32)
buf766 = empty_strided((480, ), (1, ), device='cuda', dtype=torch.float32)
buf768 = empty_strided((480, ), (1, ), device='cuda', dtype=torch.float32)
triton_red_fused__native_batch_norm_legit_functional_156.run(buf764, buf762, primals_427, primals_428, buf767, buf765, buf766, buf768, 480, 25088, grid=grid(480), stream=stream0)
del primals_427
del primals_428
buf769 = empty_strided((128, 480, 14, 14), (94080, 196, 14, 1), device='cuda', dtype=torch.float16)
buf771 = empty_strided((128, 480, 1, 1), (480, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_per_fused__native_batch_norm_legit_functional_mean_silu_166.run(buf762, buf764, buf765, primals_81, primals_82, buf769, buf771, 61440, 196, grid=grid(61440), stream=stream0)
del primals_82
buf772 = empty_strided((80, 480, 1, 1), (480, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_167.run(primals_242, buf772, 38400, grid=grid(38400), stream=stream0)
del primals_242
buf773 = buf705; del buf705 # reuse
triton_poi_fused__to_copy_convolution_168.run(primals_243, buf773, 80, grid=grid(80), stream=stream0)
del primals_243
buf774 = extern_kernels.convolution(buf771, buf772, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None)
assert_size_stride(buf774, (128, 80, 1, 1), (80, 1, 1, 1))
buf775 = buf774; del buf774 # reuse
buf776 = empty_strided((128, 80, 1, 1), (80, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_convolution_silu_169.run(buf775, buf773, buf776, 10240, grid=grid(10240), stream=stream0)
buf777 = empty_strided((480, 80, 1, 1), (80, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_170.run(primals_244, buf777, 38400, grid=grid(38400), stream=stream0)
del primals_244
buf778 = buf710; del buf710 # reuse
triton_poi_fused__to_copy_convolution_171.run(primals_245, buf778, 480, grid=grid(480), stream=stream0)
del primals_245
buf779 = extern_kernels.convolution(buf776, buf777, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None)
assert_size_stride(buf779, (128, 480, 1, 1), (480, 1, 1, 1))
buf780 = buf779; del buf779 # reuse
triton_poi_fused__to_copy_convolution_172.run(buf780, buf778, 61440, grid=grid(61440), stream=stream0)
buf781 = empty_strided((128, 480, 14, 14), (94080, 196, 14, 1), device='cuda', dtype=torch.float16)
triton_poi_fused_mul_sigmoid_silu_173.run(buf769, buf780, buf781, 12042240, grid=grid(12042240), stream=stream0)
buf782 = buf735; del buf735 # reuse
triton_poi_fused_split_with_sizes_174.run(buf781, buf782, 6021120, grid=grid(6021120), stream=stream0)
buf783 = buf733; del buf733 # reuse
triton_poi_fused_split_with_sizes_175.run(buf781, buf783, 6021120, grid=grid(6021120), stream=stream0)
buf784 = empty_strided((80, 240, 1, 1), (240, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_176.run(primals_246, buf784, 19200, grid=grid(19200), stream=stream0)
del primals_246
buf785 = extern_kernels.convolution(buf782, buf784, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None)
assert_size_stride(buf785, (128, 80, 14, 14), (15680, 196, 14, 1))
buf786 = empty_strided((80, 240, 1, 1), (240, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_176.run(primals_247, buf786, 19200, grid=grid(19200), stream=stream0)
del primals_247
buf787 = extern_kernels.convolution(buf783, buf786, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None)
assert_size_stride(buf787, (128, 80, 14, 14), (15680, 196, 14, 1))
buf790 = empty_strided((128, 160, 14, 14), (31360, 196, 14, 1), device='cuda', dtype=torch.float16)
buf788 = as_strided(buf790, (128, 80, 14, 14), (31360, 196, 14, 1)) # alias
triton_poi_fused_cat_177.run(buf785, buf788, 2007040, grid=grid(2007040), stream=stream0)
del buf785
buf789 = as_strided(buf790, (128, 80, 14, 14), (31360, 196, 14, 1), 15680) # alias
triton_poi_fused_cat_177.run(buf787, buf789, 2007040, grid=grid(2007040), stream=stream0)
del buf787
buf791 = buf726; del buf726 # reuse
triton_red_fused__native_batch_norm_legit_functional_149.run(buf790, buf791, 640, 6272, grid=grid(640), stream=stream0)
buf792 = buf727; del buf727 # reuse
buf793 = buf792; del buf792 # reuse
buf797 = empty_strided((160, ), (1, ), device='cuda', dtype=torch.float32)
triton_per_fused__native_batch_norm_legit_functional_150.run(buf793, buf791, primals_430, buf797, 160, 4, grid=grid(160), stream=stream0)
del primals_430
buf794 = buf791; del buf791 # reuse
triton_red_fused__native_batch_norm_legit_functional_151.run(buf790, buf793, buf794, 640, 6272, grid=grid(640), stream=stream0)
buf795 = empty_strided((1, 160, 1, 1), (160, 1, 160, 160), device='cuda', dtype=torch.float32)
buf796 = empty_strided((160, ), (1, ), device='cuda', dtype=torch.float32)
buf798 = empty_strided((160, ), (1, ), device='cuda', dtype=torch.float32)
triton_per_fused__native_batch_norm_legit_functional_152.run(buf794, primals_431, buf795, buf796, buf798, 160, 4, grid=grid(160), stream=stream0)
del primals_431
buf799 = empty_strided((128, 160, 14, 14), (31360, 196, 14, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__native_batch_norm_legit_functional_add_178.run(buf790, buf793, buf795, primals_83, primals_84, buf731, buf799, 4014080, grid=grid(4014080), stream=stream0)
del primals_84
buf800 = empty_strided((240, 80, 1, 1), (80, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_154.run(primals_248, buf800, 19200, grid=grid(19200), stream=stream0)
del primals_248
buf801 = extern_kernels.convolution(as_strided(buf799, (128, 80, 14, 14), (31360, 196, 14, 1)), buf800, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None)
assert_size_stride(buf801, (128, 240, 14, 14), (47040, 196, 14, 1))
buf802 = empty_strided((240, 80, 1, 1), (80, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_154.run(primals_249, buf802, 19200, grid=grid(19200), stream=stream0)
del primals_249
buf803 = extern_kernels.convolution(as_strided(buf799, (128, 80, 14, 14), (31360, 196, 14, 1), 15680), buf802, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None)
assert_size_stride(buf803, (128, 240, 14, 14), (47040, 196, 14, 1))
buf806 = buf781; del buf781 # reuse
buf804 = as_strided(buf806, (128, 240, 14, 14), (94080, 196, 14, 1)) # alias
triton_poi_fused_cat_155.run(buf801, buf804, 6021120, grid=grid(6021120), stream=stream0)
buf805 = as_strided(buf806, (128, 240, 14, 14), (94080, 196, 14, 1), 47040) # alias
triton_poi_fused_cat_155.run(buf803, buf805, 6021120, grid=grid(6021120), stream=stream0)
buf807 = buf765; del buf765 # reuse
buf808 = buf807; del buf807 # reuse
buf811 = empty_strided((480, ), (1, ), device='cuda', dtype=torch.float32)
buf809 = empty_strided((1, 480, 1, 1), (480, 1, 480, 480), device='cuda', dtype=torch.float32)
buf810 = empty_strided((480, ), (1, ), device='cuda', dtype=torch.float32)
buf812 = empty_strided((480, ), (1, ), device='cuda', dtype=torch.float32)
triton_red_fused__native_batch_norm_legit_functional_156.run(buf808, buf806, primals_433, primals_434, buf811, buf809, buf810, buf812, 480, 25088, grid=grid(480), stream=stream0)
del primals_433
del primals_434
buf813 = empty_strided((128, 480, 14, 14), (94080, 196, 14, 1), device='cuda', dtype=torch.float16)
buf1130 = empty_strided((128, 480, 14, 14), (94080, 196, 14, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__native_batch_norm_legit_functional_add_clone_fill_mul_sigmoid_sub_157.run(buf806, buf808, buf809, primals_85, primals_86, buf813, buf1130, 12042240, grid=grid(12042240), stream=stream0)
del primals_86
buf814 = empty_strided((120, 1, 3, 3), (9, 9, 3, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_42.run(primals_250, buf814, 1080, grid=grid(1080), stream=stream0)
del primals_250
buf815 = buf757; del buf757 # reuse
triton_poi_fused_split_with_sizes_158.run(buf813, buf815, 3010560, grid=grid(3010560), stream=stream0)
buf816 = extern_kernels.convolution(buf815, buf814, stride=(1, 1), padding=(1, 1), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=120, bias=None)
assert_size_stride(buf816, (128, 120, 14, 14), (23520, 196, 14, 1))
buf817 = empty_strided((120, 1, 5, 5), (25, 25, 5, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_159.run(primals_251, buf817, 3000, grid=grid(3000), stream=stream0)
del primals_251
buf818 = buf754; del buf754 # reuse
triton_poi_fused_split_with_sizes_160.run(buf813, buf818, 3010560, grid=grid(3010560), stream=stream0)
buf819 = extern_kernels.convolution(buf818, buf817, stride=(1, 1), padding=(2, 2), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=120, bias=None)
assert_size_stride(buf819, (128, 120, 14, 14), (23520, 196, 14, 1))
buf820 = empty_strided((120, 1, 7, 7), (49, 49, 7, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_161.run(primals_252, buf820, 5880, grid=grid(5880), stream=stream0)
del primals_252
buf821 = buf751; del buf751 # reuse
triton_poi_fused_split_with_sizes_162.run(buf813, buf821, 3010560, grid=grid(3010560), stream=stream0)
buf822 = extern_kernels.convolution(buf821, buf820, stride=(1, 1), padding=(3, 3), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=120, bias=None)
assert_size_stride(buf822, (128, 120, 14, 14), (23520, 196, 14, 1))
buf823 = empty_strided((120, 1, 9, 9), (81, 81, 9, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_163.run(primals_253, buf823, 9720, grid=grid(9720), stream=stream0)
del primals_253
buf824 = buf748; del buf748 # reuse
triton_poi_fused_split_with_sizes_164.run(buf813, buf824, 3010560, grid=grid(3010560), stream=stream0)
buf825 = extern_kernels.convolution(buf824, buf823, stride=(1, 1), padding=(4, 4), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=120, bias=None)
assert_size_stride(buf825, (128, 120, 14, 14), (23520, 196, 14, 1))
buf830 = buf813; del buf813 # reuse
buf826 = as_strided(buf830, (128, 120, 14, 14), (94080, 196, 14, 1)) # alias
triton_poi_fused_cat_165.run(buf816, buf826, 3010560, grid=grid(3010560), stream=stream0)
del buf816
buf827 = as_strided(buf830, (128, 120, 14, 14), (94080, 196, 14, 1), 23520) # alias
triton_poi_fused_cat_165.run(buf819, buf827, 3010560, grid=grid(3010560), stream=stream0)
del buf819
buf828 = as_strided(buf830, (128, 120, 14, 14), (94080, 196, 14, 1), 47040) # alias
triton_poi_fused_cat_165.run(buf822, buf828, 3010560, grid=grid(3010560), stream=stream0)
del buf822
buf829 = as_strided(buf830, (128, 120, 14, 14), (94080, 196, 14, 1), 70560) # alias
triton_poi_fused_cat_165.run(buf825, buf829, 3010560, grid=grid(3010560), stream=stream0)
del buf825
buf831 = buf809; del buf809 # reuse
buf832 = buf831; del buf831 # reuse
buf835 = empty_strided((480, ), (1, ), device='cuda', dtype=torch.float32)
buf833 = empty_strided((1, 480, 1, 1), (480, 1, 480, 480), device='cuda', dtype=torch.float32)
buf834 = empty_strided((480, ), (1, ), device='cuda', dtype=torch.float32)
buf836 = empty_strided((480, ), (1, ), device='cuda', dtype=torch.float32)
triton_red_fused__native_batch_norm_legit_functional_156.run(buf832, buf830, primals_436, primals_437, buf835, buf833, buf834, buf836, 480, 25088, grid=grid(480), stream=stream0)
del primals_436
del primals_437
buf837 = empty_strided((128, 480, 14, 14), (94080, 196, 14, 1), device='cuda', dtype=torch.float16)
buf839 = empty_strided((128, 480, 1, 1), (480, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_per_fused__native_batch_norm_legit_functional_mean_silu_166.run(buf830, buf832, buf833, primals_87, primals_88, buf837, buf839, 61440, 196, grid=grid(61440), stream=stream0)
del buf833
del primals_88
buf840 = empty_strided((80, 480, 1, 1), (480, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_167.run(primals_254, buf840, 38400, grid=grid(38400), stream=stream0)
del primals_254
buf841 = buf773; del buf773 # reuse
triton_poi_fused__to_copy_convolution_168.run(primals_255, buf841, 80, grid=grid(80), stream=stream0)
del primals_255
buf842 = extern_kernels.convolution(buf839, buf840, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None)
assert_size_stride(buf842, (128, 80, 1, 1), (80, 1, 1, 1))
buf843 = buf842; del buf842 # reuse
buf844 = empty_strided((128, 80, 1, 1), (80, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_convolution_silu_169.run(buf843, buf841, buf844, 10240, grid=grid(10240), stream=stream0)
buf845 = empty_strided((480, 80, 1, 1), (80, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_170.run(primals_256, buf845, 38400, grid=grid(38400), stream=stream0)
del primals_256
buf846 = buf778; del buf778 # reuse
triton_poi_fused__to_copy_convolution_171.run(primals_257, buf846, 480, grid=grid(480), stream=stream0)
del primals_257
buf847 = extern_kernels.convolution(buf844, buf845, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None)
assert_size_stride(buf847, (128, 480, 1, 1), (480, 1, 1, 1))
buf848 = buf847; del buf847 # reuse
triton_poi_fused__to_copy_convolution_172.run(buf848, buf846, 61440, grid=grid(61440), stream=stream0)
del buf846
buf849 = empty_strided((128, 480, 14, 14), (94080, 196, 14, 1), device='cuda', dtype=torch.float16)
triton_poi_fused_mul_sigmoid_silu_173.run(buf837, buf848, buf849, 12042240, grid=grid(12042240), stream=stream0)
buf850 = buf803; del buf803 # reuse
triton_poi_fused_split_with_sizes_174.run(buf849, buf850, 6021120, grid=grid(6021120), stream=stream0)
buf851 = buf801; del buf801 # reuse
triton_poi_fused_split_with_sizes_175.run(buf849, buf851, 6021120, grid=grid(6021120), stream=stream0)
del buf849
buf852 = empty_strided((80, 240, 1, 1), (240, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_176.run(primals_258, buf852, 19200, grid=grid(19200), stream=stream0)
del primals_258
buf853 = extern_kernels.convolution(buf850, buf852, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None)
assert_size_stride(buf853, (128, 80, 14, 14), (15680, 196, 14, 1))
buf854 = empty_strided((80, 240, 1, 1), (240, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_176.run(primals_259, buf854, 19200, grid=grid(19200), stream=stream0)
del primals_259
buf855 = extern_kernels.convolution(buf851, buf854, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None)
assert_size_stride(buf855, (128, 80, 14, 14), (15680, 196, 14, 1))
buf858 = empty_strided((128, 160, 14, 14), (31360, 196, 14, 1), device='cuda', dtype=torch.float16)
buf856 = as_strided(buf858, (128, 80, 14, 14), (31360, 196, 14, 1)) # alias
triton_poi_fused_cat_177.run(buf853, buf856, 2007040, grid=grid(2007040), stream=stream0)
del buf853
buf857 = as_strided(buf858, (128, 80, 14, 14), (31360, 196, 14, 1), 15680) # alias
triton_poi_fused_cat_177.run(buf855, buf857, 2007040, grid=grid(2007040), stream=stream0)
del buf855
buf859 = buf794; del buf794 # reuse
triton_red_fused__native_batch_norm_legit_functional_149.run(buf858, buf859, 640, 6272, grid=grid(640), stream=stream0)
buf860 = buf795; del buf795 # reuse
buf861 = buf860; del buf860 # reuse
buf865 = empty_strided((160, ), (1, ), device='cuda', dtype=torch.float32)
triton_per_fused__native_batch_norm_legit_functional_150.run(buf861, buf859, primals_439, buf865, 160, 4, grid=grid(160), stream=stream0)
del primals_439
buf862 = buf859; del buf859 # reuse
triton_red_fused__native_batch_norm_legit_functional_151.run(buf858, buf861, buf862, 640, 6272, grid=grid(640), stream=stream0)
buf863 = empty_strided((1, 160, 1, 1), (160, 1, 160, 160), device='cuda', dtype=torch.float32)
buf864 = empty_strided((160, ), (1, ), device='cuda', dtype=torch.float32)
buf866 = empty_strided((160, ), (1, ), device='cuda', dtype=torch.float32)
triton_per_fused__native_batch_norm_legit_functional_152.run(buf862, primals_440, buf863, buf864, buf866, 160, 4, grid=grid(160), stream=stream0)
del buf862
del primals_440
buf867 = empty_strided((128, 160, 14, 14), (31360, 196, 14, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__native_batch_norm_legit_functional_add_178.run(buf858, buf861, buf863, primals_89, primals_90, buf799, buf867, 4014080, grid=grid(4014080), stream=stream0)
del buf863
del primals_90
buf868 = empty_strided((960, 160, 1, 1), (160, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_179.run(primals_260, buf868, 153600, grid=grid(153600), stream=stream0)
del primals_260
buf869 = extern_kernels.convolution(buf867, buf868, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None)
assert_size_stride(buf869, (128, 960, 14, 14), (188160, 196, 14, 1))
buf870 = empty_strided((1, 960, 1, 1), (960, 1, 960, 960), device='cuda', dtype=torch.float32)
buf871 = buf870; del buf870 # reuse
buf874 = empty_strided((960, ), (1, ), device='cuda', dtype=torch.float32)
buf872 = empty_strided((1, 960, 1, 1), (960, 1, 960, 960), device='cuda', dtype=torch.float32)
buf873 = empty_strided((960, ), (1, ), device='cuda', dtype=torch.float32)
buf875 = empty_strided((960, ), (1, ), device='cuda', dtype=torch.float32)
triton_red_fused__native_batch_norm_legit_functional_180.run(buf871, buf869, primals_442, primals_443, buf874, buf872, buf873, buf875, 960, 25088, grid=grid(960), stream=stream0)
del primals_442
del primals_443
buf876 = empty_strided((128, 960, 14, 14), (188160, 196, 14, 1), device='cuda', dtype=torch.float16)
buf1128 = empty_strided((128, 960, 14, 14), (188160, 196, 14, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__native_batch_norm_legit_functional_add_clone_fill_mul_sigmoid_sub_181.run(buf869, buf871, buf872, primals_91, primals_92, buf876, buf1128, 24084480, grid=grid(24084480), stream=stream0)
del primals_92
buf877 = empty_strided((240, 1, 3, 3), (9, 9, 3, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_182.run(primals_261, buf877, 2160, grid=grid(2160), stream=stream0)
del primals_261
buf878 = as_strided(buf150, (128, 240, 14, 14), (47040, 196, 14, 1)); del buf150 # reuse
triton_poi_fused_split_with_sizes_183.run(buf876, buf878, 6021120, grid=grid(6021120), stream=stream0)
buf879 = extern_kernels.convolution(buf878, buf877, stride=(2, 2), padding=(1, 1), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=240, bias=None)
assert_size_stride(buf879, (128, 240, 7, 7), (11760, 49, 7, 1))
buf880 = empty_strided((240, 1, 5, 5), (25, 25, 5, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_184.run(primals_262, buf880, 6000, grid=grid(6000), stream=stream0)
del primals_262
buf881 = as_strided(buf147, (128, 240, 14, 14), (47040, 196, 14, 1)); del buf147 # reuse
triton_poi_fused_split_with_sizes_185.run(buf876, buf881, 6021120, grid=grid(6021120), stream=stream0)
buf882 = extern_kernels.convolution(buf881, buf880, stride=(2, 2), padding=(2, 2), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=240, bias=None)
assert_size_stride(buf882, (128, 240, 7, 7), (11760, 49, 7, 1))
buf883 = empty_strided((240, 1, 7, 7), (49, 49, 7, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_186.run(primals_263, buf883, 11760, grid=grid(11760), stream=stream0)
del primals_263
buf884 = as_strided(buf144, (128, 240, 14, 14), (47040, 196, 14, 1)); del buf144 # reuse
triton_poi_fused_split_with_sizes_187.run(buf876, buf884, 6021120, grid=grid(6021120), stream=stream0)
buf885 = extern_kernels.convolution(buf884, buf883, stride=(2, 2), padding=(3, 3), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=240, bias=None)
assert_size_stride(buf885, (128, 240, 7, 7), (11760, 49, 7, 1))
buf886 = empty_strided((240, 1, 9, 9), (81, 81, 9, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_188.run(primals_264, buf886, 19440, grid=grid(19440), stream=stream0)
del primals_264
buf887 = as_strided(buf141, (128, 240, 14, 14), (47040, 196, 14, 1)); del buf141 # reuse
triton_poi_fused_split_with_sizes_189.run(buf876, buf887, 6021120, grid=grid(6021120), stream=stream0)
del buf876
buf888 = extern_kernels.convolution(buf887, buf886, stride=(2, 2), padding=(4, 4), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=240, bias=None)
assert_size_stride(buf888, (128, 240, 7, 7), (11760, 49, 7, 1))
buf893 = empty_strided((128, 960, 7, 7), (47040, 49, 7, 1), device='cuda', dtype=torch.float16)
buf889 = as_strided(buf893, (128, 240, 7, 7), (47040, 49, 7, 1)) # alias
triton_poi_fused_cat_190.run(buf879, buf889, 1505280, grid=grid(1505280), stream=stream0)
del buf879
buf890 = as_strided(buf893, (128, 240, 7, 7), (47040, 49, 7, 1), 11760) # alias
triton_poi_fused_cat_190.run(buf882, buf890, 1505280, grid=grid(1505280), stream=stream0)
del buf882
buf891 = as_strided(buf893, (128, 240, 7, 7), (47040, 49, 7, 1), 23520) # alias
triton_poi_fused_cat_190.run(buf885, buf891, 1505280, grid=grid(1505280), stream=stream0)
del buf885
buf892 = as_strided(buf893, (128, 240, 7, 7), (47040, 49, 7, 1), 35280) # alias
triton_poi_fused_cat_190.run(buf888, buf892, 1505280, grid=grid(1505280), stream=stream0)
del buf888
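# Downsampling variant of the mixed-kernel block: the same four-way channel
# split (960 -> 4 x 240) with 3x3/5x5/7x7/9x9 depthwise kernels, but with
# stride=(2, 2), taking the feature map from 14x14 to 7x7 (buf893 is
# (128, 960, 7, 7)).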
buf894 = buf872; del buf872 # reuse
buf895 = buf894; del buf894 # reuse
buf898 = empty_strided((960, ), (1, ), device='cuda', dtype=torch.float32)
buf896 = empty_strided((1, 960, 1, 1), (960, 1, 960, 960), device='cuda', dtype=torch.float32)
buf897 = empty_strided((960, ), (1, ), device='cuda', dtype=torch.float32)
buf899 = empty_strided((960, ), (1, ), device='cuda', dtype=torch.float32)
triton_red_fused__native_batch_norm_legit_functional_191.run(buf895, buf893, primals_445, primals_446, buf898, buf896, buf897, buf899, 960, 6272, grid=grid(960), stream=stream0)
del primals_445
del primals_446
buf900 = empty_strided((128, 960, 7, 7), (47040, 49, 7, 1), device='cuda', dtype=torch.float16)
buf902 = empty_strided((128, 960, 1, 1), (960, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_per_fused__native_batch_norm_legit_functional_mean_silu_192.run(buf893, buf895, buf896, primals_93, primals_94, buf900, buf902, 122880, 49, grid=grid(122880), stream=stream0)
del buf896
del primals_94
buf903 = empty_strided((80, 960, 1, 1), (960, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_193.run(primals_265, buf903, 76800, grid=grid(76800), stream=stream0)
del primals_265
buf904 = buf841; del buf841 # reuse
triton_poi_fused__to_copy_convolution_168.run(primals_266, buf904, 80, grid=grid(80), stream=stream0)
del primals_266
buf905 = extern_kernels.convolution(buf902, buf903, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None)
assert_size_stride(buf905, (128, 80, 1, 1), (80, 1, 1, 1))
buf906 = buf905; del buf905 # reuse
buf907 = empty_strided((128, 80, 1, 1), (80, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_convolution_silu_169.run(buf906, buf904, buf907, 10240, grid=grid(10240), stream=stream0)
del buf904
buf908 = empty_strided((960, 80, 1, 1), (80, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_194.run(primals_267, buf908, 76800, grid=grid(76800), stream=stream0)
del primals_267
buf909 = empty_strided((960, ), (1, ), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_convolution_195.run(primals_268, buf909, 960, grid=grid(960), stream=stream0)
del primals_268
buf910 = extern_kernels.convolution(buf907, buf908, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None)
assert_size_stride(buf910, (128, 960, 1, 1), (960, 1, 1, 1))
buf911 = buf910; del buf910 # reuse
triton_poi_fused__to_copy_convolution_196.run(buf911, buf909, 122880, grid=grid(122880), stream=stream0)
del buf909
buf912 = empty_strided((128, 960, 7, 7), (47040, 49, 7, 1), device='cuda', dtype=torch.float16)
triton_poi_fused_mul_sigmoid_silu_197.run(buf900, buf911, buf912, 6021120, grid=grid(6021120), stream=stream0)
buf913 = empty_strided((264, 960, 1, 1), (960, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_198.run(primals_269, buf913, 253440, grid=grid(253440), stream=stream0)
del primals_269
buf914 = extern_kernels.convolution(buf912, buf913, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None)
assert_size_stride(buf914, (128, 264, 7, 7), (12936, 49, 7, 1))
buf915 = empty_strided((1, 264, 1, 1), (264, 1, 264, 264), device='cuda', dtype=torch.float32)
buf916 = buf915; del buf915 # reuse
buf919 = empty_strided((264, ), (1, ), device='cuda', dtype=torch.float32)
buf917 = empty_strided((1, 264, 1, 1), (264, 1, 264, 264), device='cuda', dtype=torch.float32)
buf918 = empty_strided((264, ), (1, ), device='cuda', dtype=torch.float32)
buf920 = empty_strided((264, ), (1, ), device='cuda', dtype=torch.float32)
triton_red_fused__native_batch_norm_legit_functional_199.run(buf916, buf914, primals_448, primals_449, buf919, buf917, buf918, buf920, 264, 6272, grid=grid(264), stream=stream0)
del primals_448
del primals_449
buf921 = empty_strided((128, 264, 7, 7), (12936, 49, 7, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__native_batch_norm_legit_functional_200.run(buf914, buf916, buf917, primals_95, primals_96, buf921, 1655808, grid=grid(1655808), stream=stream0)
del primals_96
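# Stage transition: the 7x7 features are projected 960 -> 264 by a plain 1x1
# conv + batch norm with no activation and no residual add (the channel count
# changes here, so there is no matching skip input).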
buf922 = empty_strided((1584, 264, 1, 1), (264, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_201.run(primals_270, buf922, 418176, grid=grid(418176), stream=stream0)
del primals_270
buf923 = extern_kernels.convolution(buf921, buf922, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None)
assert_size_stride(buf923, (128, 1584, 7, 7), (77616, 49, 7, 1))
buf924 = empty_strided((1, 1584, 1, 1), (1584, 1, 1584, 1584), device='cuda', dtype=torch.float32)
buf925 = buf924; del buf924 # reuse
buf928 = empty_strided((1584, ), (1, ), device='cuda', dtype=torch.float32)
buf926 = empty_strided((1, 1584, 1, 1), (1584, 1, 1584, 1584), device='cuda', dtype=torch.float32)
buf927 = empty_strided((1584, ), (1, ), device='cuda', dtype=torch.float32)
buf929 = empty_strided((1584, ), (1, ), device='cuda', dtype=torch.float32)
triton_red_fused__native_batch_norm_legit_functional_202.run(buf925, buf923, primals_451, primals_452, buf928, buf926, buf927, buf929, 1584, 6272, grid=grid(1584), stream=stream0)
del primals_451
del primals_452
buf930 = empty_strided((128, 1584, 7, 7), (77616, 49, 7, 1), device='cuda', dtype=torch.float16)
buf1126 = empty_strided((128, 1584, 7, 7), (77616, 49, 7, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__native_batch_norm_legit_functional_add_clone_fill_mul_sigmoid_sub_203.run(buf923, buf925, buf926, primals_97, primals_98, buf930, buf1126, 9934848, grid=grid(9934848), stream=stream0)
del primals_98
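# Next block: 1x1 expansion 264 -> 1584 (6x expansion ratio) with batch norm
# and the fused SiLU kernel (buf1126 again saved for backward), followed by
# another four-way mixed depthwise convolution over 396-channel groups.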
buf931 = empty_strided((396, 1, 3, 3), (9, 9, 3, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_204.run(primals_271, buf931, 3564, grid=grid(3564), stream=stream0)
del primals_271
buf932 = empty_strided((128, 396, 7, 7), (19404, 49, 7, 1), device='cuda', dtype=torch.float16)
triton_poi_fused_split_with_sizes_205.run(buf930, buf932, 2483712, grid=grid(2483712), stream=stream0)
buf933 = extern_kernels.convolution(buf932, buf931, stride=(1, 1), padding=(1, 1), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=396, bias=None)
assert_size_stride(buf933, (128, 396, 7, 7), (19404, 49, 7, 1))
buf934 = empty_strided((396, 1, 5, 5), (25, 25, 5, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_206.run(primals_272, buf934, 9900, grid=grid(9900), stream=stream0)
del primals_272
buf935 = empty_strided((128, 396, 7, 7), (19404, 49, 7, 1), device='cuda', dtype=torch.float16)
triton_poi_fused_split_with_sizes_207.run(buf930, buf935, 2483712, grid=grid(2483712), stream=stream0)
buf936 = extern_kernels.convolution(buf935, buf934, stride=(1, 1), padding=(2, 2), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=396, bias=None)
assert_size_stride(buf936, (128, 396, 7, 7), (19404, 49, 7, 1))
buf937 = empty_strided((396, 1, 7, 7), (49, 49, 7, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_208.run(primals_273, buf937, 19404, grid=grid(19404), stream=stream0)
del primals_273
buf938 = empty_strided((128, 396, 7, 7), (19404, 49, 7, 1), device='cuda', dtype=torch.float16)
triton_poi_fused_split_with_sizes_209.run(buf930, buf938, 2483712, grid=grid(2483712), stream=stream0)
buf939 = extern_kernels.convolution(buf938, buf937, stride=(1, 1), padding=(3, 3), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=396, bias=None)
assert_size_stride(buf939, (128, 396, 7, 7), (19404, 49, 7, 1))
buf940 = empty_strided((396, 1, 9, 9), (81, 81, 9, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_210.run(primals_274, buf940, 32076, grid=grid(32076), stream=stream0)
del primals_274
buf941 = empty_strided((128, 396, 7, 7), (19404, 49, 7, 1), device='cuda', dtype=torch.float16)
triton_poi_fused_split_with_sizes_211.run(buf930, buf941, 2483712, grid=grid(2483712), stream=stream0)
buf942 = extern_kernels.convolution(buf941, buf940, stride=(1, 1), padding=(4, 4), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=396, bias=None)
assert_size_stride(buf942, (128, 396, 7, 7), (19404, 49, 7, 1))
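# The concatenation avoids a separate cat buffer: buf947 reuses buf930's
# storage, and buf943..buf946 are as_strided views into it at channel offsets
# 0, 19404, 38808, 58212 (multiples of 396*7*7), so each branch's cat kernel
# writes its slice directly into the shared output.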
buf947 = buf930; del buf930 # reuse
buf943 = as_strided(buf947, (128, 396, 7, 7), (77616, 49, 7, 1)) # alias
triton_poi_fused_cat_212.run(buf933, buf943, 2483712, grid=grid(2483712), stream=stream0)
buf944 = as_strided(buf947, (128, 396, 7, 7), (77616, 49, 7, 1), 19404) # alias
triton_poi_fused_cat_213.run(buf936, buf944, 2483712, grid=grid(2483712), stream=stream0)
buf945 = as_strided(buf947, (128, 396, 7, 7), (77616, 49, 7, 1), 38808) # alias
triton_poi_fused_cat_213.run(buf939, buf945, 2483712, grid=grid(2483712), stream=stream0)
buf946 = as_strided(buf947, (128, 396, 7, 7), (77616, 49, 7, 1), 58212) # alias
triton_poi_fused_cat_213.run(buf942, buf946, 2483712, grid=grid(2483712), stream=stream0)
buf948 = buf926; del buf926 # reuse
buf949 = buf948; del buf948 # reuse
buf952 = empty_strided((1584, ), (1, ), device='cuda', dtype=torch.float32)
buf950 = empty_strided((1, 1584, 1, 1), (1584, 1, 1584, 1584), device='cuda', dtype=torch.float32)
buf951 = empty_strided((1584, ), (1, ), device='cuda', dtype=torch.float32)
buf953 = empty_strided((1584, ), (1, ), device='cuda', dtype=torch.float32)
triton_red_fused__native_batch_norm_legit_functional_202.run(buf949, buf947, primals_454, primals_455, buf952, buf950, buf951, buf953, 1584, 6272, grid=grid(1584), stream=stream0)
del primals_454
del primals_455
buf954 = empty_strided((128, 1584, 7, 7), (77616, 49, 7, 1), device='cuda', dtype=torch.float16)
buf956 = empty_strided((128, 1584, 1, 1), (1584, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_per_fused__native_batch_norm_legit_functional_mean_silu_214.run(buf947, buf949, buf950, primals_99, primals_100, buf954, buf956, 202752, 49, grid=grid(202752), stream=stream0)
del primals_100
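# Squeeze-and-excitation block: kernel 214 above fused the batch norm, the SiLU
# activation, and a global average pool, so buf956 already holds the per-channel
# mean with shape (128, 1584, 1, 1). It is squeezed 1584 -> 132 by a 1x1 conv
# plus SiLU (kernel 217), expanded back 132 -> 1584 (kernel 220), and applied as
# a sigmoid gate on buf954 (kernel 221). Roughly, in eager terms (hypothetical
# names; assumes `import torch.nn.functional as F`):
#
#     s = x.mean(dim=(2, 3), keepdim=True)          # squeeze
#     s = F.silu(F.conv2d(s, w_reduce, b_reduce))   # 1584 -> 132
#     s = F.conv2d(s, w_expand, b_expand)           # 132 -> 1584
#     y = x * torch.sigmoid(s)                      # excite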
buf957 = empty_strided((132, 1584, 1, 1), (1584, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_215.run(primals_275, buf957, 209088, grid=grid(209088), stream=stream0)
del primals_275
buf958 = empty_strided((132, ), (1, ), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_convolution_216.run(primals_276, buf958, 132, grid=grid(132), stream=stream0)
del primals_276
buf959 = extern_kernels.convolution(buf956, buf957, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None)
assert_size_stride(buf959, (128, 132, 1, 1), (132, 1, 1, 1))
buf960 = buf959; del buf959 # reuse
buf961 = empty_strided((128, 132, 1, 1), (132, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_convolution_silu_217.run(buf960, buf958, buf961, 16896, grid=grid(16896), stream=stream0)
buf962 = empty_strided((1584, 132, 1, 1), (132, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_218.run(primals_277, buf962, 209088, grid=grid(209088), stream=stream0)
del primals_277
buf963 = empty_strided((1584, ), (1, ), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_convolution_219.run(primals_278, buf963, 1584, grid=grid(1584), stream=stream0)
del primals_278
buf964 = extern_kernels.convolution(buf961, buf962, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None)
assert_size_stride(buf964, (128, 1584, 1, 1), (1584, 1, 1, 1))
buf965 = buf964; del buf964 # reuse
triton_poi_fused__to_copy_convolution_220.run(buf965, buf963, 202752, grid=grid(202752), stream=stream0)
buf966 = empty_strided((128, 1584, 7, 7), (77616, 49, 7, 1), device='cuda', dtype=torch.float16)
triton_poi_fused_mul_sigmoid_silu_221.run(buf954, buf965, buf966, 9934848, grid=grid(9934848), stream=stream0)
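# Projection back to 264 channels, implemented as a 2-group pointwise conv:
# buf966 is split into two 792-channel halves, each mapped 792 -> 132 by its
# own 1x1 convolution, and the two 132-channel outputs are concatenated --
# equivalent to a single conv2d with groups=2 from 1584 to 264 channels.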
buf967 = empty_strided((128, 792, 7, 7), (38808, 49, 7, 1), device='cuda', dtype=torch.float16)
triton_poi_fused_split_with_sizes_222.run(buf966, buf967, 4967424, grid=grid(4967424), stream=stream0)
buf968 = empty_strided((128, 792, 7, 7), (38808, 49, 7, 1), device='cuda', dtype=torch.float16)
triton_poi_fused_split_with_sizes_223.run(buf966, buf968, 4967424, grid=grid(4967424), stream=stream0)
buf969 = empty_strided((132, 792, 1, 1), (792, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_224.run(primals_279, buf969, 104544, grid=grid(104544), stream=stream0)
del primals_279
buf970 = extern_kernels.convolution(buf967, buf969, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None)
assert_size_stride(buf970, (128, 132, 7, 7), (6468, 49, 7, 1))
buf971 = empty_strided((132, 792, 1, 1), (792, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_224.run(primals_280, buf971, 104544, grid=grid(104544), stream=stream0)
del primals_280
buf972 = extern_kernels.convolution(buf968, buf971, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None)
assert_size_stride(buf972, (128, 132, 7, 7), (6468, 49, 7, 1))
buf975 = empty_strided((128, 264, 7, 7), (12936, 49, 7, 1), device='cuda', dtype=torch.float16)
buf973 = as_strided(buf975, (128, 132, 7, 7), (12936, 49, 7, 1)) # alias
triton_poi_fused_cat_225.run(buf970, buf973, 827904, grid=grid(827904), stream=stream0)
del buf970
buf974 = as_strided(buf975, (128, 132, 7, 7), (12936, 49, 7, 1), 6468) # alias
triton_poi_fused_cat_226.run(buf972, buf974, 827904, grid=grid(827904), stream=stream0)
del buf972
buf976 = buf917; del buf917 # reuse
buf977 = buf976; del buf976 # reuse
buf980 = empty_strided((264, ), (1, ), device='cuda', dtype=torch.float32)
buf978 = empty_strided((1, 264, 1, 1), (264, 1, 264, 264), device='cuda', dtype=torch.float32)
buf979 = empty_strided((264, ), (1, ), device='cuda', dtype=torch.float32)
buf981 = empty_strided((264, ), (1, ), device='cuda', dtype=torch.float32)
triton_red_fused__native_batch_norm_legit_functional_199.run(buf977, buf975, primals_457, primals_458, buf980, buf978, buf979, buf981, 264, 6272, grid=grid(264), stream=stream0)
del primals_457
del primals_458
buf982 = empty_strided((128, 264, 7, 7), (12936, 49, 7, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__native_batch_norm_legit_functional_add_227.run(buf975, buf977, buf978, primals_101, primals_102, buf921, buf982, 1655808, grid=grid(1655808), stream=stream0)
del primals_102
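# The residual add just above (kernel 227) fused the projection batch norm with
# the skip connection from buf921. The same inverted-residual block now repeats
# twice more (expansion 264 -> 1584, mixed depthwise convs, squeeze-excite,
# grouped projection back to 264, ending at buf1043 and then buf1104), each
# time feeding the previous block's output in as the skip.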
buf983 = empty_strided((1584, 264, 1, 1), (264, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_201.run(primals_281, buf983, 418176, grid=grid(418176), stream=stream0)
del primals_281
buf984 = extern_kernels.convolution(buf982, buf983, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None)
assert_size_stride(buf984, (128, 1584, 7, 7), (77616, 49, 7, 1))
buf985 = buf950; del buf950 # reuse
buf986 = buf985; del buf985 # reuse
buf989 = empty_strided((1584, ), (1, ), device='cuda', dtype=torch.float32)
buf987 = empty_strided((1, 1584, 1, 1), (1584, 1, 1584, 1584), device='cuda', dtype=torch.float32)
buf988 = empty_strided((1584, ), (1, ), device='cuda', dtype=torch.float32)
buf990 = empty_strided((1584, ), (1, ), device='cuda', dtype=torch.float32)
triton_red_fused__native_batch_norm_legit_functional_202.run(buf986, buf984, primals_460, primals_461, buf989, buf987, buf988, buf990, 1584, 6272, grid=grid(1584), stream=stream0)
del primals_460
del primals_461
buf991 = buf966; del buf966 # reuse
buf1124 = empty_strided((128, 1584, 7, 7), (77616, 49, 7, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__native_batch_norm_legit_functional_add_clone_fill_mul_sigmoid_sub_203.run(buf984, buf986, buf987, primals_103, primals_104, buf991, buf1124, 9934848, grid=grid(9934848), stream=stream0)
del primals_104
buf992 = empty_strided((396, 1, 3, 3), (9, 9, 3, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_204.run(primals_282, buf992, 3564, grid=grid(3564), stream=stream0)
del primals_282
buf993 = buf942; del buf942 # reuse
triton_poi_fused_split_with_sizes_205.run(buf991, buf993, 2483712, grid=grid(2483712), stream=stream0)
buf994 = extern_kernels.convolution(buf993, buf992, stride=(1, 1), padding=(1, 1), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=396, bias=None)
assert_size_stride(buf994, (128, 396, 7, 7), (19404, 49, 7, 1))
buf995 = empty_strided((396, 1, 5, 5), (25, 25, 5, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_206.run(primals_283, buf995, 9900, grid=grid(9900), stream=stream0)
del primals_283
buf996 = buf939; del buf939 # reuse
triton_poi_fused_split_with_sizes_207.run(buf991, buf996, 2483712, grid=grid(2483712), stream=stream0)
buf997 = extern_kernels.convolution(buf996, buf995, stride=(1, 1), padding=(2, 2), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=396, bias=None)
assert_size_stride(buf997, (128, 396, 7, 7), (19404, 49, 7, 1))
buf998 = empty_strided((396, 1, 7, 7), (49, 49, 7, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_208.run(primals_284, buf998, 19404, grid=grid(19404), stream=stream0)
del primals_284
buf999 = buf936; del buf936 # reuse
triton_poi_fused_split_with_sizes_209.run(buf991, buf999, 2483712, grid=grid(2483712), stream=stream0)
buf1000 = extern_kernels.convolution(buf999, buf998, stride=(1, 1), padding=(3, 3), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=396, bias=None)
assert_size_stride(buf1000, (128, 396, 7, 7), (19404, 49, 7, 1))
buf1001 = empty_strided((396, 1, 9, 9), (81, 81, 9, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_210.run(primals_285, buf1001, 32076, grid=grid(32076), stream=stream0)
del primals_285
buf1002 = buf933; del buf933 # reuse
triton_poi_fused_split_with_sizes_211.run(buf991, buf1002, 2483712, grid=grid(2483712), stream=stream0)
buf1003 = extern_kernels.convolution(buf1002, buf1001, stride=(1, 1), padding=(4, 4), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=396, bias=None)
assert_size_stride(buf1003, (128, 396, 7, 7), (19404, 49, 7, 1))
buf1008 = buf991; del buf991 # reuse
buf1004 = as_strided(buf1008, (128, 396, 7, 7), (77616, 49, 7, 1)) # alias
triton_poi_fused_cat_212.run(buf994, buf1004, 2483712, grid=grid(2483712), stream=stream0)
buf1005 = as_strided(buf1008, (128, 396, 7, 7), (77616, 49, 7, 1), 19404) # alias
triton_poi_fused_cat_213.run(buf997, buf1005, 2483712, grid=grid(2483712), stream=stream0)
buf1006 = as_strided(buf1008, (128, 396, 7, 7), (77616, 49, 7, 1), 38808) # alias
triton_poi_fused_cat_213.run(buf1000, buf1006, 2483712, grid=grid(2483712), stream=stream0)
buf1007 = as_strided(buf1008, (128, 396, 7, 7), (77616, 49, 7, 1), 58212) # alias
triton_poi_fused_cat_213.run(buf1003, buf1007, 2483712, grid=grid(2483712), stream=stream0)
buf1009 = buf987; del buf987 # reuse
buf1010 = buf1009; del buf1009 # reuse
buf1013 = empty_strided((1584, ), (1, ), device='cuda', dtype=torch.float32)
buf1011 = empty_strided((1, 1584, 1, 1), (1584, 1, 1584, 1584), device='cuda', dtype=torch.float32)
buf1012 = empty_strided((1584, ), (1, ), device='cuda', dtype=torch.float32)
buf1014 = empty_strided((1584, ), (1, ), device='cuda', dtype=torch.float32)
triton_red_fused__native_batch_norm_legit_functional_202.run(buf1010, buf1008, primals_463, primals_464, buf1013, buf1011, buf1012, buf1014, 1584, 6272, grid=grid(1584), stream=stream0)
del primals_463
del primals_464
buf1015 = empty_strided((128, 1584, 7, 7), (77616, 49, 7, 1), device='cuda', dtype=torch.float16)
buf1017 = empty_strided((128, 1584, 1, 1), (1584, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_per_fused__native_batch_norm_legit_functional_mean_silu_214.run(buf1008, buf1010, buf1011, primals_105, primals_106, buf1015, buf1017, 202752, 49, grid=grid(202752), stream=stream0)
del primals_106
buf1018 = empty_strided((132, 1584, 1, 1), (1584, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_215.run(primals_286, buf1018, 209088, grid=grid(209088), stream=stream0)
del primals_286
buf1019 = buf958; del buf958 # reuse
triton_poi_fused__to_copy_convolution_216.run(primals_287, buf1019, 132, grid=grid(132), stream=stream0)
del primals_287
buf1020 = extern_kernels.convolution(buf1017, buf1018, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None)
assert_size_stride(buf1020, (128, 132, 1, 1), (132, 1, 1, 1))
buf1021 = buf1020; del buf1020 # reuse
buf1022 = empty_strided((128, 132, 1, 1), (132, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_convolution_silu_217.run(buf1021, buf1019, buf1022, 16896, grid=grid(16896), stream=stream0)
buf1023 = empty_strided((1584, 132, 1, 1), (132, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_218.run(primals_288, buf1023, 209088, grid=grid(209088), stream=stream0)
del primals_288
buf1024 = buf963; del buf963 # reuse
triton_poi_fused__to_copy_convolution_219.run(primals_289, buf1024, 1584, grid=grid(1584), stream=stream0)
del primals_289
buf1025 = extern_kernels.convolution(buf1022, buf1023, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None)
assert_size_stride(buf1025, (128, 1584, 1, 1), (1584, 1, 1, 1))
buf1026 = buf1025; del buf1025 # reuse
triton_poi_fused__to_copy_convolution_220.run(buf1026, buf1024, 202752, grid=grid(202752), stream=stream0)
buf1027 = empty_strided((128, 1584, 7, 7), (77616, 49, 7, 1), device='cuda', dtype=torch.float16)
triton_poi_fused_mul_sigmoid_silu_221.run(buf1015, buf1026, buf1027, 9934848, grid=grid(9934848), stream=stream0)
buf1028 = empty_strided((128, 792, 7, 7), (38808, 49, 7, 1), device='cuda', dtype=torch.float16)
triton_poi_fused_split_with_sizes_222.run(buf1027, buf1028, 4967424, grid=grid(4967424), stream=stream0)
buf1029 = empty_strided((128, 792, 7, 7), (38808, 49, 7, 1), device='cuda', dtype=torch.float16)
triton_poi_fused_split_with_sizes_223.run(buf1027, buf1029, 4967424, grid=grid(4967424), stream=stream0)
buf1030 = empty_strided((132, 792, 1, 1), (792, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_224.run(primals_290, buf1030, 104544, grid=grid(104544), stream=stream0)
del primals_290
buf1031 = extern_kernels.convolution(buf1028, buf1030, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None)
assert_size_stride(buf1031, (128, 132, 7, 7), (6468, 49, 7, 1))
buf1032 = empty_strided((132, 792, 1, 1), (792, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_224.run(primals_291, buf1032, 104544, grid=grid(104544), stream=stream0)
del primals_291
buf1033 = extern_kernels.convolution(buf1029, buf1032, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None)
assert_size_stride(buf1033, (128, 132, 7, 7), (6468, 49, 7, 1))
buf1036 = empty_strided((128, 264, 7, 7), (12936, 49, 7, 1), device='cuda', dtype=torch.float16)
buf1034 = as_strided(buf1036, (128, 132, 7, 7), (12936, 49, 7, 1)) # alias
triton_poi_fused_cat_225.run(buf1031, buf1034, 827904, grid=grid(827904), stream=stream0)
del buf1031
buf1035 = as_strided(buf1036, (128, 132, 7, 7), (12936, 49, 7, 1), 6468) # alias
triton_poi_fused_cat_226.run(buf1033, buf1035, 827904, grid=grid(827904), stream=stream0)
del buf1033
buf1037 = buf978; del buf978 # reuse
buf1038 = buf1037; del buf1037 # reuse
buf1041 = empty_strided((264, ), (1, ), device='cuda', dtype=torch.float32)
buf1039 = empty_strided((1, 264, 1, 1), (264, 1, 264, 264), device='cuda', dtype=torch.float32)
buf1040 = empty_strided((264, ), (1, ), device='cuda', dtype=torch.float32)
buf1042 = empty_strided((264, ), (1, ), device='cuda', dtype=torch.float32)
triton_red_fused__native_batch_norm_legit_functional_199.run(buf1038, buf1036, primals_466, primals_467, buf1041, buf1039, buf1040, buf1042, 264, 6272, grid=grid(264), stream=stream0)
del primals_466
del primals_467
buf1043 = empty_strided((128, 264, 7, 7), (12936, 49, 7, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__native_batch_norm_legit_functional_add_227.run(buf1036, buf1038, buf1039, primals_107, primals_108, buf982, buf1043, 1655808, grid=grid(1655808), stream=stream0)
del primals_108
buf1044 = empty_strided((1584, 264, 1, 1), (264, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_201.run(primals_292, buf1044, 418176, grid=grid(418176), stream=stream0)
del primals_292
buf1045 = extern_kernels.convolution(buf1043, buf1044, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None)
assert_size_stride(buf1045, (128, 1584, 7, 7), (77616, 49, 7, 1))
buf1046 = buf1011; del buf1011 # reuse
buf1047 = buf1046; del buf1046 # reuse
buf1050 = empty_strided((1584, ), (1, ), device='cuda', dtype=torch.float32)
buf1048 = empty_strided((1, 1584, 1, 1), (1584, 1, 1584, 1584), device='cuda', dtype=torch.float32)
buf1049 = empty_strided((1584, ), (1, ), device='cuda', dtype=torch.float32)
buf1051 = empty_strided((1584, ), (1, ), device='cuda', dtype=torch.float32)
triton_red_fused__native_batch_norm_legit_functional_202.run(buf1047, buf1045, primals_469, primals_470, buf1050, buf1048, buf1049, buf1051, 1584, 6272, grid=grid(1584), stream=stream0)
del primals_469
del primals_470
buf1052 = buf1027; del buf1027 # reuse
buf1122 = empty_strided((128, 1584, 7, 7), (77616, 49, 7, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__native_batch_norm_legit_functional_add_clone_fill_mul_sigmoid_sub_203.run(buf1045, buf1047, buf1048, primals_109, primals_110, buf1052, buf1122, 9934848, grid=grid(9934848), stream=stream0)
del primals_110
buf1053 = empty_strided((396, 1, 3, 3), (9, 9, 3, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_204.run(primals_293, buf1053, 3564, grid=grid(3564), stream=stream0)
del primals_293
buf1054 = buf1003; del buf1003 # reuse
triton_poi_fused_split_with_sizes_205.run(buf1052, buf1054, 2483712, grid=grid(2483712), stream=stream0)
buf1055 = extern_kernels.convolution(buf1054, buf1053, stride=(1, 1), padding=(1, 1), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=396, bias=None)
assert_size_stride(buf1055, (128, 396, 7, 7), (19404, 49, 7, 1))
buf1056 = empty_strided((396, 1, 5, 5), (25, 25, 5, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_206.run(primals_294, buf1056, 9900, grid=grid(9900), stream=stream0)
del primals_294
buf1057 = buf1000; del buf1000 # reuse
triton_poi_fused_split_with_sizes_207.run(buf1052, buf1057, 2483712, grid=grid(2483712), stream=stream0)
buf1058 = extern_kernels.convolution(buf1057, buf1056, stride=(1, 1), padding=(2, 2), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=396, bias=None)
assert_size_stride(buf1058, (128, 396, 7, 7), (19404, 49, 7, 1))
buf1059 = empty_strided((396, 1, 7, 7), (49, 49, 7, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_208.run(primals_295, buf1059, 19404, grid=grid(19404), stream=stream0)
del primals_295
buf1060 = buf997; del buf997 # reuse
triton_poi_fused_split_with_sizes_209.run(buf1052, buf1060, 2483712, grid=grid(2483712), stream=stream0)
buf1061 = extern_kernels.convolution(buf1060, buf1059, stride=(1, 1), padding=(3, 3), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=396, bias=None)
assert_size_stride(buf1061, (128, 396, 7, 7), (19404, 49, 7, 1))
buf1062 = empty_strided((396, 1, 9, 9), (81, 81, 9, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_210.run(primals_296, buf1062, 32076, grid=grid(32076), stream=stream0)
del primals_296
buf1063 = buf994; del buf994 # reuse
triton_poi_fused_split_with_sizes_211.run(buf1052, buf1063, 2483712, grid=grid(2483712), stream=stream0)
buf1064 = extern_kernels.convolution(buf1063, buf1062, stride=(1, 1), padding=(4, 4), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=396, bias=None)
assert_size_stride(buf1064, (128, 396, 7, 7), (19404, 49, 7, 1))
buf1069 = buf1052; del buf1052 # reuse
buf1065 = as_strided(buf1069, (128, 396, 7, 7), (77616, 49, 7, 1)) # alias
triton_poi_fused_cat_212.run(buf1055, buf1065, 2483712, grid=grid(2483712), stream=stream0)
del buf1055
buf1066 = as_strided(buf1069, (128, 396, 7, 7), (77616, 49, 7, 1), 19404) # alias
triton_poi_fused_cat_213.run(buf1058, buf1066, 2483712, grid=grid(2483712), stream=stream0)
del buf1058
buf1067 = as_strided(buf1069, (128, 396, 7, 7), (77616, 49, 7, 1), 38808) # alias
triton_poi_fused_cat_213.run(buf1061, buf1067, 2483712, grid=grid(2483712), stream=stream0)
del buf1061
buf1068 = as_strided(buf1069, (128, 396, 7, 7), (77616, 49, 7, 1), 58212) # alias
triton_poi_fused_cat_213.run(buf1064, buf1068, 2483712, grid=grid(2483712), stream=stream0)
del buf1064
buf1070 = buf1048; del buf1048 # reuse
buf1071 = buf1070; del buf1070 # reuse
buf1074 = empty_strided((1584, ), (1, ), device='cuda', dtype=torch.float32)
buf1072 = empty_strided((1, 1584, 1, 1), (1584, 1, 1584, 1584), device='cuda', dtype=torch.float32)
buf1073 = empty_strided((1584, ), (1, ), device='cuda', dtype=torch.float32)
buf1075 = empty_strided((1584, ), (1, ), device='cuda', dtype=torch.float32)
triton_red_fused__native_batch_norm_legit_functional_202.run(buf1071, buf1069, primals_472, primals_473, buf1074, buf1072, buf1073, buf1075, 1584, 6272, grid=grid(1584), stream=stream0)
del primals_472
del primals_473
buf1076 = empty_strided((128, 1584, 7, 7), (77616, 49, 7, 1), device='cuda', dtype=torch.float16)
buf1078 = empty_strided((128, 1584, 1, 1), (1584, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_per_fused__native_batch_norm_legit_functional_mean_silu_214.run(buf1069, buf1071, buf1072, primals_111, primals_112, buf1076, buf1078, 202752, 49, grid=grid(202752), stream=stream0)
del buf1072
del primals_112
buf1079 = empty_strided((132, 1584, 1, 1), (1584, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_215.run(primals_297, buf1079, 209088, grid=grid(209088), stream=stream0)
del primals_297
buf1080 = buf1019; del buf1019 # reuse
triton_poi_fused__to_copy_convolution_216.run(primals_298, buf1080, 132, grid=grid(132), stream=stream0)
del primals_298
buf1081 = extern_kernels.convolution(buf1078, buf1079, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None)
assert_size_stride(buf1081, (128, 132, 1, 1), (132, 1, 1, 1))
buf1082 = buf1081; del buf1081 # reuse
buf1083 = empty_strided((128, 132, 1, 1), (132, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_convolution_silu_217.run(buf1082, buf1080, buf1083, 16896, grid=grid(16896), stream=stream0)
del buf1080
buf1084 = empty_strided((1584, 132, 1, 1), (132, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_218.run(primals_299, buf1084, 209088, grid=grid(209088), stream=stream0)
del primals_299
buf1085 = buf1024; del buf1024 # reuse
triton_poi_fused__to_copy_convolution_219.run(primals_300, buf1085, 1584, grid=grid(1584), stream=stream0)
del primals_300
buf1086 = extern_kernels.convolution(buf1083, buf1084, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None)
assert_size_stride(buf1086, (128, 1584, 1, 1), (1584, 1, 1, 1))
buf1087 = buf1086; del buf1086 # reuse
triton_poi_fused__to_copy_convolution_220.run(buf1087, buf1085, 202752, grid=grid(202752), stream=stream0)
del buf1085
buf1088 = empty_strided((128, 1584, 7, 7), (77616, 49, 7, 1), device='cuda', dtype=torch.float16)
triton_poi_fused_mul_sigmoid_silu_221.run(buf1076, buf1087, buf1088, 9934848, grid=grid(9934848), stream=stream0)
buf1089 = empty_strided((128, 792, 7, 7), (38808, 49, 7, 1), device='cuda', dtype=torch.float16)
triton_poi_fused_split_with_sizes_222.run(buf1088, buf1089, 4967424, grid=grid(4967424), stream=stream0)
buf1090 = empty_strided((128, 792, 7, 7), (38808, 49, 7, 1), device='cuda', dtype=torch.float16)
triton_poi_fused_split_with_sizes_223.run(buf1088, buf1090, 4967424, grid=grid(4967424), stream=stream0)
del buf1088
buf1091 = empty_strided((132, 792, 1, 1), (792, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_224.run(primals_301, buf1091, 104544, grid=grid(104544), stream=stream0)
del primals_301
buf1092 = extern_kernels.convolution(buf1089, buf1091, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None)
assert_size_stride(buf1092, (128, 132, 7, 7), (6468, 49, 7, 1))
buf1093 = empty_strided((132, 792, 1, 1), (792, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_224.run(primals_302, buf1093, 104544, grid=grid(104544), stream=stream0)
del primals_302
buf1094 = extern_kernels.convolution(buf1090, buf1093, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None)
assert_size_stride(buf1094, (128, 132, 7, 7), (6468, 49, 7, 1))
buf1097 = empty_strided((128, 264, 7, 7), (12936, 49, 7, 1), device='cuda', dtype=torch.float16)
buf1095 = as_strided(buf1097, (128, 132, 7, 7), (12936, 49, 7, 1)) # alias
triton_poi_fused_cat_225.run(buf1092, buf1095, 827904, grid=grid(827904), stream=stream0)
del buf1092
buf1096 = as_strided(buf1097, (128, 132, 7, 7), (12936, 49, 7, 1), 6468) # alias
triton_poi_fused_cat_226.run(buf1094, buf1096, 827904, grid=grid(827904), stream=stream0)
del buf1094
buf1098 = buf1039; del buf1039 # reuse
buf1099 = buf1098; del buf1098 # reuse
buf1102 = empty_strided((264, ), (1, ), device='cuda', dtype=torch.float32)
buf1100 = empty_strided((1, 264, 1, 1), (264, 1, 264, 264), device='cuda', dtype=torch.float32)
buf1101 = empty_strided((264, ), (1, ), device='cuda', dtype=torch.float32)
buf1103 = empty_strided((264, ), (1, ), device='cuda', dtype=torch.float32)
triton_red_fused__native_batch_norm_legit_functional_199.run(buf1099, buf1097, primals_475, primals_476, buf1102, buf1100, buf1101, buf1103, 264, 6272, grid=grid(264), stream=stream0)
del primals_475
del primals_476
buf1104 = empty_strided((128, 264, 7, 7), (12936, 49, 7, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__native_batch_norm_legit_functional_add_227.run(buf1097, buf1099, buf1100, primals_113, primals_114, buf1043, buf1104, 1655808, grid=grid(1655808), stream=stream0)
del buf1100
del primals_114
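# Classifier head: a 1x1 conv lifts 264 -> 1536 channels, kernel 229 computes
# the batch-norm statistics, and kernel 230 fuses the normalization with ReLU,
# a global average pool, and the flatten to (128, 1536) features (buf1115),
# also recording the ReLU mask (buf1120) for the backward pass. The final
# linear layer is an addmm, logits = bias + features @ weight.T, with buf1116
# viewed as (1536, 1000) via as_strided. Roughly (hypothetical names; assumes
# `import torch.nn.functional as F` and a `bn` module for the batch norm):
#
#     feats = F.relu(bn(x)).mean(dim=(2, 3))    # (128, 1536)
#     logits = F.linear(feats, w_fc, b_fc)      # (128, 1000)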
buf1105 = empty_strided((1536, 264, 1, 1), (264, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_228.run(primals_303, buf1105, 405504, grid=grid(405504), stream=stream0)
del primals_303
buf1106 = extern_kernels.convolution(buf1104, buf1105, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None)
assert_size_stride(buf1106, (128, 1536, 7, 7), (75264, 49, 7, 1))
buf1107 = empty_strided((1, 1536, 1, 1), (1536, 1, 1536, 1536), device='cuda', dtype=torch.float32)
buf1108 = buf1107; del buf1107 # reuse
buf1111 = empty_strided((1536, ), (1, ), device='cuda', dtype=torch.float32)
buf1109 = empty_strided((1, 1536, 1, 1), (1536, 1, 1536, 1536), device='cuda', dtype=torch.float32)
buf1110 = empty_strided((1536, ), (1, ), device='cuda', dtype=torch.float32)
buf1112 = empty_strided((1536, ), (1, ), device='cuda', dtype=torch.float32)
triton_red_fused__native_batch_norm_legit_functional_229.run(buf1108, buf1106, primals_478, primals_479, buf1111, buf1109, buf1110, buf1112, 1536, 6272, grid=grid(1536), stream=stream0)
del primals_478
del primals_479
buf1120 = empty_strided((128, 1536, 7, 7), (75264, 49, 7, 1), device='cuda', dtype=torch.bool)
buf1115 = empty_strided((128, 1536), (1536, 1), device='cuda', dtype=torch.float16)
triton_per_fused__native_batch_norm_legit_functional_mean_relu_threshold_backward_view_230.run(buf1106, buf1108, buf1109, primals_115, primals_116, buf1120, buf1115, 196608, 49, grid=grid(196608), stream=stream0)
del buf1109
del primals_116
buf1116 = empty_strided((1000, 1536), (1536, 1), device='cuda', dtype=torch.float16)
buf1119 = empty_strided((1000, 1536), (1536, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_t_231.run(primals_304, buf1116, buf1119, 1536000, grid=grid(1536000), stream=stream0)
del primals_304
buf1117 = empty_strided((1000, ), (1, ), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_232.run(primals_305, buf1117, 1000, grid=grid(1000), stream=stream0)
del primals_305
buf1118 = empty_strided((128, 1000), (1000, 1), device='cuda', dtype=torch.float16)
extern_kernels.addmm(buf1117, buf1115, as_strided(buf1116, (1536, 1000), (1, 1536)), alpha=1, beta=1, out=buf1118)
del buf1116
del buf1117
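# The long run of triton_poi_fused_add_233 launches below reads one scalar
# int64 tensor each (primals_306, 309, ..., 477 -- by position these look like
# the batch-norm num_batches_tracked counters) and writes counter + 1 into a
# fresh scalar buffer: the functionalized form of
# `bn.num_batches_tracked += 1` for every batch-norm layer in the model.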
buf1156 = empty_strided((), (), device='cuda', dtype=torch.int64)
triton_poi_fused_add_233.run(primals_306, buf1156, 1, grid=grid(1), stream=stream0)
del primals_306
buf1157 = empty_strided((), (), device='cuda', dtype=torch.int64)
triton_poi_fused_add_233.run(primals_309, buf1157, 1, grid=grid(1), stream=stream0)
del primals_309
buf1158 = empty_strided((), (), device='cuda', dtype=torch.int64)
triton_poi_fused_add_233.run(primals_312, buf1158, 1, grid=grid(1), stream=stream0)
del primals_312
buf1159 = empty_strided((), (), device='cuda', dtype=torch.int64)
triton_poi_fused_add_233.run(primals_315, buf1159, 1, grid=grid(1), stream=stream0)
del primals_315
buf1160 = empty_strided((), (), device='cuda', dtype=torch.int64)
triton_poi_fused_add_233.run(primals_318, buf1160, 1, grid=grid(1), stream=stream0)
del primals_318
buf1161 = empty_strided((), (), device='cuda', dtype=torch.int64)
triton_poi_fused_add_233.run(primals_321, buf1161, 1, grid=grid(1), stream=stream0)
del primals_321
buf1162 = empty_strided((), (), device='cuda', dtype=torch.int64)
triton_poi_fused_add_233.run(primals_324, buf1162, 1, grid=grid(1), stream=stream0)
del primals_324
buf1163 = empty_strided((), (), device='cuda', dtype=torch.int64)
triton_poi_fused_add_233.run(primals_327, buf1163, 1, grid=grid(1), stream=stream0)
del primals_327
buf1164 = empty_strided((), (), device='cuda', dtype=torch.int64)
triton_poi_fused_add_233.run(primals_330, buf1164, 1, grid=grid(1), stream=stream0)
del primals_330
buf1165 = empty_strided((), (), device='cuda', dtype=torch.int64)
triton_poi_fused_add_233.run(primals_333, buf1165, 1, grid=grid(1), stream=stream0)
del primals_333
buf1166 = empty_strided((), (), device='cuda', dtype=torch.int64)
triton_poi_fused_add_233.run(primals_336, buf1166, 1, grid=grid(1), stream=stream0)
del primals_336
buf1167 = empty_strided((), (), device='cuda', dtype=torch.int64)
triton_poi_fused_add_233.run(primals_339, buf1167, 1, grid=grid(1), stream=stream0)
del primals_339
buf1168 = empty_strided((), (), device='cuda', dtype=torch.int64)
triton_poi_fused_add_233.run(primals_342, buf1168, 1, grid=grid(1), stream=stream0)
del primals_342
buf1169 = empty_strided((), (), device='cuda', dtype=torch.int64)
triton_poi_fused_add_233.run(primals_345, buf1169, 1, grid=grid(1), stream=stream0)
del primals_345
buf1170 = empty_strided((), (), device='cuda', dtype=torch.int64)
triton_poi_fused_add_233.run(primals_348, buf1170, 1, grid=grid(1), stream=stream0)
del primals_348
buf1171 = empty_strided((), (), device='cuda', dtype=torch.int64)
triton_poi_fused_add_233.run(primals_351, buf1171, 1, grid=grid(1), stream=stream0)
del primals_351
buf1172 = empty_strided((), (), device='cuda', dtype=torch.int64)
triton_poi_fused_add_233.run(primals_354, buf1172, 1, grid=grid(1), stream=stream0)
del primals_354
buf1173 = empty_strided((), (), device='cuda', dtype=torch.int64)
triton_poi_fused_add_233.run(primals_357, buf1173, 1, grid=grid(1), stream=stream0)
del primals_357
buf1174 = empty_strided((), (), device='cuda', dtype=torch.int64)
triton_poi_fused_add_233.run(primals_360, buf1174, 1, grid=grid(1), stream=stream0)
del primals_360
buf1175 = empty_strided((), (), device='cuda', dtype=torch.int64)
triton_poi_fused_add_233.run(primals_363, buf1175, 1, grid=grid(1), stream=stream0)
del primals_363
buf1176 = empty_strided((), (), device='cuda', dtype=torch.int64)
triton_poi_fused_add_233.run(primals_366, buf1176, 1, grid=grid(1), stream=stream0)
del primals_366
buf1177 = empty_strided((), (), device='cuda', dtype=torch.int64)
triton_poi_fused_add_233.run(primals_369, buf1177, 1, grid=grid(1), stream=stream0)
del primals_369
buf1178 = empty_strided((), (), device='cuda', dtype=torch.int64)
triton_poi_fused_add_233.run(primals_372, buf1178, 1, grid=grid(1), stream=stream0)
del primals_372
buf1179 = empty_strided((), (), device='cuda', dtype=torch.int64)
triton_poi_fused_add_233.run(primals_375, buf1179, 1, grid=grid(1), stream=stream0)
del primals_375
buf1180 = empty_strided((), (), device='cuda', dtype=torch.int64)
triton_poi_fused_add_233.run(primals_378, buf1180, 1, grid=grid(1), stream=stream0)
del primals_378
buf1181 = empty_strided((), (), device='cuda', dtype=torch.int64)
triton_poi_fused_add_233.run(primals_381, buf1181, 1, grid=grid(1), stream=stream0)
del primals_381
buf1182 = empty_strided((), (), device='cuda', dtype=torch.int64)
triton_poi_fused_add_233.run(primals_384, buf1182, 1, grid=grid(1), stream=stream0)
del primals_384
buf1183 = empty_strided((), (), device='cuda', dtype=torch.int64)
triton_poi_fused_add_233.run(primals_387, buf1183, 1, grid=grid(1), stream=stream0)
del primals_387
buf1184 = empty_strided((), (), device='cuda', dtype=torch.int64)
triton_poi_fused_add_233.run(primals_390, buf1184, 1, grid=grid(1), stream=stream0)
del primals_390
buf1185 = empty_strided((), (), device='cuda', dtype=torch.int64)
triton_poi_fused_add_233.run(primals_393, buf1185, 1, grid=grid(1), stream=stream0)
del primals_393
buf1186 = empty_strided((), (), device='cuda', dtype=torch.int64)
triton_poi_fused_add_233.run(primals_396, buf1186, 1, grid=grid(1), stream=stream0)
del primals_396
buf1187 = empty_strided((), (), device='cuda', dtype=torch.int64)
triton_poi_fused_add_233.run(primals_399, buf1187, 1, grid=grid(1), stream=stream0)
del primals_399
buf1188 = empty_strided((), (), device='cuda', dtype=torch.int64)
triton_poi_fused_add_233.run(primals_402, buf1188, 1, grid=grid(1), stream=stream0)
del primals_402
buf1189 = empty_strided((), (), device='cuda', dtype=torch.int64)
triton_poi_fused_add_233.run(primals_405, buf1189, 1, grid=grid(1), stream=stream0)
del primals_405
buf1190 = empty_strided((), (), device='cuda', dtype=torch.int64)
triton_poi_fused_add_233.run(primals_408, buf1190, 1, grid=grid(1), stream=stream0)
del primals_408
buf1191 = empty_strided((), (), device='cuda', dtype=torch.int64)
triton_poi_fused_add_233.run(primals_411, buf1191, 1, grid=grid(1), stream=stream0)
del primals_411
buf1192 = empty_strided((), (), device='cuda', dtype=torch.int64)
triton_poi_fused_add_233.run(primals_414, buf1192, 1, grid=grid(1), stream=stream0)
del primals_414
buf1193 = empty_strided((), (), device='cuda', dtype=torch.int64)
triton_poi_fused_add_233.run(primals_417, buf1193, 1, grid=grid(1), stream=stream0)
del primals_417
buf1194 = empty_strided((), (), device='cuda', dtype=torch.int64)
triton_poi_fused_add_233.run(primals_420, buf1194, 1, grid=grid(1), stream=stream0)
del primals_420
buf1195 = empty_strided((), (), device='cuda', dtype=torch.int64)
triton_poi_fused_add_233.run(primals_423, buf1195, 1, grid=grid(1), stream=stream0)
del primals_423
buf1196 = empty_strided((), (), device='cuda', dtype=torch.int64)
triton_poi_fused_add_233.run(primals_426, buf1196, 1, grid=grid(1), stream=stream0)
del primals_426
buf1197 = empty_strided((), (), device='cuda', dtype=torch.int64)
triton_poi_fused_add_233.run(primals_429, buf1197, 1, grid=grid(1), stream=stream0)
del primals_429
buf1198 = empty_strided((), (), device='cuda', dtype=torch.int64)
triton_poi_fused_add_233.run(primals_432, buf1198, 1, grid=grid(1), stream=stream0)
del primals_432
buf1199 = empty_strided((), (), device='cuda', dtype=torch.int64)
triton_poi_fused_add_233.run(primals_435, buf1199, 1, grid=grid(1), stream=stream0)
del primals_435
buf1200 = empty_strided((), (), device='cuda', dtype=torch.int64)
triton_poi_fused_add_233.run(primals_438, buf1200, 1, grid=grid(1), stream=stream0)
del primals_438
buf1201 = empty_strided((), (), device='cuda', dtype=torch.int64)
triton_poi_fused_add_233.run(primals_441, buf1201, 1, grid=grid(1), stream=stream0)
del primals_441
buf1202 = empty_strided((), (), device='cuda', dtype=torch.int64)
triton_poi_fused_add_233.run(primals_444, buf1202, 1, grid=grid(1), stream=stream0)
del primals_444
buf1203 = empty_strided((), (), device='cuda', dtype=torch.int64)
triton_poi_fused_add_233.run(primals_447, buf1203, 1, grid=grid(1), stream=stream0)
del primals_447
buf1204 = empty_strided((), (), device='cuda', dtype=torch.int64)
triton_poi_fused_add_233.run(primals_450, buf1204, 1, grid=grid(1), stream=stream0)
del primals_450
buf1205 = empty_strided((), (), device='cuda', dtype=torch.int64)
triton_poi_fused_add_233.run(primals_453, buf1205, 1, grid=grid(1), stream=stream0)
del primals_453
buf1206 = empty_strided((), (), device='cuda', dtype=torch.int64)
triton_poi_fused_add_233.run(primals_456, buf1206, 1, grid=grid(1), stream=stream0)
del primals_456
buf1207 = empty_strided((), (), device='cuda', dtype=torch.int64)
triton_poi_fused_add_233.run(primals_459, buf1207, 1, grid=grid(1), stream=stream0)
del primals_459
buf1208 = empty_strided((), (), device='cuda', dtype=torch.int64)
triton_poi_fused_add_233.run(primals_462, buf1208, 1, grid=grid(1), stream=stream0)
del primals_462
buf1209 = empty_strided((), (), device='cuda', dtype=torch.int64)
triton_poi_fused_add_233.run(primals_465, buf1209, 1, grid=grid(1), stream=stream0)
del primals_465
buf1210 = empty_strided((), (), device='cuda', dtype=torch.int64)
triton_poi_fused_add_233.run(primals_468, buf1210, 1, grid=grid(1), stream=stream0)
del primals_468
buf1211 = empty_strided((), (), device='cuda', dtype=torch.int64)
triton_poi_fused_add_233.run(primals_471, buf1211, 1, grid=grid(1), stream=stream0)
del primals_471
buf1212 = empty_strided((), (), device='cuda', dtype=torch.int64)
triton_poi_fused_add_233.run(primals_474, buf1212, 1, grid=grid(1), stream=stream0)
del primals_474
buf1213 = empty_strided((), (), device='cuda', dtype=torch.int64)
triton_poi_fused_add_233.run(primals_477, buf1213, 1, grid=grid(1), stream=stream0)
del primals_477
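# The return tuple appears to interleave, for each batch-norm layer, the
# incremented num_batches_tracked scalar with the updated running mean/var
# buffers, then yields the logits (buf1118), followed by the parameters and
# intermediate activations that the autograd backward graph will need.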
return (buf1156, buf9, buf10, buf1157, buf20, buf21, buf1158, buf31, buf32, buf1159, buf49, buf50, buf1160, buf68, buf69, buf1161, buf84, buf85, buf1162, buf100, buf101, buf1163, buf111, buf112, buf1164, buf127, buf128, buf1165, buf136, buf137, buf1166, buf160, buf161, buf1167, buf183, buf184, buf1168, buf197, buf198, buf1169, buf213, buf214, buf1170, buf243, buf244, buf1171, buf257, buf258, buf1172, buf273, buf274, buf1173, buf303, buf304, buf1174, buf317, buf318, buf1175, buf333, buf334, buf1176, buf363, buf364, buf1177, buf372, buf373, buf1178, buf392, buf393, buf1179, buf415, buf416, buf1180, buf429, buf430, buf1181, buf453, buf454, buf1182, buf483, buf484, buf1183, buf497, buf498, buf1184, buf521, buf522, buf1185, buf551, buf552, buf1186, buf565, buf566, buf1187, buf589, buf590, buf1188, buf619, buf620, buf1189, buf628, buf629, buf1190, buf638, buf639, buf1191, buf661, buf662, buf1192, buf675, buf676, buf1193, buf699, buf700, buf1194, buf729, buf730, buf1195, buf743, buf744, buf1196, buf767, buf768, buf1197, buf797, buf798, buf1198, buf811, buf812, buf1199, buf835, buf836, buf1200, buf865, buf866, buf1201, buf874, buf875, buf1202, buf898, buf899, buf1203, buf919, buf920, buf1204, buf928, buf929, buf1205, buf952, buf953, buf1206, buf980, buf981, buf1207, buf989, buf990, buf1208, buf1013, buf1014, buf1209, buf1041, buf1042, buf1210, buf1050, buf1051, buf1211, buf1074, buf1075, buf1212, buf1102, buf1103, buf1213, buf1111, buf1112, buf1118, primals_1, primals_3, primals_5, primals_7, primals_9, primals_11, primals_13, primals_15, primals_17, primals_19, primals_21, primals_23, primals_25, primals_27, primals_29, primals_31, primals_33, primals_35, primals_37, primals_39, primals_41, primals_43, primals_45, primals_47, primals_49, primals_51, primals_53, primals_55, primals_57, primals_59, primals_61, primals_63, primals_65, primals_67, primals_69, primals_71, primals_73, primals_75, primals_77, primals_79, primals_81, primals_83, primals_85, primals_87, primals_89, primals_91, primals_93, primals_95, primals_97, primals_99, primals_101, primals_103, primals_105, primals_107, primals_109, primals_111, primals_113, primals_115, buf0, buf1, buf2, buf8, buf11, buf12, buf13, buf19, buf22, buf23, buf24, buf30, buf34, buf35, buf36, buf38, buf42, buf48, buf52, as_strided(buf51, (128, 64, 112, 112), (2408448, 12544, 112, 1)), buf54, as_strided(buf51, (128, 64, 112, 112), (2408448, 12544, 112, 1), 802816), buf56, as_strided(buf51, (128, 64, 112, 112), (2408448, 12544, 112, 1), 1605632), buf61, buf67, buf71, as_strided(buf70, (128, 96, 56, 56), (602112, 3136, 56, 1)), buf73, as_strided(buf70, (128, 96, 56, 56), (602112, 3136, 56, 1), 301056), buf77, buf83, as_strided(buf86, (128, 20, 56, 56), (125440, 3136, 56, 1)), as_strided(buf86, (128, 20, 56, 56), (125440, 3136, 56, 1), 62720), buf87, buf89, buf93, buf99, buf102, buf103, buf104, buf110, buf114, as_strided(buf113, (128, 60, 56, 56), (376320, 3136, 56, 1)), buf116, as_strided(buf113, (128, 60, 56, 56), (376320, 3136, 56, 1), 188160), buf120, buf126, buf129, buf130, buf131, buf135, buf139, buf140, buf142, buf143, buf145, buf146, buf148, buf149, buf155, buf159, buf162, buf164, buf165, buf168, buf169, buf170, buf173, buf174, buf175, buf176, buf182, as_strided(buf185, (128, 28, 28, 28), (43904, 784, 28, 1)), as_strided(buf185, (128, 28, 28, 28), (43904, 784, 28, 1), 21952), buf186, buf188, buf192, buf196, buf200, buf201, buf203, buf204, buf208, buf212, buf215, buf217, buf218, buf221, buf222, buf223, buf226, buf228, buf229, buf230, buf232, buf236, 
buf242, as_strided(buf245, (128, 28, 28, 28), (43904, 784, 28, 1)), as_strided(buf245, (128, 28, 28, 28), (43904, 784, 28, 1), 21952), buf246, buf248, buf252, buf256, buf260, buf261, buf263, buf264, buf268, buf272, buf275, buf277, buf278, buf281, buf282, buf283, buf286, buf288, buf289, buf290, buf292, buf296, buf302, as_strided(buf305, (128, 28, 28, 28), (43904, 784, 28, 1)), as_strided(buf305, (128, 28, 28, 28), (43904, 784, 28, 1), 21952), buf306, buf308, buf312, buf316, buf320, buf321, buf323, buf324, buf328, buf332, buf335, buf337, buf338, buf341, buf342, buf343, buf346, buf348, buf349, buf350, buf352, buf356, buf362, buf365, buf366, buf367, buf371, buf375, buf376, buf378, buf379, buf381, buf382, buf387, buf391, buf394, buf396, buf397, buf400, buf401, buf402, buf405, buf406, buf407, buf408, buf414, as_strided(buf417, (128, 52, 14, 14), (20384, 196, 14, 1)), as_strided(buf417, (128, 52, 14, 14), (20384, 196, 14, 1), 10192), buf418, buf420, buf424, buf428, buf432, buf433, buf435, buf436, buf438, buf439, buf441, buf442, buf448, buf452, buf455, buf457, buf458, buf461, buf462, buf463, buf466, buf468, buf469, buf470, buf472, buf476, buf482, as_strided(buf485, (128, 52, 14, 14), (20384, 196, 14, 1)), as_strided(buf485, (128, 52, 14, 14), (20384, 196, 14, 1), 10192), buf486, buf488, buf492, buf496, buf500, buf501, buf503, buf504, buf506, buf507, buf509, buf510, buf516, buf520, buf523, buf525, buf526, buf529, buf530, buf531, buf534, buf536, buf537, buf538, buf540, buf544, buf550, as_strided(buf553, (128, 52, 14, 14), (20384, 196, 14, 1)), as_strided(buf553, (128, 52, 14, 14), (20384, 196, 14, 1), 10192), buf554, buf556, buf560, buf564, buf568, buf569, buf571, buf572, buf574, buf575, buf577, buf578, buf584, buf588, buf591, buf593, buf594, buf597, buf598, buf599, buf602, buf604, buf605, buf606, buf608, buf612, buf618, buf621, buf622, buf623, buf627, buf631, buf632, buf633, buf637, buf640, buf642, buf643, buf646, buf647, buf648, buf651, buf652, buf653, buf654, buf660, as_strided(buf663, (128, 80, 14, 14), (31360, 196, 14, 1)), as_strided(buf663, (128, 80, 14, 14), (31360, 196, 14, 1), 15680), buf664, buf666, buf670, buf674, buf678, buf679, buf681, buf682, buf684, buf685, buf687, buf688, buf694, buf698, buf701, buf703, buf704, buf707, buf708, buf709, buf712, buf714, buf715, buf716, buf718, buf722, buf728, as_strided(buf731, (128, 80, 14, 14), (31360, 196, 14, 1)), as_strided(buf731, (128, 80, 14, 14), (31360, 196, 14, 1), 15680), buf732, buf734, buf738, buf742, buf746, buf747, buf749, buf750, buf752, buf753, buf755, buf756, buf762, buf766, buf769, buf771, buf772, buf775, buf776, buf777, buf780, buf782, buf783, buf784, buf786, buf790, buf796, as_strided(buf799, (128, 80, 14, 14), (31360, 196, 14, 1)), as_strided(buf799, (128, 80, 14, 14), (31360, 196, 14, 1), 15680), buf800, buf802, buf806, buf810, buf814, buf815, buf817, buf818, buf820, buf821, buf823, buf824, buf830, buf834, buf837, buf839, buf840, buf843, buf844, buf845, buf848, buf850, buf851, buf852, buf854, buf858, buf864, buf867, buf868, buf869, buf873, buf877, buf878, buf880, buf881, buf883, buf884, buf886, buf887, buf893, buf897, buf900, buf902, buf903, buf906, buf907, buf908, buf911, buf912, buf913, buf914, buf918, buf921, buf922, buf923, buf927, buf931, buf932, buf934, buf935, buf937, buf938, buf940, buf941, buf947, buf951, buf954, buf956, buf957, buf960, buf961, buf962, buf965, buf967, buf968, buf969, buf971, buf975, buf979, buf982, buf983, buf984, buf988, buf992, buf993, buf995, buf996, buf998, buf999, buf1001, buf1002, buf1008, 
buf1012, buf1015, buf1017, buf1018, buf1021, buf1022, buf1023, buf1026, buf1028, buf1029, buf1030, buf1032, buf1036, buf1040, buf1043, buf1044, buf1045, buf1049, buf1053, buf1054, buf1056, buf1057, buf1059, buf1060, buf1062, buf1063, buf1069, buf1073, buf1076, buf1078, buf1079, buf1082, buf1083, buf1084, buf1087, buf1089, buf1090, buf1091, buf1093, buf1097, buf1101, buf1104, buf1105, buf1106, buf1110, buf1115, buf1119, buf1120, as_strided(buf1108, (1, 1536, 1, 1), (1536, 1, 1, 1)), as_strided(buf1099, (1, 264, 1, 1), (264, 1, 1, 1)), as_strided(buf1071, (1, 1584, 1, 1), (1584, 1, 1, 1)), buf1122, as_strided(buf1047, (1, 1584, 1, 1), (1584, 1, 1, 1)), as_strided(buf1038, (1, 264, 1, 1), (264, 1, 1, 1)), as_strided(buf1010, (1, 1584, 1, 1), (1584, 1, 1, 1)), buf1124, as_strided(buf986, (1, 1584, 1, 1), (1584, 1, 1, 1)), as_strided(buf977, (1, 264, 1, 1), (264, 1, 1, 1)), as_strided(buf949, (1, 1584, 1, 1), (1584, 1, 1, 1)), buf1126, as_strided(buf925, (1, 1584, 1, 1), (1584, 1, 1, 1)), as_strided(buf916, (1, 264, 1, 1), (264, 1, 1, 1)), as_strided(buf895, (1, 960, 1, 1), (960, 1, 1, 1)), buf1128, as_strided(buf871, (1, 960, 1, 1), (960, 1, 1, 1)), as_strided(buf861, (1, 160, 1, 1), (160, 1, 1, 1)), as_strided(buf832, (1, 480, 1, 1), (480, 1, 1, 1)), buf1130, as_strided(buf808, (1, 480, 1, 1), (480, 1, 1, 1)), as_strided(buf793, (1, 160, 1, 1), (160, 1, 1, 1)), as_strided(buf764, (1, 480, 1, 1), (480, 1, 1, 1)), buf1132, as_strided(buf740, (1, 480, 1, 1), (480, 1, 1, 1)), as_strided(buf725, (1, 160, 1, 1), (160, 1, 1, 1)), as_strided(buf696, (1, 480, 1, 1), (480, 1, 1, 1)), buf1134, as_strided(buf672, (1, 480, 1, 1), (480, 1, 1, 1)), as_strided(buf657, (1, 160, 1, 1), (160, 1, 1, 1)), as_strided(buf635, (1, 624, 1, 1), (624, 1, 1, 1)), buf1136, as_strided(buf625, (1, 624, 1, 1), (624, 1, 1, 1)), as_strided(buf615, (1, 104, 1, 1), (104, 1, 1, 1)), as_strided(buf586, (1, 624, 1, 1), (624, 1, 1, 1)), buf1138, as_strided(buf562, (1, 624, 1, 1), (624, 1, 1, 1)), as_strided(buf547, (1, 104, 1, 1), (104, 1, 1, 1)), as_strided(buf518, (1, 624, 1, 1), (624, 1, 1, 1)), buf1140, as_strided(buf494, (1, 624, 1, 1), (624, 1, 1, 1)), as_strided(buf479, (1, 104, 1, 1), (104, 1, 1, 1)), as_strided(buf450, (1, 624, 1, 1), (624, 1, 1, 1)), buf1142, as_strided(buf426, (1, 624, 1, 1), (624, 1, 1, 1)), as_strided(buf411, (1, 104, 1, 1), (104, 1, 1, 1)), as_strided(buf389, (1, 336, 1, 1), (336, 1, 1, 1)), buf1144, as_strided(buf369, (1, 336, 1, 1), (336, 1, 1, 1)), as_strided(buf359, (1, 56, 1, 1), (56, 1, 1, 1)), as_strided(buf330, (1, 336, 1, 1), (336, 1, 1, 1)), buf1146, as_strided(buf314, (1, 336, 1, 1), (336, 1, 1, 1)), as_strided(buf299, (1, 56, 1, 1), (56, 1, 1, 1)), as_strided(buf270, (1, 336, 1, 1), (336, 1, 1, 1)), buf1148, as_strided(buf254, (1, 336, 1, 1), (336, 1, 1, 1)), as_strided(buf239, (1, 56, 1, 1), (56, 1, 1, 1)), as_strided(buf210, (1, 336, 1, 1), (336, 1, 1, 1)), buf1150, as_strided(buf194, (1, 336, 1, 1), (336, 1, 1, 1)), as_strided(buf179, (1, 56, 1, 1), (56, 1, 1, 1)), as_strided(buf157, (1, 240, 1, 1), (240, 1, 1, 1)), buf1152, as_strided(buf133, (1, 240, 1, 1), (240, 1, 1, 1)), as_strided(buf123, (1, 40, 1, 1), (40, 1, 1, 1)), buf1153, as_strided(buf107, (1, 120, 1, 1), (120, 1, 1, 1)), as_strided(buf96, (1, 120, 1, 1), (120, 1, 1, 1)), as_strided(buf80, (1, 40, 1, 1), (40, 1, 1, 1)), buf1154, as_strided(buf64, (1, 192, 1, 1), (192, 1, 1, 1)), buf1155, as_strided(buf45, (1, 192, 1, 1), (192, 1, 1, 1)), as_strided(buf27, (1, 32, 1, 1), (32, 1, 1, 1)), as_strided(buf16, (1, 32, 1, 1), (32, 
1, 1, 1)), as_strided(buf5, (1, 32, 1, 1), (32, 1, 1, 1)), )
def benchmark_compiled_module(times=10, repeat=10):
from torch._dynamo.testing import rand_strided
from torch._inductor.utils import print_performance
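# rand_strided allocates a tensor with the given shape/stride/dtype and fills
# it with random data; the benchmark rebuilds every parameter and input this
# way so the compiled module can be timed without real checkpoint data.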
primals_1 = rand_strided((32, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_2 = rand_strided((32, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_3 = rand_strided((32, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_4 = rand_strided((32, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_5 = rand_strided((32, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_6 = rand_strided((32, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_7 = rand_strided((192, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_8 = rand_strided((192, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_9 = rand_strided((192, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_10 = rand_strided((192, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_11 = rand_strided((40, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_12 = rand_strided((40, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_13 = rand_strided((120, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_14 = rand_strided((120, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_15 = rand_strided((120, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_16 = rand_strided((120, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_17 = rand_strided((40, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_18 = rand_strided((40, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_19 = rand_strided((240, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_20 = rand_strided((240, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_21 = rand_strided((240, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_22 = rand_strided((240, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_23 = rand_strided((56, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_24 = rand_strided((56, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_25 = rand_strided((336, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_26 = rand_strided((336, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_27 = rand_strided((336, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_28 = rand_strided((336, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_29 = rand_strided((56, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_30 = rand_strided((56, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_31 = rand_strided((336, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_32 = rand_strided((336, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_33 = rand_strided((336, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_34 = rand_strided((336, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_35 = rand_strided((56, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_36 = rand_strided((56, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_37 = rand_strided((336, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_38 = rand_strided((336, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_39 = rand_strided((336, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_40 = rand_strided((336, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_41 = rand_strided((56, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_42 = rand_strided((56, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_43 = rand_strided((336, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_44 = rand_strided((336, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_45 = rand_strided((336, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_46 = rand_strided((336, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_47 = rand_strided((104, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_48 = rand_strided((104, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_49 = rand_strided((624, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_50 = rand_strided((624, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_51 = rand_strided((624, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_52 = rand_strided((624, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_53 = rand_strided((104, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_54 = rand_strided((104, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_55 = rand_strided((624, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_56 = rand_strided((624, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_57 = rand_strided((624, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_58 = rand_strided((624, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_59 = rand_strided((104, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_60 = rand_strided((104, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_61 = rand_strided((624, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_62 = rand_strided((624, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_63 = rand_strided((624, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_64 = rand_strided((624, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_65 = rand_strided((104, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_66 = rand_strided((104, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_67 = rand_strided((624, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_68 = rand_strided((624, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_69 = rand_strided((624, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_70 = rand_strided((624, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_71 = rand_strided((160, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_72 = rand_strided((160, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_73 = rand_strided((480, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_74 = rand_strided((480, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_75 = rand_strided((480, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_76 = rand_strided((480, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_77 = rand_strided((160, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_78 = rand_strided((160, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_79 = rand_strided((480, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_80 = rand_strided((480, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_81 = rand_strided((480, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_82 = rand_strided((480, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_83 = rand_strided((160, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_84 = rand_strided((160, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_85 = rand_strided((480, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_86 = rand_strided((480, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_87 = rand_strided((480, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_88 = rand_strided((480, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_89 = rand_strided((160, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_90 = rand_strided((160, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_91 = rand_strided((960, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_92 = rand_strided((960, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_93 = rand_strided((960, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_94 = rand_strided((960, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_95 = rand_strided((264, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_96 = rand_strided((264, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_97 = rand_strided((1584, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_98 = rand_strided((1584, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_99 = rand_strided((1584, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_100 = rand_strided((1584, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_101 = rand_strided((264, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_102 = rand_strided((264, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_103 = rand_strided((1584, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_104 = rand_strided((1584, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_105 = rand_strided((1584, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_106 = rand_strided((1584, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_107 = rand_strided((264, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_108 = rand_strided((264, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_109 = rand_strided((1584, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_110 = rand_strided((1584, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_111 = rand_strided((1584, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_112 = rand_strided((1584, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_113 = rand_strided((264, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_114 = rand_strided((264, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_115 = rand_strided((1536, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_116 = rand_strided((1536, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_117 = rand_strided((32, 3, 3, 3), (27, 9, 3, 1), device='cuda:0', dtype=torch.float32)
primals_118 = rand_strided((32, 1, 3, 3), (9, 9, 3, 1), device='cuda:0', dtype=torch.float32)
primals_119 = rand_strided((32, 32, 1, 1), (32, 1, 1, 1), device='cuda:0', dtype=torch.float32)
primals_120 = rand_strided((96, 16, 1, 1), (16, 1, 1, 1), device='cuda:0', dtype=torch.float32)
primals_121 = rand_strided((96, 16, 1, 1), (16, 1, 1, 1), device='cuda:0', dtype=torch.float32)
primals_122 = rand_strided((64, 1, 3, 3), (9, 9, 3, 1), device='cuda:0', dtype=torch.float32)
primals_123 = rand_strided((64, 1, 5, 5), (25, 25, 5, 1), device='cuda:0', dtype=torch.float32)