from ctypes import c_void_p, c_long
import torch
import math
import random
import os
import tempfile
from torch._inductor.utils import maybe_profile
from torch import empty_strided, as_strided, device
from torch._inductor.codecache import AsyncCompile
from torch._inductor.select_algorithm import extern_kernels
aten = torch.ops.aten
assert_size_stride = torch._C._dynamo.guards.assert_size_stride
async_compile = AsyncCompile()
import triton
import triton.language as tl
from torch._inductor.triton_heuristics import grid, start_graph, end_graph
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
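# Note on structure: everything below is TorchInductor-generated output. Each
# kernel is compiled from a source string via async_compile.triton(...), and
# alongside the @triton.jit kernel each string defines three helpers:
# get_args() allocates representative inputs with rand_strided, call(args)
# launches the kernel on CUDA stream 0, and benchmark_all_configs(args) times
# every autotuning config. Each string is also a standalone script: run as
# __main__, it benchmarks the kernel with triton.testing.do_bench and prints
# latency, bytes moved, and achieved bandwidth. A minimal sketch of exercising
# one kernel by hand (assuming a CUDA device is available):
#
#     args = get_args()                # allocate inputs/outputs on cuda:0
#     call(args)                       # single launch on stream 0
#     benchmark_all_configs(args)      # timing sweep over autotune configs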
# kernel path: /tmp/torchinductor_shunting/cd/ccd7accawcy3jndrehd4geosuzoh6rxkzjlqnojxb5msgrcv7udi.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type
triton_poi_fused__to_copy_0 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[1024], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_0(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 864
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)

def get_args():
    arg_0 = rand_strided((32, 3, 3, 3), (27, 9, 3, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((32, 3, 3, 3), (27, 9, 3, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,

def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_0.run(*args, 864, grid=grid(864), stream=stream0)

def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_0.benchmark_all_configs(*args, 864, grid=grid(864))

if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
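# Note: the _to_copy kernels above and below implement fp32 -> fp16 casts
# (aten._to_copy with a dtype change). The output pointer is declared '*fp16'
# in the signature, so the tl.store appears to narrow the value to half
# precision implicitly; the intermediate tmp0.to(tl.float32) is a no-op that
# the code generator emits regardless.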
# kernel path: /tmp/torchinductor_shunting/4z/c4zfgpaaduweifmtujob2ufcjjkrkqa7mb5j2qaaysmstief35xb.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_1
triton_poi_fused__to_copy_1 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[33554432], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_1(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 19267584
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), None)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, None)

def get_args():
    arg_0 = rand_strided((128, 3, 224, 224), (150528, 50176, 224, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((128, 3, 224, 224), (150528, 50176, 224, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,

def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_1.run(*args, 19267584, grid=grid(19267584), stream=stream0)

def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_1.benchmark_all_configs(*args, 19267584, grid=grid(19267584))

if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/4v/c4vmg7ssybcn2qvzgu3tqbkt62uopduyelvkcwxaqv2fc4glkb57.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => convert_element_type_2, var_mean
triton_red_fused__native_batch_norm_legit_functional_2 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@reduction(
    size_hints=[512, 131072],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32', 3: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_2(in_ptr0, out_ptr0, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
    xnumel = 448
    rnumel = 114688
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rbase = tl.arange(0, RBLOCK)[None, :]
    x0 = xindex % 32
    x1 = (xindex // 32)
    _tmp2 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
    x3 = xindex
    for roffset in range(0, rnumel, RBLOCK):
        rindex = roffset + rbase
        rmask = rindex < rnumel
        r2 = rindex
        tmp0 = tl.load(in_ptr0 + ((112*(((r2 + (114688*x1)) // 112) % 112)) + (12544*x0) + (401408*((r2 + (114688*x1)) // 12544)) + (r2 % 112)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp1 = tmp0.to(tl.float32)
        _tmp2 = tl.where(rmask & xmask, _tmp2 + tmp1, _tmp2)
    tmp2 = tl.sum(_tmp2, 1)[:, None]
    tl.store(out_ptr0 + x3, tmp2, xmask)

def get_args():
    arg_0 = rand_strided((128, 32, 112, 112), (401408, 12544, 112, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((1, 32, 1, 1, 14), (448, 1, 448, 448, 32), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1,

def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_red_fused__native_batch_norm_legit_functional_2.run(*args, 448, 114688, grid=grid(448), stream=stream0)

def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_red_fused__native_batch_norm_legit_functional_2.benchmark_all_configs(*args, 448, 114688, grid=grid(448))

if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/ax/caxxuenolffrdcs4fes5eiayfgrbnuyzkml277uyriuhejqsq7k4.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => add_2, convert_element_type_2, mul_1, mul_2, var_mean
triton_per_fused__native_batch_norm_legit_functional_3 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import persistent_reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@persistent_reduction(
    size_hints=[32, 16],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*fp32', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: 'i32', 5: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4), equal_to_1=())]}
)
@triton.jit
def triton_per_fused__native_batch_norm_legit_functional_3(in_out_ptr0, in_ptr0, in_ptr1, out_ptr0, xnumel, rnumel, XBLOCK : tl.constexpr):
    xnumel = 32
    rnumel = 14
    RBLOCK: tl.constexpr = 16
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rindex = tl.arange(0, RBLOCK)[None, :]
    rmask = rindex < rnumel
    r1 = rindex
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0 + (32*r1)), rmask & xmask, other=0)
    tmp8 = tl.load(in_ptr1 + (x0), xmask)
    tmp2 = tl.where(rmask & xmask, tmp0, 0)
    tmp3 = tl.sum(tmp2, 1)[:, None]
    tmp4 = 1605632.0
    tmp5 = tmp3 / tmp4
    tmp6 = 0.1
    tmp7 = tmp5 * tmp6
    tmp9 = 0.9
    tmp10 = tmp8 * tmp9
    tmp11 = tmp7 + tmp10
    tl.store(in_out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp5, xmask)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp11, xmask)

def get_args():
    arg_0 = rand_strided((1, 32, 1, 1), (32, 1, 32, 32), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((1, 32, 1, 1, 14), (448, 1, 448, 448, 32), device='cuda:0', dtype=torch.float32)
    arg_2 = rand_strided((32,), (1,), device='cuda:0', dtype=torch.float32)
    arg_3 = rand_strided((32,), (1,), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1, arg_2, arg_3,

def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_per_fused__native_batch_norm_legit_functional_3.run(*args, 32, 14, grid=grid(32), stream=stream0)

def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_per_fused__native_batch_norm_legit_functional_3.benchmark_all_configs(*args, 32, 14, grid=grid(32))

if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
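# Note: the previous two kernels compute the per-channel mean as a split
# reduction. Each of the 32 channels covers 128 * 112 * 112 = 1,605,632
# elements; kernel 2 splits them into 14 chunks of 114,688 and writes partial
# sums into a (1, 32, 1, 1, 14) buffer, then kernel 3 sums the 14 partials,
# divides by 1,605,632 to get the batch mean, and updates the running mean as
# 0.1 * batch_mean + 0.9 * running_mean (momentum 0.1).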
# kernel path: /tmp/torchinductor_shunting/4s/c4sugoluux54xogsh3dgakggor26tpyllwcenrcu3hdxyyol5rhg.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => convert_element_type_2, var_mean
triton_red_fused__native_batch_norm_legit_functional_4 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@reduction(
    size_hints=[512, 131072],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*fp16', 1: '*fp32', 2: '*fp32', 3: 'i32', 4: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_4(in_ptr0, in_ptr1, out_ptr0, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
    xnumel = 448
    rnumel = 114688
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rbase = tl.arange(0, RBLOCK)[None, :]
    x0 = xindex % 32
    x1 = (xindex // 32)
    tmp2 = tl.load(in_ptr1 + (x0), xmask)
    _tmp5 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
    x3 = xindex
    for roffset in range(0, rnumel, RBLOCK):
        rindex = roffset + rbase
        rmask = rindex < rnumel
        r2 = rindex
        tmp0 = tl.load(in_ptr0 + ((112*(((r2 + (114688*x1)) // 112) % 112)) + (12544*x0) + (401408*((r2 + (114688*x1)) // 12544)) + (r2 % 112)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp1 = tmp0.to(tl.float32)
        tmp3 = tmp1 - tmp2
        tmp4 = tmp3 * tmp3
        _tmp5 = tl.where(rmask & xmask, _tmp5 + tmp4, _tmp5)
    tmp5 = tl.sum(_tmp5, 1)[:, None]
    tl.store(out_ptr0 + x3, tmp5, xmask)

def get_args():
    arg_0 = rand_strided((128, 32, 112, 112), (401408, 12544, 112, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((1, 32, 1, 1), (32, 1, 32, 32), device='cuda:0', dtype=torch.float32)
    arg_2 = rand_strided((1, 32, 1, 1, 14), (448, 1, 448, 448, 32), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1, arg_2,

def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_red_fused__native_batch_norm_legit_functional_4.run(*args, 448, 114688, grid=grid(448), stream=stream0)

def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_red_fused__native_batch_norm_legit_functional_4.benchmark_all_configs(*args, 448, 114688, grid=grid(448))

if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/yz/cyzfzxq5m7mzh26evqlnxbs2f4624mxxzlbzx5v335w7yujg6hp5.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => add_1, add_3, convert_element_type_2, mul_3, mul_4, mul_5, rsqrt, squeeze_1, var_mean
triton_per_fused__native_batch_norm_legit_functional_5 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import persistent_reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@persistent_reduction(
    size_hints=[32, 16],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*fp32', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: 'i32', 6: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5), equal_to_1=())]}
)
@triton.jit
def triton_per_fused__native_batch_norm_legit_functional_5(in_ptr0, in_ptr1, out_ptr0, out_ptr1, out_ptr2, xnumel, rnumel, XBLOCK : tl.constexpr):
    xnumel = 32
    rnumel = 14
    RBLOCK: tl.constexpr = 16
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rindex = tl.arange(0, RBLOCK)[None, :]
    rmask = rindex < rnumel
    r1 = rindex
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0 + (32*r1)), rmask & xmask, other=0)
    tmp13 = tl.load(in_ptr1 + (x0), xmask)
    tmp2 = tl.where(rmask & xmask, tmp0, 0)
    tmp3 = tl.sum(tmp2, 1)[:, None]
    tmp4 = 1605632.0
    tmp5 = tmp3 / tmp4
    tmp6 = 1e-05
    tmp7 = tmp5 + tmp6
    tmp8 = tl.math.rsqrt(tmp7)
    tmp9 = 1.0000006228081046
    tmp10 = tmp5 * tmp9
    tmp11 = 0.1
    tmp12 = tmp10 * tmp11
    tmp14 = 0.9
    tmp15 = tmp13 * tmp14
    tmp16 = tmp12 + tmp15
    tl.store(out_ptr1 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp8, xmask)
    tl.store(out_ptr2 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp16, xmask)
    tl.store(out_ptr0 + x0, tmp3, xmask)

def get_args():
    arg_0 = rand_strided((1, 32, 1, 1, 14), (448, 1, 448, 448, 32), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((32,), (1,), device='cuda:0', dtype=torch.float32)
    arg_2 = rand_strided((1, 32, 1, 1), (32, 1, 32, 32), device='cuda:0', dtype=torch.float32)
    arg_3 = rand_strided((32,), (1,), device='cuda:0', dtype=torch.float32)
    arg_4 = rand_strided((32,), (1,), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1, arg_2, arg_3, arg_4,

def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_per_fused__native_batch_norm_legit_functional_5.run(*args, 32, 14, grid=grid(32), stream=stream0)

def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_per_fused__native_batch_norm_legit_functional_5.benchmark_all_configs(*args, 32, 14, grid=grid(32))

if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
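# Note: kernels 4 and 5 finish the batch-norm statistics. Kernel 4 accumulates
# per-chunk sums of squared deviations from the mean; kernel 5 reduces them,
# divides by 1,605,632 to get the biased variance, computes
# rsqrt(var + 1e-05) for normalization, and scales the variance by
# 1.0000006228081046 = N / (N - 1) = 1605632 / 1605631 (Bessel's correction)
# before folding it into the running variance with momentum 0.1.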
# kernel path: /tmp/torchinductor_shunting/tp/ctp3rh47jmesldsykk2bkq62s2n65nizu7jrip6s664llvntz6jk.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.relu
# aten._native_batch_norm_legit_functional => add_1, add_4, convert_element_type_2, convert_element_type_3, mul, mul_6, rsqrt, sub, var_mean
# aten.relu => relu
triton_poi_fused__native_batch_norm_legit_functional_relu_6 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[67108864], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp16', 6: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6), equal_to_1=())]})
@triton.jit
def triton_poi_fused__native_batch_norm_legit_functional_relu_6(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 51380224
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x3 = xindex
    x1 = (xindex // 12544) % 32
    tmp0 = tl.load(in_ptr0 + (x3), None).to(tl.float32)
    tmp2 = tl.load(in_ptr1 + (x1), None)
    tmp4 = tl.load(in_ptr2 + (x1), None)
    tmp11 = tl.load(in_ptr3 + (x1), None)
    tmp13 = tl.load(in_ptr4 + (x1), None)
    tmp1 = tmp0.to(tl.float32)
    tmp3 = tmp1 - tmp2
    tmp5 = 1605632.0
    tmp6 = tmp4 / tmp5
    tmp7 = 1e-05
    tmp8 = tmp6 + tmp7
    tmp9 = tl.math.rsqrt(tmp8)
    tmp10 = tmp3 * tmp9
    tmp12 = tmp10 * tmp11
    tmp14 = tmp12 + tmp13
    tmp15 = tmp14.to(tl.float32)
    tmp16 = tl.where(0 != 0, 0, tl.where(0 > tmp15, 0, tmp15))
    tl.store(out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp16, None)

def get_args():
    arg_0 = rand_strided((128, 32, 112, 112), (401408, 12544, 112, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((1, 32, 1, 1), (32, 1, 32, 32), device='cuda:0', dtype=torch.float32)
    arg_2 = rand_strided((1, 32, 1, 1), (32, 1, 32, 32), device='cuda:0', dtype=torch.float32)
    arg_3 = rand_strided((32,), (1,), device='cuda:0', dtype=torch.float32)
    arg_4 = rand_strided((32,), (1,), device='cuda:0', dtype=torch.float32)
    arg_5 = rand_strided((128, 32, 112, 112), (401408, 12544, 112, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5,

def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__native_batch_norm_legit_functional_relu_6.run(*args, 51380224, grid=grid(51380224), stream=stream0)

def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__native_batch_norm_legit_functional_relu_6.benchmark_all_configs(*args, 51380224, grid=grid(51380224))

if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
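# Note: the kernel above fuses the batch-norm apply step with the ReLU: it
# normalizes with the freshly computed mean and rsqrt(var + eps), applies the
# affine weight/bias, casts back to fp16, and clamps at zero. The generated
# tl.where(0 != 0, 0, tl.where(0 > tmp15, 0, tmp15)) is just relu(tmp15)
# spelled out by the code generator.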
# kernel path: /tmp/torchinductor_shunting/bt/cbt6gouhluogn74n5bu4s6rx7uqiiw6lwhnqaggedwz4abkhco2b.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_4
triton_poi_fused__to_copy_7 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[512], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_7(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 288
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)

def get_args():
    arg_0 = rand_strided((32, 1, 3, 3), (9, 9, 3, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((32, 1, 3, 3), (9, 9, 3, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,

def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_7.run(*args, 288, grid=grid(288), stream=stream0)

def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_7.benchmark_all_configs(*args, 288, grid=grid(288))

if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/tq/ctqzpnsuriwiat75pazkds4p5xe54pkhlftlkhx4o3jngunwgzub.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_7
triton_poi_fused__to_copy_8 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[1024], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_8(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 1024
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)

def get_args():
    arg_0 = rand_strided((32, 32, 1, 1), (32, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((32, 32, 1, 1), (32, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,

def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_8.run(*args, 1024, grid=grid(1024), stream=stream0)

def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_8.benchmark_all_configs(*args, 1024, grid=grid(1024))

if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/ma/cma4u56mzuidifjz56xk54ie6sgj6png7qekm2tx4cs7sle553zr.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.add
# aten._native_batch_norm_legit_functional => add_11, add_14, convert_element_type_8, convert_element_type_9, mul_14, mul_20, rsqrt_2, sub_2, var_mean_2
# aten.add => add_15
triton_poi_fused__native_batch_norm_legit_functional_add_9 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[67108864], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp16', 6: '*fp16', 7: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7), equal_to_1=())]})
@triton.jit
def triton_poi_fused__native_batch_norm_legit_functional_add_9(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 51380224
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x3 = xindex
    x1 = (xindex // 12544) % 32
    tmp0 = tl.load(in_ptr0 + (x3), None).to(tl.float32)
    tmp2 = tl.load(in_ptr1 + (x1), None)
    tmp4 = tl.load(in_ptr2 + (x1), None)
    tmp11 = tl.load(in_ptr3 + (x1), None)
    tmp13 = tl.load(in_ptr4 + (x1), None)
    tmp16 = tl.load(in_ptr5 + (x3), None).to(tl.float32)
    tmp1 = tmp0.to(tl.float32)
    tmp3 = tmp1 - tmp2
    tmp5 = 1605632.0
    tmp6 = tmp4 / tmp5
    tmp7 = 1e-05
    tmp8 = tmp6 + tmp7
    tmp9 = tl.math.rsqrt(tmp8)
    tmp10 = tmp3 * tmp9
    tmp12 = tmp10 * tmp11
    tmp14 = tmp12 + tmp13
    tmp15 = tmp14.to(tl.float32)
    tmp17 = tmp15 + tmp16
    tl.store(out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp17, None)

def get_args():
    arg_0 = rand_strided((128, 32, 112, 112), (401408, 12544, 112, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((1, 32, 1, 1), (32, 1, 32, 32), device='cuda:0', dtype=torch.float32)
    arg_2 = rand_strided((1, 32, 1, 1), (32, 1, 32, 32), device='cuda:0', dtype=torch.float32)
    arg_3 = rand_strided((32,), (1,), device='cuda:0', dtype=torch.float32)
    arg_4 = rand_strided((32,), (1,), device='cuda:0', dtype=torch.float32)
    arg_5 = rand_strided((128, 32, 112, 112), (401408, 12544, 112, 1), device='cuda:0', dtype=torch.float16)
    arg_6 = rand_strided((128, 32, 112, 112), (401408, 12544, 112, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6,

def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__native_batch_norm_legit_functional_add_9.run(*args, 51380224, grid=grid(51380224), stream=stream0)

def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__native_batch_norm_legit_functional_add_9.benchmark_all_configs(*args, 51380224, grid=grid(51380224))

if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/mf/cmfq2hg7iduys7x4z6irkzuujylw3e7gj4ysdrcbo5cemukrg3t5.py
# Original ATen: aten.split_with_sizes
# aten.split_with_sizes => getitem_6
triton_poi_fused_split_with_sizes_10 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[33554432], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_split_with_sizes_10(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 25690112
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex % 200704
    x1 = (xindex // 200704)
    x2 = xindex
    tmp0 = tl.load(in_ptr0 + (x0 + (401408*x1)), None).to(tl.float32)
    tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp0, None)

def get_args():
    arg_0 = rand_strided((128, 32, 112, 112), (401408, 12544, 112, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 16, 112, 112), (200704, 12544, 112, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,

def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused_split_with_sizes_10.run(*args, 25690112, grid=grid(25690112), stream=stream0)

def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused_split_with_sizes_10.benchmark_all_configs(*args, 25690112, grid=grid(25690112))

if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/lr/clrdqz4vueql2no5zrrsw4bgmmsgoov4oautycbtlc4q5ypfrpxe.py
# Original ATen: aten.split_with_sizes
# aten.split_with_sizes => getitem_7
triton_poi_fused_split_with_sizes_11 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[33554432], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_split_with_sizes_11(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 25690112
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex % 200704
    x1 = (xindex // 200704)
    x2 = xindex
    tmp0 = tl.load(in_ptr0 + (200704 + x0 + (401408*x1)), None).to(tl.float32)
    tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp0, None)

def get_args():
    arg_0 = rand_strided((128, 32, 112, 112), (401408, 12544, 112, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 16, 112, 112), (200704, 12544, 112, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,

def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused_split_with_sizes_11.run(*args, 25690112, grid=grid(25690112), stream=stream0)

def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused_split_with_sizes_11.benchmark_all_configs(*args, 25690112, grid=grid(25690112))

if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
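# Note: the two split_with_sizes kernels above each copy one 16-channel half
# of a (128, 32, 112, 112) fp16 tensor into a contiguous (128, 16, 112, 112)
# output; the second half starts at element offset 200704 = 16 * 112 * 112
# within each batch item.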
# kernel path: /tmp/torchinductor_shunting/wk/cwkoecpvylkd2zd4p2qzrsxvdjpmlhjy2vqltsena3lm7tbpxnqv.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_10
triton_poi_fused__to_copy_12 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[2048], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_12(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 1536
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)

def get_args():
    arg_0 = rand_strided((96, 16, 1, 1), (16, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((96, 16, 1, 1), (16, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,

def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_12.run(*args, 1536, grid=grid(1536), stream=stream0)

def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_12.benchmark_all_configs(*args, 1536, grid=grid(1536))

if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/yj/cyjbfwxbzmq4snbyqnu5ehgb3xenlxtbixoczmyjqmdjbuu22kqa.py
# Original ATen: aten.cat
# aten.cat => cat
triton_poi_fused_cat_13 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[268435456], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_cat_13(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 154140672
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x2 = xindex
    x0 = xindex % 1204224
    x1 = (xindex // 1204224)
    tmp0 = tl.load(in_ptr0 + (x2), None).to(tl.float32)
    tl.store(out_ptr0 + (x0 + (2408448*x1) + tl.zeros([XBLOCK], tl.int32)), tmp0, None)

def get_args():
    arg_0 = rand_strided((128, 96, 112, 112), (1204224, 12544, 112, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 96, 112, 112), (2408448, 12544, 112, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,

def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused_cat_13.run(*args, 154140672, grid=grid(154140672), stream=stream0)

def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused_cat_13.benchmark_all_configs(*args, 154140672, grid=grid(154140672))

if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
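# Note: the cat kernel above writes a contiguous (128, 96, 112, 112) tensor
# into the first 96 channels of a (128, 192, 112, 112) output (per-batch
# stride 2408448 = 2 * 1204224); the remaining channels are presumably filled
# by another branch of the concatenation outside this excerpt.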
# kernel path: /tmp/torchinductor_shunting/nh/cnh3nbcj2c5nraypyukh7gkg7ocvsbjrgj7ok7wc472nt34wy65s.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => convert_element_type_12, var_mean_3
triton_red_fused__native_batch_norm_legit_functional_14 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@reduction(
    size_hints=[4096, 131072],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32', 3: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_14(in_ptr0, out_ptr0, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
    xnumel = 2496
    rnumel = 123511
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rbase = tl.arange(0, RBLOCK)[None, :]
    x1 = (xindex // 192)
    x0 = xindex % 192
    _tmp6 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
    x3 = xindex
    for roffset in range(0, rnumel, RBLOCK):
        rindex = roffset + rbase
        rmask = rindex < rnumel
        r2 = rindex
        tmp0 = r2 + (123511*x1)
        tmp1 = 1605632
        tmp2 = tmp0 < tmp1
        tmp3 = tl.load(in_ptr0 + ((12544*x0) + (2408448*(((r2 + (123511*x1)) // 12544) % 128)) + ((r2 + (123511*x1)) % 12544) + tl.zeros([XBLOCK, RBLOCK], tl.int32)), rmask & tmp2 & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp4 = tmp3.to(tl.float32)
        tmp5 = tl.where(tmp2, tmp4, 0)
        _tmp6 = tl.where(rmask & xmask, _tmp6 + tmp5, _tmp6)
    tmp6 = tl.sum(_tmp6, 1)[:, None]
    tl.store(out_ptr0 + x3, tmp6, xmask)

def get_args():
    arg_0 = rand_strided((128, 192, 112, 112), (2408448, 12544, 112, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((1, 192, 1, 1, 13), (2496, 1, 2496, 2496, 192), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1,

def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_red_fused__native_batch_norm_legit_functional_14.run(*args, 2496, 123511, grid=grid(2496), stream=stream0)

def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_red_fused__native_batch_norm_legit_functional_14.benchmark_all_configs(*args, 2496, 123511, grid=grid(2496))

if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
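# Note: kernel 14 reduces over 128 * 112 * 112 = 1,605,632 elements per
# channel using 13 chunks of 123,511. Since 13 * 123,511 = 1,605,643 slightly
# overshoots, each iteration carries the extra predicate
# tmp2 = (r2 + 123511*x1) < 1605632 to mask out the 11 out-of-range slots.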
# kernel path: /tmp/torchinductor_shunting/ed/cedbp3dhcc6xvctcve537pwhmiwi2ebf5oi5f5bppm5whslbp34w.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => add_18, convert_element_type_12, mul_22, mul_23, var_mean_3
triton_per_fused__native_batch_norm_legit_functional_15 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import persistent_reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@persistent_reduction(
    size_hints=[256, 16],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*fp32', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: 'i32', 5: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4), equal_to_1=())]}
)
@triton.jit
def triton_per_fused__native_batch_norm_legit_functional_15(in_out_ptr0, in_ptr0, in_ptr1, out_ptr0, xnumel, rnumel, XBLOCK : tl.constexpr):
    xnumel = 192
    rnumel = 13
    RBLOCK: tl.constexpr = 16
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rindex = tl.arange(0, RBLOCK)[None, :]
    rmask = rindex < rnumel
    r1 = rindex
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0 + (192*r1)), rmask & xmask, other=0)
    tmp8 = tl.load(in_ptr1 + (x0), xmask)
    tmp2 = tl.where(rmask & xmask, tmp0, 0)
    tmp3 = tl.sum(tmp2, 1)[:, None]
    tmp4 = 1605632.0
    tmp5 = tmp3 / tmp4
    tmp6 = 0.1
    tmp7 = tmp5 * tmp6
    tmp9 = 0.9
    tmp10 = tmp8 * tmp9
    tmp11 = tmp7 + tmp10
    tl.store(in_out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp5, xmask)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp11, xmask)

def get_args():
    arg_0 = rand_strided((1, 192, 1, 1), (192, 1, 192, 192), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((1, 192, 1, 1, 13), (2496, 1, 2496, 2496, 192), device='cuda:0', dtype=torch.float32)
    arg_2 = rand_strided((192,), (1,), device='cuda:0', dtype=torch.float32)
    arg_3 = rand_strided((192,), (1,), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1, arg_2, arg_3,

def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_per_fused__native_batch_norm_legit_functional_15.run(*args, 192, 13, grid=grid(192), stream=stream0)

def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_per_fused__native_batch_norm_legit_functional_15.benchmark_all_configs(*args, 192, 13, grid=grid(192))

if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/jk/cjk2vm3446xrk7rth7hr6pun7xxo3dnzubwcn6ydrpifal4eykrz.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => convert_element_type_12, var_mean_3
triton_red_fused__native_batch_norm_legit_functional_16 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@reduction(
    size_hints=[4096, 131072],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*fp16', 1: '*fp32', 2: '*fp32', 3: 'i32', 4: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_16(in_ptr0, in_ptr1, out_ptr0, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
    xnumel = 2496
    rnumel = 123511
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rbase = tl.arange(0, RBLOCK)[None, :]
    x1 = (xindex // 192)
    x0 = xindex % 192
    _tmp9 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
    x3 = xindex
    for roffset in range(0, rnumel, RBLOCK):
        rindex = roffset + rbase
        rmask = rindex < rnumel
        r2 = rindex
        tmp0 = r2 + (123511*x1)
        tmp1 = 1605632
        tmp2 = tmp0 < tmp1
        tmp3 = tl.load(in_ptr0 + ((12544*x0) + (2408448*(((r2 + (123511*x1)) // 12544) % 128)) + ((r2 + (123511*x1)) % 12544) + tl.zeros([XBLOCK, RBLOCK], tl.int32)), rmask & tmp2 & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp4 = tmp3.to(tl.float32)
        tmp5 = tl.load(in_ptr1 + (x0 + tl.zeros([XBLOCK, RBLOCK], tl.int32)), rmask & tmp2 & xmask, eviction_policy='evict_last', other=0)
        tmp6 = tmp4 - tmp5
        tmp7 = tmp6 * tmp6
        tmp8 = tl.where(tmp2, tmp7, 0)
        _tmp9 = tl.where(rmask & xmask, _tmp9 + tmp8, _tmp9)
    tmp9 = tl.sum(_tmp9, 1)[:, None]
    tl.store(out_ptr0 + x3, tmp9, xmask)

def get_args():
    arg_0 = rand_strided((128, 192, 112, 112), (2408448, 12544, 112, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((1, 192, 1, 1), (192, 1, 192, 192), device='cuda:0', dtype=torch.float32)
    arg_2 = rand_strided((1, 192, 1, 1, 13), (2496, 1, 2496, 2496, 192), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1, arg_2,

def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_red_fused__native_batch_norm_legit_functional_16.run(*args, 2496, 123511, grid=grid(2496), stream=stream0)

def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_red_fused__native_batch_norm_legit_functional_16.benchmark_all_configs(*args, 2496, 123511, grid=grid(2496))

if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/r2/cr2l4iahxzbm4xda53lhc7nels7ttppa4wfso67ohv6lwu4ut2ei.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => add_17, add_19, convert_element_type_12, mul_24, mul_25, mul_26, rsqrt_3, squeeze_10, var_mean_3
triton_per_fused__native_batch_norm_legit_functional_17 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import persistent_reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@persistent_reduction(
    size_hints=[256, 16],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*fp32', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: 'i32', 6: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5), equal_to_1=())]}
)
@triton.jit
def triton_per_fused__native_batch_norm_legit_functional_17(in_ptr0, in_ptr1, out_ptr0, out_ptr1, out_ptr2, xnumel, rnumel, XBLOCK : tl.constexpr):
    xnumel = 192
    rnumel = 13
    RBLOCK: tl.constexpr = 16
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rindex = tl.arange(0, RBLOCK)[None, :]
    rmask = rindex < rnumel
    r1 = rindex
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0 + (192*r1)), rmask & xmask, other=0)
    tmp13 = tl.load(in_ptr1 + (x0), xmask)
    tmp2 = tl.where(rmask & xmask, tmp0, 0)
    tmp3 = tl.sum(tmp2, 1)[:, None]
    tmp4 = 1605632.0
    tmp5 = tmp3 / tmp4
    tmp6 = 1e-05
    tmp7 = tmp5 + tmp6
    tmp8 = tl.math.rsqrt(tmp7)
    tmp9 = 1.0000006228081046
    tmp10 = tmp5 * tmp9
    tmp11 = 0.1
    tmp12 = tmp10 * tmp11
    tmp14 = 0.9
    tmp15 = tmp13 * tmp14
    tmp16 = tmp12 + tmp15
    tl.store(out_ptr1 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp8, xmask)
    tl.store(out_ptr2 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp16, xmask)
    tl.store(out_ptr0 + x0, tmp3, xmask)

def get_args():
    arg_0 = rand_strided((1, 192, 1, 1, 13), (2496, 1, 2496, 2496, 192), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((192,), (1,), device='cuda:0', dtype=torch.float32)
    arg_2 = rand_strided((1, 192, 1, 1), (192, 1, 192, 192), device='cuda:0', dtype=torch.float32)
    arg_3 = rand_strided((192,), (1,), device='cuda:0', dtype=torch.float32)
    arg_4 = rand_strided((192,), (1,), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1, arg_2, arg_3, arg_4,

def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_per_fused__native_batch_norm_legit_functional_17.run(*args, 192, 13, grid=grid(192), stream=stream0)

def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_per_fused__native_batch_norm_legit_functional_17.benchmark_all_configs(*args, 192, 13, grid=grid(192))

if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/7k/c7k35ubu25j3ryiaxnuo2fjojvhqvy67tox3wtaa4knc5l4vzqby.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.relu, aten.threshold_backward
# aten._native_batch_norm_legit_functional => add_17, add_20, convert_element_type_12, convert_element_type_13, mul_21, mul_27, rsqrt_3, sub_3, var_mean_3
# aten.relu => relu_2
# aten.threshold_backward => le_4
triton_poi_fused__native_batch_norm_legit_functional_relu_threshold_backward_18 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[536870912], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp16', 6: '*i1', 7: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7), equal_to_1=())]})
@triton.jit
def triton_poi_fused__native_batch_norm_legit_functional_relu_threshold_backward_18(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, out_ptr0, out_ptr1, xnumel, XBLOCK : tl.constexpr):
    xnumel = 308281344
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x3 = xindex
    x1 = (xindex // 12544) % 192
    tmp0 = tl.load(in_ptr0 + (x3), None).to(tl.float32)
    tmp2 = tl.load(in_ptr1 + (x1), None)
    tmp4 = tl.load(in_ptr2 + (x1), None)
    tmp11 = tl.load(in_ptr3 + (x1), None)
    tmp13 = tl.load(in_ptr4 + (x1), None)
    tmp1 = tmp0.to(tl.float32)
    tmp3 = tmp1 - tmp2
    tmp5 = 1605632.0
    tmp6 = tmp4 / tmp5
    tmp7 = 1e-05
    tmp8 = tmp6 + tmp7
    tmp9 = tl.math.rsqrt(tmp8)
    tmp10 = tmp3 * tmp9
    tmp12 = tmp10 * tmp11
    tmp14 = tmp12 + tmp13
    tmp15 = tmp14.to(tl.float32)
    tmp16 = tl.where(0 != 0, 0, tl.where(0 > tmp15, 0, tmp15))
    tmp17 = 0.0
    tmp18 = tmp16 <= tmp17
    tl.store(out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp16, None)
    tl.store(out_ptr1 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp18, None)

def get_args():
    arg_0 = rand_strided((128, 192, 112, 112), (2408448, 12544, 112, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((1, 192, 1, 1), (192, 1, 192, 192), device='cuda:0', dtype=torch.float32)
    arg_2 = rand_strided((1, 192, 1, 1), (192, 1, 192, 192), device='cuda:0', dtype=torch.float32)
    arg_3 = rand_strided((192,), (1,), device='cuda:0', dtype=torch.float32)
    arg_4 = rand_strided((192,), (1,), device='cuda:0', dtype=torch.float32)
    arg_5 = rand_strided((128, 192, 112, 112), (2408448, 12544, 112, 1), device='cuda:0', dtype=torch.float16)
    arg_6 = rand_strided((128, 192, 112, 112), (2408448, 12544, 112, 1), device='cuda:0', dtype=torch.bool)
    return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6,

def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__native_batch_norm_legit_functional_relu_threshold_backward_18.run(*args, 308281344, grid=grid(308281344), stream=stream0)

def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__native_batch_norm_legit_functional_relu_threshold_backward_18.benchmark_all_configs(*args, 308281344, grid=grid(308281344))

if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
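# Note: the kernel above additionally materializes the ReLU backward mask
# (aten.threshold_backward => le_4): alongside the fp16 activation it stores a
# boolean tensor marking where the output is <= 0, which the backward pass
# uses to zero the incoming gradients.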
# kernel path: /tmp/torchinductor_shunting/db/cdbk64vwqs4pzrtrb762fxgfklb7q573zm3neisxuhpcneczshvk.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_14
triton_poi_fused__to_copy_19 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[1024], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_19(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 576
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)

def get_args():
    arg_0 = rand_strided((64, 1, 3, 3), (9, 9, 3, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((64, 1, 3, 3), (9, 9, 3, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,

def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_19.run(*args, 576, grid=grid(576), stream=stream0)

def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_19.benchmark_all_configs(*args, 576, grid=grid(576))

if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/fq/cfq5oxsb4jr7dbzocvxxr77nzkg2jk4k3rsmhwdpzkoisxiavav5.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_15
triton_poi_fused__to_copy_20 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[2048], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_20(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 1600
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((64, 1, 5, 5), (25, 25, 5, 1), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((64, 1, 5, 5), (25, 25, 5, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_20.run(*args, 1600, grid=grid(1600), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_20.benchmark_all_configs(*args, 1600, grid=grid(1600))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/fo/cfoj5fwz53k7xkfcu6z676snb5mum4wtda3diyiuxkkhrnlissnf.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_16
triton_poi_fused__to_copy_21 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4096], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_21(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 3136
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((64, 1, 7, 7), (49, 49, 7, 1), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((64, 1, 7, 7), (49, 49, 7, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_21.run(*args, 3136, grid=grid(3136), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_21.benchmark_all_configs(*args, 3136, grid=grid(3136))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/ye/cyesscl4aqq7rzyjhll5ppohr4h424gcitducugsmauai5wnvgy5.py
# Original ATen: aten.cat
# aten.cat => cat_1
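# One branch of a channel concat: the (128, 64, 56, 56) fp16 input is copied into its
# 64-channel slice of a preallocated (128, 192, 56, 56) output, which is why the
# destination advances by 602112 (= 192*3136) per image while the source stays
# contiguous. Roughly, with a hypothetical channel offset c0 baked into out_ptr0:
#   out[:, c0:c0+64] = inp
# The later cat kernels (cat_2, cat_3) follow the same pattern at other widths.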
triton_poi_fused_cat_22 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[33554432], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_cat_22(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 25690112
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x2 = xindex
x0 = xindex % 200704
x1 = (xindex // 200704)
tmp0 = tl.load(in_ptr0 + (x2), None).to(tl.float32)
tl.store(out_ptr0 + (x0 + (602112*x1) + tl.zeros([XBLOCK], tl.int32)), tmp0, None)
def get_args():
arg_0 = rand_strided((128, 64, 56, 56), (200704, 3136, 56, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 64, 56, 56), (602112, 3136, 56, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_cat_22.run(*args, 25690112, grid=grid(25690112), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_cat_22.benchmark_all_configs(*args, 25690112, grid=grid(25690112))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/zm/czmjlzsknic276jj2ulhzekdx7suavkcesp75xqpnfbntqprzsrx.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => convert_element_type_17, var_mean_4
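# Stage 1 of a two-stage batch-norm mean reduction over N*H*W = 128*56*56 = 401408
# elements per channel. The reduction is split 4 ways (xnumel = 768 = 192 channels *
# 4 splits, rnumel = 100352 = 401408/4); each program accumulates a partial sum into
# the (1, 192, 1, 1, 4) buffer, and the next kernel combines the 4 partials.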
triton_red_fused__native_batch_norm_legit_functional_23 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@reduction(
size_hints=[1024, 131072],
reduction_hint=ReductionHint.INNER,
filename=__file__,
meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32', 3: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_23(in_ptr0, out_ptr0, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
xnumel = 768
rnumel = 100352
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
xmask = xindex < xnumel
rbase = tl.arange(0, RBLOCK)[None, :]
x0 = xindex % 192
x1 = (xindex // 192)
_tmp2 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
x3 = xindex
for roffset in range(0, rnumel, RBLOCK):
rindex = roffset + rbase
rmask = rindex < rnumel
r2 = rindex
tmp0 = tl.load(in_ptr0 + ((3136*x0) + (602112*(r2 // 3136)) + (19267584*x1) + (r2 % 3136)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
_tmp2 = tl.where(rmask & xmask, _tmp2 + tmp1, _tmp2)
tmp2 = tl.sum(_tmp2, 1)[:, None]
tl.store(out_ptr0 + x3, tmp2, xmask)
def get_args():
arg_0 = rand_strided((128, 192, 56, 56), (602112, 3136, 56, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((1, 192, 1, 1, 4), (768, 1, 768, 768, 192), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_red_fused__native_batch_norm_legit_functional_23.run(*args, 768, 100352, grid=grid(768), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_red_fused__native_batch_norm_legit_functional_23.benchmark_all_configs(*args, 768, 100352, grid=grid(768))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/zp/czpkjhwvqvlhuxsldhiwsf77gmyfxqbtysyyhp7lberdnzehm42l.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => add_23, convert_element_type_17, mul_29, mul_30, var_mean_4
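# Stage 2: combine the 4 partial sums per channel, divide by 401408 to get the batch
# mean (written in place through in_out_ptr0), and fold it into the running mean with
# momentum 0.1, matching BatchNorm's update rule:
#   running_mean = 0.1 * batch_mean + 0.9 * running_mean
# The same combine/update pair repeats below for the 40- and 120-channel layers.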
triton_per_fused__native_batch_norm_legit_functional_24 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import persistent_reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@persistent_reduction(
size_hints=[256, 4],
reduction_hint=ReductionHint.INNER,
filename=__file__,
meta={'signature': {0: '*fp32', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: 'i32', 5: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4), equal_to_1=())]}
)
@triton.jit
def triton_per_fused__native_batch_norm_legit_functional_24(in_out_ptr0, in_ptr0, in_ptr1, out_ptr0, xnumel, rnumel, XBLOCK : tl.constexpr):
xnumel = 192
rnumel = 4
RBLOCK: tl.constexpr = 4
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
xmask = xindex < xnumel
rindex = tl.arange(0, RBLOCK)[None, :]
rmask = rindex < rnumel
r1 = rindex
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0 + (192*r1)), rmask & xmask, other=0)
tmp8 = tl.load(in_ptr1 + (x0), xmask)
tmp2 = tl.where(rmask & xmask, tmp0, 0)
tmp3 = tl.sum(tmp2, 1)[:, None]
tmp4 = 401408.0
tmp5 = tmp3 / tmp4
tmp6 = 0.1
tmp7 = tmp5 * tmp6
tmp9 = 0.9
tmp10 = tmp8 * tmp9
tmp11 = tmp7 + tmp10
tl.store(in_out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp5, xmask)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp11, xmask)
def get_args():
arg_0 = rand_strided((1, 192, 1, 1), (192, 1, 192, 192), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((1, 192, 1, 1, 4), (768, 1, 768, 768, 192), device='cuda:0', dtype=torch.float32)
arg_2 = rand_strided((192,), (1,), device='cuda:0', dtype=torch.float32)
arg_3 = rand_strided((192,), (1,), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1, arg_2, arg_3,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_per_fused__native_batch_norm_legit_functional_24.run(*args, 192, 4, grid=grid(192), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_per_fused__native_batch_norm_legit_functional_24.benchmark_all_configs(*args, 192, 4, grid=grid(192))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/4o/c4ohubk3q7bkigrpxxijrzzlhs5bqi6qcn3fxomltgipjmowqgd2.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => convert_element_type_17, var_mean_4
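# Variance counterpart of the stage-1 reduction above: with the batch mean now
# available in in_ptr1, each program accumulates a partial sum of (x - mean)^2 per
# (channel, split) pair into the same (1, 192, 1, 1, 4) layout.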
triton_red_fused__native_batch_norm_legit_functional_25 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@reduction(
size_hints=[1024, 131072],
reduction_hint=ReductionHint.INNER,
filename=__file__,
meta={'signature': {0: '*fp16', 1: '*fp32', 2: '*fp32', 3: 'i32', 4: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_25(in_ptr0, in_ptr1, out_ptr0, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
xnumel = 768
rnumel = 100352
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
xmask = xindex < xnumel
rbase = tl.arange(0, RBLOCK)[None, :]
x0 = xindex % 192
x1 = (xindex // 192)
tmp2 = tl.load(in_ptr1 + (x0), xmask)
_tmp5 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
x3 = xindex
for roffset in range(0, rnumel, RBLOCK):
rindex = roffset + rbase
rmask = rindex < rnumel
r2 = rindex
tmp0 = tl.load(in_ptr0 + ((3136*x0) + (602112*(r2 // 3136)) + (19267584*x1) + (r2 % 3136)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
tmp3 = tmp1 - tmp2
tmp4 = tmp3 * tmp3
_tmp5 = tl.where(rmask & xmask, _tmp5 + tmp4, _tmp5)
tmp5 = tl.sum(_tmp5, 1)[:, None]
tl.store(out_ptr0 + x3, tmp5, xmask)
def get_args():
arg_0 = rand_strided((128, 192, 56, 56), (602112, 3136, 56, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((1, 192, 1, 1), (192, 1, 192, 192), device='cuda:0', dtype=torch.float32)
arg_2 = rand_strided((1, 192, 1, 1, 4), (768, 1, 768, 768, 192), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1, arg_2,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_red_fused__native_batch_norm_legit_functional_25.run(*args, 768, 100352, grid=grid(768), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_red_fused__native_batch_norm_legit_functional_25.benchmark_all_configs(*args, 768, 100352, grid=grid(768))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/kw/ckworjll2g6m5sbalty5hh27jhwuiyj3iqglt67oatdxz54d3igy.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => add_22, add_24, convert_element_type_17, mul_31, mul_32, mul_33, rsqrt_4, squeeze_13, var_mean_4
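# Stage 2 for the variance: tmp3 is the total sum of squared deviations, so
# tmp5 = tmp3 / 401408 is the biased batch variance and out_ptr1 receives
# rsqrt(var + 1e-05) for normalization. The running variance tracks the unbiased
# estimate, hence the Bessel correction 1.0000024912370735 = n/(n-1) with n = 401408:
#   running_var = 0.1 * (var * n/(n-1)) + 0.9 * running_var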
triton_per_fused__native_batch_norm_legit_functional_26 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import persistent_reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@persistent_reduction(
size_hints=[256, 4],
reduction_hint=ReductionHint.INNER,
filename=__file__,
meta={'signature': {0: '*fp32', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: 'i32', 6: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5), equal_to_1=())]}
)
@triton.jit
def triton_per_fused__native_batch_norm_legit_functional_26(in_ptr0, in_ptr1, out_ptr0, out_ptr1, out_ptr2, xnumel, rnumel, XBLOCK : tl.constexpr):
xnumel = 192
rnumel = 4
RBLOCK: tl.constexpr = 4
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
xmask = xindex < xnumel
rindex = tl.arange(0, RBLOCK)[None, :]
rmask = rindex < rnumel
r1 = rindex
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0 + (192*r1)), rmask & xmask, other=0)
tmp13 = tl.load(in_ptr1 + (x0), xmask)
tmp2 = tl.where(rmask & xmask, tmp0, 0)
tmp3 = tl.sum(tmp2, 1)[:, None]
tmp4 = 401408.0
tmp5 = tmp3 / tmp4
tmp6 = 1e-05
tmp7 = tmp5 + tmp6
tmp8 = tl.math.rsqrt(tmp7)
tmp9 = 1.0000024912370735
tmp10 = tmp5 * tmp9
tmp11 = 0.1
tmp12 = tmp10 * tmp11
tmp14 = 0.9
tmp15 = tmp13 * tmp14
tmp16 = tmp12 + tmp15
tl.store(out_ptr1 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp8, xmask)
tl.store(out_ptr2 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp16, xmask)
tl.store(out_ptr0 + x0, tmp3, xmask)
def get_args():
arg_0 = rand_strided((1, 192, 1, 1, 4), (768, 1, 768, 768, 192), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((192,), (1,), device='cuda:0', dtype=torch.float32)
arg_2 = rand_strided((1, 192, 1, 1), (192, 1, 192, 192), device='cuda:0', dtype=torch.float32)
arg_3 = rand_strided((192,), (1,), device='cuda:0', dtype=torch.float32)
arg_4 = rand_strided((192,), (1,), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1, arg_2, arg_3, arg_4,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_per_fused__native_batch_norm_legit_functional_26.run(*args, 192, 4, grid=grid(192), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_per_fused__native_batch_norm_legit_functional_26.benchmark_all_configs(*args, 192, 4, grid=grid(192))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/rb/crbx6irdjcatfhmyhdhxdmxoun65x4lirw3usotlxxknzb3rnzo3.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.relu, aten.threshold_backward
# aten._native_batch_norm_legit_functional => add_22, add_25, convert_element_type_17, convert_element_type_18, mul_28, mul_34, rsqrt_4, sub_4, var_mean_4
# aten.relu => relu_3
# aten.threshold_backward => le_3
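# The fused elementwise tail of the batch norm: normalize with the per-channel mean
# and rsqrt(sum_sq/401408 + eps), apply the affine weight/bias, take the ReLU, and
# also store the (output <= 0) bool mask that threshold_backward later uses to gate
# gradients. The tl.where(0 != 0, 0, tl.where(0 > x, 0, x)) idiom is simply the
# generated spelling of max(x, 0).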
triton_poi_fused__native_batch_norm_legit_functional_relu_threshold_backward_27 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[134217728], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp16', 6: '*i1', 7: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7), equal_to_1=())]})
@triton.jit
def triton_poi_fused__native_batch_norm_legit_functional_relu_threshold_backward_27(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, out_ptr0, out_ptr1, xnumel, XBLOCK : tl.constexpr):
xnumel = 77070336
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x3 = xindex
x1 = (xindex // 3136) % 192
tmp0 = tl.load(in_ptr0 + (x3), None).to(tl.float32)
tmp2 = tl.load(in_ptr1 + (x1), None)
tmp4 = tl.load(in_ptr2 + (x1), None)
tmp11 = tl.load(in_ptr3 + (x1), None)
tmp13 = tl.load(in_ptr4 + (x1), None)
tmp1 = tmp0.to(tl.float32)
tmp3 = tmp1 - tmp2
tmp5 = 401408.0
tmp6 = tmp4 / tmp5
tmp7 = 1e-05
tmp8 = tmp6 + tmp7
tmp9 = tl.math.rsqrt(tmp8)
tmp10 = tmp3 * tmp9
tmp12 = tmp10 * tmp11
tmp14 = tmp12 + tmp13
tmp15 = tmp14.to(tl.float32)
tmp16 = tl.where(0 != 0, 0, tl.where(0 > tmp15, 0, tmp15))
tmp17 = 0.0
tmp18 = tmp16 <= tmp17
tl.store(out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp16, None)
tl.store(out_ptr1 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp18, None)
def get_args():
arg_0 = rand_strided((128, 192, 56, 56), (602112, 3136, 56, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((1, 192, 1, 1), (192, 1, 192, 192), device='cuda:0', dtype=torch.float32)
arg_2 = rand_strided((1, 192, 1, 1), (192, 1, 192, 192), device='cuda:0', dtype=torch.float32)
arg_3 = rand_strided((192,), (1,), device='cuda:0', dtype=torch.float32)
arg_4 = rand_strided((192,), (1,), device='cuda:0', dtype=torch.float32)
arg_5 = rand_strided((128, 192, 56, 56), (602112, 3136, 56, 1), device='cuda:0', dtype=torch.float16)
arg_6 = rand_strided((128, 192, 56, 56), (602112, 3136, 56, 1), device='cuda:0', dtype=torch.bool)
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__native_batch_norm_legit_functional_relu_threshold_backward_27.run(*args, 77070336, grid=grid(77070336), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__native_batch_norm_legit_functional_relu_threshold_backward_27.benchmark_all_configs(*args, 77070336, grid=grid(77070336))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/yr/cyrrsvkikbwrkovp7p3kp54xrx3kv5k3cuej452rtgigaetgnkzf.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_19
triton_poi_fused__to_copy_28 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[2048], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_28(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 1920
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((20, 96, 1, 1), (96, 1, 1, 1), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((20, 96, 1, 1), (96, 1, 1, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_28.run(*args, 1920, grid=grid(1920), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_28.benchmark_all_configs(*args, 1920, grid=grid(1920))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/lx/clxgqhjnos6ucboracnh3kmrmn5mwuc3sgkffihrftfhuwl64wg2.py
# Original ATen: aten.cat
# aten.cat => cat_2
triton_poi_fused_cat_29 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[8388608], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_cat_29(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 8028160
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x2 = xindex
x0 = xindex % 62720
x1 = (xindex // 62720)
tmp0 = tl.load(in_ptr0 + (x2), None).to(tl.float32)
tl.store(out_ptr0 + (x0 + (125440*x1) + tl.zeros([XBLOCK], tl.int32)), tmp0, None)
def get_args():
arg_0 = rand_strided((128, 20, 56, 56), (62720, 3136, 56, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 20, 56, 56), (125440, 3136, 56, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_cat_29.run(*args, 8028160, grid=grid(8028160), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_cat_29.benchmark_all_configs(*args, 8028160, grid=grid(8028160))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/e3/ce3edr4giy7ygyubb445dsh6wavenlh7yhdxr5odchwzb5blm45m.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => convert_element_type_21, var_mean_5
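# Same stage-1 mean reduction, but here the 401408 elements per channel are split 13
# ways over 40 channels (xnumel = 520 = 40*13, rnumel = 30878). Since 13*30878 =
# 401414 slightly overshoots, each element index is bounds-checked against 401408
# (tmp2) before being accumulated.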
triton_red_fused__native_batch_norm_legit_functional_30 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@reduction(
size_hints=[1024, 32768],
reduction_hint=ReductionHint.INNER,
filename=__file__,
meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32', 3: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_30(in_ptr0, out_ptr0, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
xnumel = 520
rnumel = 30878
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
xmask = xindex < xnumel
rbase = tl.arange(0, RBLOCK)[None, :]
x1 = (xindex // 40)
x0 = xindex % 40
_tmp6 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
x3 = xindex
for roffset in range(0, rnumel, RBLOCK):
rindex = roffset + rbase
rmask = rindex < rnumel
r2 = rindex
tmp0 = r2 + (30878*x1)
tmp1 = 401408
tmp2 = tmp0 < tmp1
tmp3 = tl.load(in_ptr0 + ((3136*x0) + (125440*(((r2 + (30878*x1)) // 3136) % 128)) + ((r2 + (30878*x1)) % 3136) + tl.zeros([XBLOCK, RBLOCK], tl.int32)), rmask & tmp2 & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp4 = tmp3.to(tl.float32)
tmp5 = tl.where(tmp2, tmp4, 0)
_tmp6 = tl.where(rmask & xmask, _tmp6 + tmp5, _tmp6)
tmp6 = tl.sum(_tmp6, 1)[:, None]
tl.store(out_ptr0 + x3, tmp6, xmask)
def get_args():
arg_0 = rand_strided((128, 40, 56, 56), (125440, 3136, 56, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((1, 40, 1, 1, 13), (520, 1, 520, 520, 40), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_red_fused__native_batch_norm_legit_functional_30.run(*args, 520, 30878, grid=grid(520), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_red_fused__native_batch_norm_legit_functional_30.benchmark_all_configs(*args, 520, 30878, grid=grid(520))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/5l/c5ldagjb25qlt4w6hnm2s5f7x7ddcw2evdosdg3pvhtpoduxelhs.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => add_28, convert_element_type_21, mul_36, mul_37, var_mean_5
triton_per_fused__native_batch_norm_legit_functional_31 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import persistent_reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@persistent_reduction(
size_hints=[64, 16],
reduction_hint=ReductionHint.INNER,
filename=__file__,
meta={'signature': {0: '*fp32', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: 'i32', 5: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3), equal_to_1=())]}
)
@triton.jit
def triton_per_fused__native_batch_norm_legit_functional_31(in_out_ptr0, in_ptr0, in_ptr1, out_ptr0, xnumel, rnumel, XBLOCK : tl.constexpr):
xnumel = 40
rnumel = 13
RBLOCK: tl.constexpr = 16
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
xmask = xindex < xnumel
rindex = tl.arange(0, RBLOCK)[None, :]
rmask = rindex < rnumel
r1 = rindex
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0 + (40*r1)), rmask & xmask, other=0)
tmp8 = tl.load(in_ptr1 + (x0), xmask)
tmp2 = tl.where(rmask & xmask, tmp0, 0)
tmp3 = tl.sum(tmp2, 1)[:, None]
tmp4 = 401408.0
tmp5 = tmp3 / tmp4
tmp6 = 0.1
tmp7 = tmp5 * tmp6
tmp9 = 0.9
tmp10 = tmp8 * tmp9
tmp11 = tmp7 + tmp10
tl.store(in_out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp5, xmask)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp11, xmask)
def get_args():
arg_0 = rand_strided((1, 40, 1, 1), (40, 1, 40, 40), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((1, 40, 1, 1, 13), (520, 1, 520, 520, 40), device='cuda:0', dtype=torch.float32)
arg_2 = rand_strided((40,), (1,), device='cuda:0', dtype=torch.float32)
arg_3 = rand_strided((40,), (1,), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1, arg_2, arg_3,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_per_fused__native_batch_norm_legit_functional_31.run(*args, 40, 13, grid=grid(40), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_per_fused__native_batch_norm_legit_functional_31.benchmark_all_configs(*args, 40, 13, grid=grid(40))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/ko/ckomyuegfrpkii72qwrquhfy6eh53az7jeqam3ghl4ajcmcw2h3l.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => convert_element_type_21, var_mean_5
triton_red_fused__native_batch_norm_legit_functional_32 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@reduction(
size_hints=[1024, 32768],
reduction_hint=ReductionHint.INNER,
filename=__file__,
meta={'signature': {0: '*fp16', 1: '*fp32', 2: '*fp32', 3: 'i32', 4: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_32(in_ptr0, in_ptr1, out_ptr0, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
xnumel = 520
rnumel = 30878
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
xmask = xindex < xnumel
rbase = tl.arange(0, RBLOCK)[None, :]
x1 = (xindex // 40)
x0 = xindex % 40
_tmp9 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
x3 = xindex
for roffset in range(0, rnumel, RBLOCK):
rindex = roffset + rbase
rmask = rindex < rnumel
r2 = rindex
tmp0 = r2 + (30878*x1)
tmp1 = 401408
tmp2 = tmp0 < tmp1
tmp3 = tl.load(in_ptr0 + ((3136*x0) + (125440*(((r2 + (30878*x1)) // 3136) % 128)) + ((r2 + (30878*x1)) % 3136) + tl.zeros([XBLOCK, RBLOCK], tl.int32)), rmask & tmp2 & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp4 = tmp3.to(tl.float32)
tmp5 = tl.load(in_ptr1 + (x0 + tl.zeros([XBLOCK, RBLOCK], tl.int32)), rmask & tmp2 & xmask, eviction_policy='evict_last', other=0)
tmp6 = tmp4 - tmp5
tmp7 = tmp6 * tmp6
tmp8 = tl.where(tmp2, tmp7, 0)
_tmp9 = tl.where(rmask & xmask, _tmp9 + tmp8, _tmp9)
tmp9 = tl.sum(_tmp9, 1)[:, None]
tl.store(out_ptr0 + x3, tmp9, xmask)
def get_args():
arg_0 = rand_strided((128, 40, 56, 56), (125440, 3136, 56, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((1, 40, 1, 1), (40, 1, 40, 40), device='cuda:0', dtype=torch.float32)
arg_2 = rand_strided((1, 40, 1, 1, 13), (520, 1, 520, 520, 40), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1, arg_2,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_red_fused__native_batch_norm_legit_functional_32.run(*args, 520, 30878, grid=grid(520), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_red_fused__native_batch_norm_legit_functional_32.benchmark_all_configs(*args, 520, 30878, grid=grid(520))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/5p/c5pcmnlklk6a6ee2vm3ta2iounjyiftuynwfzbts7sayeuto4jvn.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => add_27, add_29, convert_element_type_21, mul_38, mul_39, mul_40, rsqrt_5, squeeze_16, var_mean_5
triton_per_fused__native_batch_norm_legit_functional_33 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import persistent_reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@persistent_reduction(
size_hints=[64, 16],
reduction_hint=ReductionHint.INNER,
filename=__file__,
meta={'signature': {0: '*fp32', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: 'i32', 6: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4), equal_to_1=())]}
)
@triton.jit
def triton_per_fused__native_batch_norm_legit_functional_33(in_ptr0, in_ptr1, out_ptr0, out_ptr1, out_ptr2, xnumel, rnumel, XBLOCK : tl.constexpr):
xnumel = 40
rnumel = 13
RBLOCK: tl.constexpr = 16
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
xmask = xindex < xnumel
rindex = tl.arange(0, RBLOCK)[None, :]
rmask = rindex < rnumel
r1 = rindex
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0 + (40*r1)), rmask & xmask, other=0)
tmp13 = tl.load(in_ptr1 + (x0), xmask)
tmp2 = tl.where(rmask & xmask, tmp0, 0)
tmp3 = tl.sum(tmp2, 1)[:, None]
tmp4 = 401408.0
tmp5 = tmp3 / tmp4
tmp6 = 1e-05
tmp7 = tmp5 + tmp6
tmp8 = tl.math.rsqrt(tmp7)
tmp9 = 1.0000024912370735
tmp10 = tmp5 * tmp9
tmp11 = 0.1
tmp12 = tmp10 * tmp11
tmp14 = 0.9
tmp15 = tmp13 * tmp14
tmp16 = tmp12 + tmp15
tl.store(out_ptr1 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp8, xmask)
tl.store(out_ptr2 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp16, xmask)
tl.store(out_ptr0 + x0, tmp3, xmask)
def get_args():
arg_0 = rand_strided((1, 40, 1, 1, 13), (520, 1, 520, 520, 40), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((40,), (1,), device='cuda:0', dtype=torch.float32)
arg_2 = rand_strided((1, 40, 1, 1), (40, 1, 40, 40), device='cuda:0', dtype=torch.float32)
arg_3 = rand_strided((40,), (1,), device='cuda:0', dtype=torch.float32)
arg_4 = rand_strided((40,), (1,), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1, arg_2, arg_3, arg_4,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_per_fused__native_batch_norm_legit_functional_33.run(*args, 40, 13, grid=grid(40), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_per_fused__native_batch_norm_legit_functional_33.benchmark_all_configs(*args, 40, 13, grid=grid(40))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/pf/cpfbdly4ax5zztymk42pnezqf7ys2im4sb437kjrqjpf4x2jbotn.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => add_27, add_30, convert_element_type_21, convert_element_type_22, mul_35, mul_41, rsqrt_5, sub_5, var_mean_5
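# Same normalize-and-affine tail as before, but with no fused ReLU or backward mask:
# this batch norm's fp16 output is left un-activated.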
triton_poi_fused__native_batch_norm_legit_functional_34 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[16777216], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp16', 6: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6), equal_to_1=())]})
@triton.jit
def triton_poi_fused__native_batch_norm_legit_functional_34(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 16056320
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x3 = xindex
x1 = (xindex // 3136) % 40
tmp0 = tl.load(in_ptr0 + (x3), None).to(tl.float32)
tmp2 = tl.load(in_ptr1 + (x1), None)
tmp4 = tl.load(in_ptr2 + (x1), None)
tmp11 = tl.load(in_ptr3 + (x1), None)
tmp13 = tl.load(in_ptr4 + (x1), None)
tmp1 = tmp0.to(tl.float32)
tmp3 = tmp1 - tmp2
tmp5 = 401408.0
tmp6 = tmp4 / tmp5
tmp7 = 1e-05
tmp8 = tmp6 + tmp7
tmp9 = tl.math.rsqrt(tmp8)
tmp10 = tmp3 * tmp9
tmp12 = tmp10 * tmp11
tmp14 = tmp12 + tmp13
tmp15 = tmp14.to(tl.float32)
tl.store(out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp15, None)
def get_args():
arg_0 = rand_strided((128, 40, 56, 56), (125440, 3136, 56, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((1, 40, 1, 1), (40, 1, 40, 40), device='cuda:0', dtype=torch.float32)
arg_2 = rand_strided((1, 40, 1, 1), (40, 1, 40, 40), device='cuda:0', dtype=torch.float32)
arg_3 = rand_strided((40,), (1,), device='cuda:0', dtype=torch.float32)
arg_4 = rand_strided((40,), (1,), device='cuda:0', dtype=torch.float32)
arg_5 = rand_strided((128, 40, 56, 56), (125440, 3136, 56, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__native_batch_norm_legit_functional_34.run(*args, 16056320, grid=grid(16056320), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__native_batch_norm_legit_functional_34.benchmark_all_configs(*args, 16056320, grid=grid(16056320))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/jt/cjtdu5nq2b37l5fyi5c7vdk3m4aqhqeqbsw3e5ifo4u74lo5bgaq.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_23
triton_poi_fused__to_copy_35 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[2048], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_35(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 1200
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((60, 20, 1, 1), (20, 1, 1, 1), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((60, 20, 1, 1), (20, 1, 1, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_35.run(*args, 1200, grid=grid(1200), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_35.benchmark_all_configs(*args, 1200, grid=grid(1200))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/wj/cwjg3r37ehsowju7qerrsqlothtrltxyyuq5mhnulbp7zor3ocw5.py
# Original ATen: aten.cat
# aten.cat => cat_3
triton_poi_fused_cat_36 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[33554432], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_cat_36(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 24084480
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x2 = xindex
x0 = xindex % 188160
x1 = (xindex // 188160)
tmp0 = tl.load(in_ptr0 + (x2), None).to(tl.float32)
tl.store(out_ptr0 + (x0 + (376320*x1) + tl.zeros([XBLOCK], tl.int32)), tmp0, None)
def get_args():
arg_0 = rand_strided((128, 60, 56, 56), (188160, 3136, 56, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 60, 56, 56), (376320, 3136, 56, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_cat_36.run(*args, 24084480, grid=grid(24084480), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_cat_36.benchmark_all_configs(*args, 24084480, grid=grid(24084480))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/2c/c2cmgfhoarotsw2klaplncsqdxdxrbc3je4bm4idg4672cm2hbzh.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => convert_element_type_25, var_mean_6
triton_red_fused__native_batch_norm_legit_functional_37 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@reduction(
size_hints=[512, 131072],
reduction_hint=ReductionHint.INNER,
filename=__file__,
meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32', 3: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_37(in_ptr0, out_ptr0, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
xnumel = 480
rnumel = 100352
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
xmask = xindex < xnumel
rbase = tl.arange(0, RBLOCK)[None, :]
x0 = xindex % 120
x1 = (xindex // 120)
_tmp2 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
x3 = xindex
for roffset in range(0, rnumel, RBLOCK):
rindex = roffset + rbase
rmask = rindex < rnumel
r2 = rindex
tmp0 = tl.load(in_ptr0 + ((3136*x0) + (376320*(r2 // 3136)) + (12042240*x1) + (r2 % 3136)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
_tmp2 = tl.where(rmask & xmask, _tmp2 + tmp1, _tmp2)
tmp2 = tl.sum(_tmp2, 1)[:, None]
tl.store(out_ptr0 + x3, tmp2, xmask)
def get_args():
arg_0 = rand_strided((128, 120, 56, 56), (376320, 3136, 56, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((1, 120, 1, 1, 4), (480, 1, 480, 480, 120), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_red_fused__native_batch_norm_legit_functional_37.run(*args, 480, 100352, grid=grid(480), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_red_fused__native_batch_norm_legit_functional_37.benchmark_all_configs(*args, 480, 100352, grid=grid(480))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/xq/cxq23fscob3jbjdcsao7sicyvado5nsvykscbw7syq3dh73swjgu.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => add_33, convert_element_type_25, mul_43, mul_44, var_mean_6
triton_per_fused__native_batch_norm_legit_functional_38 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import persistent_reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@persistent_reduction(
size_hints=[128, 4],
reduction_hint=ReductionHint.INNER,
filename=__file__,
meta={'signature': {0: '*fp32', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: 'i32', 5: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3), equal_to_1=())]}
)
@triton.jit
def triton_per_fused__native_batch_norm_legit_functional_38(in_out_ptr0, in_ptr0, in_ptr1, out_ptr0, xnumel, rnumel, XBLOCK : tl.constexpr):
xnumel = 120
rnumel = 4
RBLOCK: tl.constexpr = 4
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
xmask = xindex < xnumel
rindex = tl.arange(0, RBLOCK)[None, :]
rmask = rindex < rnumel
r1 = rindex
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0 + (120*r1)), rmask & xmask, other=0)
tmp8 = tl.load(in_ptr1 + (x0), xmask)
tmp2 = tl.where(rmask & xmask, tmp0, 0)
tmp3 = tl.sum(tmp2, 1)[:, None]
tmp4 = 401408.0
tmp5 = tmp3 / tmp4
tmp6 = 0.1
tmp7 = tmp5 * tmp6
tmp9 = 0.9
tmp10 = tmp8 * tmp9
tmp11 = tmp7 + tmp10
tl.store(in_out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp5, xmask)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp11, xmask)
def get_args():
arg_0 = rand_strided((1, 120, 1, 1), (120, 1, 120, 120), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((1, 120, 1, 1, 4), (480, 1, 480, 480, 120), device='cuda:0', dtype=torch.float32)
arg_2 = rand_strided((120,), (1,), device='cuda:0', dtype=torch.float32)
arg_3 = rand_strided((120,), (1,), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1, arg_2, arg_3,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_per_fused__native_batch_norm_legit_functional_38.run(*args, 120, 4, grid=grid(120), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_per_fused__native_batch_norm_legit_functional_38.benchmark_all_configs(*args, 120, 4, grid=grid(120))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/wg/cwgir4s67j2t35f66rtqguw4skzrnzaazh3rvukzvjln6bwfquwz.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => convert_element_type_25, var_mean_6
triton_red_fused__native_batch_norm_legit_functional_39 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@reduction(
size_hints=[512, 131072],
reduction_hint=ReductionHint.INNER,
filename=__file__,
meta={'signature': {0: '*fp16', 1: '*fp32', 2: '*fp32', 3: 'i32', 4: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_39(in_ptr0, in_ptr1, out_ptr0, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
xnumel = 480
rnumel = 100352
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
xmask = xindex < xnumel
rbase = tl.arange(0, RBLOCK)[None, :]
x0 = xindex % 120
x1 = (xindex // 120)
tmp2 = tl.load(in_ptr1 + (x0), xmask)
_tmp5 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
x3 = xindex
for roffset in range(0, rnumel, RBLOCK):
rindex = roffset + rbase
rmask = rindex < rnumel
r2 = rindex
tmp0 = tl.load(in_ptr0 + ((3136*x0) + (376320*(r2 // 3136)) + (12042240*x1) + (r2 % 3136)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
tmp3 = tmp1 - tmp2
tmp4 = tmp3 * tmp3
_tmp5 = tl.where(rmask & xmask, _tmp5 + tmp4, _tmp5)
tmp5 = tl.sum(_tmp5, 1)[:, None]
tl.store(out_ptr0 + x3, tmp5, xmask)
def get_args():
arg_0 = rand_strided((128, 120, 56, 56), (376320, 3136, 56, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((1, 120, 1, 1), (120, 1, 120, 120), device='cuda:0', dtype=torch.float32)
arg_2 = rand_strided((1, 120, 1, 1, 4), (480, 1, 480, 480, 120), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1, arg_2,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_red_fused__native_batch_norm_legit_functional_39.run(*args, 480, 100352, grid=grid(480), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_red_fused__native_batch_norm_legit_functional_39.benchmark_all_configs(*args, 480, 100352, grid=grid(480))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/bd/cbdjjrt2eltvde4ay5q7ftrif7dzdrcgxsonoef3wv6a6n446fg5.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => add_32, add_34, convert_element_type_25, mul_45, mul_46, mul_47, rsqrt_6, squeeze_19, var_mean_6
triton_per_fused__native_batch_norm_legit_functional_40 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import persistent_reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@persistent_reduction(
size_hints=[128, 4],
reduction_hint=ReductionHint.INNER,
filename=__file__,
meta={'signature': {0: '*fp32', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: 'i32', 6: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4), equal_to_1=())]}
)
@triton.jit
def triton_per_fused__native_batch_norm_legit_functional_40(in_ptr0, in_ptr1, out_ptr0, out_ptr1, out_ptr2, xnumel, rnumel, XBLOCK : tl.constexpr):
xnumel = 120
rnumel = 4
RBLOCK: tl.constexpr = 4
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
xmask = xindex < xnumel
rindex = tl.arange(0, RBLOCK)[None, :]
rmask = rindex < rnumel
r1 = rindex
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0 + (120*r1)), rmask & xmask, other=0)
tmp13 = tl.load(in_ptr1 + (x0), xmask)
tmp2 = tl.where(rmask & xmask, tmp0, 0)
tmp3 = tl.sum(tmp2, 1)[:, None]
tmp4 = 401408.0  # reduction count: 128 images * 56 * 56 spatial positions
tmp5 = tmp3 / tmp4
tmp6 = 1e-05
tmp7 = tmp5 + tmp6
tmp8 = tl.math.rsqrt(tmp7)
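# 401408 / 401407: Bessel correction, so the running variance tracks the
# unbiased estimate while normalization uses the biased one.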
tmp9 = 1.0000024912370735
tmp10 = tmp5 * tmp9
tmp11 = 0.1
tmp12 = tmp10 * tmp11
tmp14 = 0.9
tmp15 = tmp13 * tmp14
tmp16 = tmp12 + tmp15
tl.store(out_ptr1 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp8, xmask)
tl.store(out_ptr2 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp16, xmask)
tl.store(out_ptr0 + x0, tmp3, xmask)
def get_args():
arg_0 = rand_strided((1, 120, 1, 1, 4), (480, 1, 480, 480, 120), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((120,), (1,), device='cuda:0', dtype=torch.float32)
arg_2 = rand_strided((1, 120, 1, 1), (120, 1, 120, 120), device='cuda:0', dtype=torch.float32)
arg_3 = rand_strided((120,), (1,), device='cuda:0', dtype=torch.float32)
arg_4 = rand_strided((120,), (1,), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1, arg_2, arg_3, arg_4,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_per_fused__native_batch_norm_legit_functional_40.run(*args, 120, 4, grid=grid(120), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_per_fused__native_batch_norm_legit_functional_40.benchmark_all_configs(*args, 120, 4, grid=grid(120))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/ff/cffbiktpj5b36houxciovjfe3g7ji5w5ibbvgjpjdr6m6dxnqiod.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.relu
# aten._native_batch_norm_legit_functional => add_32, add_35, convert_element_type_25, convert_element_type_26, mul_42, mul_48, rsqrt_6, sub_6, var_mean_6
# aten.relu => relu_4
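# Pointwise BN epilogue: normalize with the mean and summed squared deviations
# produced above, apply the affine scale/shift, cast back to fp16, and fuse the
# ReLU into the same pass.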
triton_poi_fused__native_batch_norm_legit_functional_relu_41 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[67108864], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp16', 6: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6), equal_to_1=())]})
@triton.jit
def triton_poi_fused__native_batch_norm_legit_functional_relu_41(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 48168960
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x3 = xindex
x1 = (xindex // 3136) % 120
tmp0 = tl.load(in_ptr0 + (x3), None).to(tl.float32)
tmp2 = tl.load(in_ptr1 + (x1), None)
tmp4 = tl.load(in_ptr2 + (x1), None)
tmp11 = tl.load(in_ptr3 + (x1), None)
tmp13 = tl.load(in_ptr4 + (x1), None)
tmp1 = tmp0.to(tl.float32)
tmp3 = tmp1 - tmp2
tmp5 = 401408.0
tmp6 = tmp4 / tmp5
tmp7 = 1e-05
tmp8 = tmp6 + tmp7
tmp9 = tl.math.rsqrt(tmp8)
tmp10 = tmp3 * tmp9
tmp12 = tmp10 * tmp11
tmp14 = tmp12 + tmp13
tmp15 = tmp14.to(tl.float32)
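# Generated ReLU: the outer branch is dead (0 != 0 is always false), so this
# reduces to max(tmp15, 0).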
tmp16 = tl.where(0 != 0, 0, tl.where(0 > tmp15, 0, tmp15))
tl.store(out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp16, None)
def get_args():
arg_0 = rand_strided((128, 120, 56, 56), (376320, 3136, 56, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((1, 120, 1, 1), (120, 1, 120, 120), device='cuda:0', dtype=torch.float32)
arg_2 = rand_strided((1, 120, 1, 1), (120, 1, 120, 120), device='cuda:0', dtype=torch.float32)
arg_3 = rand_strided((120,), (1,), device='cuda:0', dtype=torch.float32)
arg_4 = rand_strided((120,), (1,), device='cuda:0', dtype=torch.float32)
arg_5 = rand_strided((128, 120, 56, 56), (376320, 3136, 56, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__native_batch_norm_legit_functional_relu_41.run(*args, 48168960, grid=grid(48168960), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__native_batch_norm_legit_functional_relu_41.benchmark_all_configs(*args, 48168960, grid=grid(48168960))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/6y/c6ynio5pu6llppeohanybeuevixt5rgpdgxw24ylgggmbgzjacrt.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_27
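# fp32 -> fp16 cast of the (120, 1, 3, 3) depthwise-conv weight so the conv can
# run in half precision. The later _to_copy kernels in this section repeat the
# same pattern for other weight shapes.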
triton_poi_fused__to_copy_42 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[2048], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_42(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 1080
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((120, 1, 3, 3), (9, 9, 3, 1), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((120, 1, 3, 3), (9, 9, 3, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_42.run(*args, 1080, grid=grid(1080), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_42.benchmark_all_configs(*args, 1080, grid=grid(1080))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/p3/cp3ysrijswutvs72wkv4ypuaradvydgclkx6swvsdiolmqtydjck.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.relu, aten.threshold_backward
# aten._native_batch_norm_legit_functional => add_37, add_40, convert_element_type_28, convert_element_type_29, mul_49, mul_55, rsqrt_7, sub_7, var_mean_7
# aten.relu => relu_5
# aten.threshold_backward => le_1
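# Same BN + ReLU epilogue as kernel 41, additionally writing the (output <= 0)
# mask (out_ptr1) that threshold_backward consumes in the backward graph.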
triton_poi_fused__native_batch_norm_legit_functional_relu_threshold_backward_43 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[67108864], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp16', 6: '*i1', 7: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7), equal_to_1=())]})
@triton.jit
def triton_poi_fused__native_batch_norm_legit_functional_relu_threshold_backward_43(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, out_ptr0, out_ptr1, xnumel, XBLOCK : tl.constexpr):
xnumel = 48168960
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x3 = xindex
x1 = (xindex // 3136) % 120
tmp0 = tl.load(in_ptr0 + (x3), None).to(tl.float32)
tmp2 = tl.load(in_ptr1 + (x1), None)
tmp4 = tl.load(in_ptr2 + (x1), None)
tmp11 = tl.load(in_ptr3 + (x1), None)
tmp13 = tl.load(in_ptr4 + (x1), None)
tmp1 = tmp0.to(tl.float32)
tmp3 = tmp1 - tmp2
tmp5 = 401408.0
tmp6 = tmp4 / tmp5
tmp7 = 1e-05
tmp8 = tmp6 + tmp7
tmp9 = tl.math.rsqrt(tmp8)
tmp10 = tmp3 * tmp9
tmp12 = tmp10 * tmp11
tmp14 = tmp12 + tmp13
tmp15 = tmp14.to(tl.float32)
tmp16 = tl.where(0 != 0, 0, tl.where(0 > tmp15, 0, tmp15))
tmp17 = 0.0
tmp18 = tmp16 <= tmp17
tl.store(out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp16, None)
tl.store(out_ptr1 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp18, None)
def get_args():
arg_0 = rand_strided((128, 120, 56, 56), (376320, 3136, 56, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((1, 120, 1, 1), (120, 1, 120, 120), device='cuda:0', dtype=torch.float32)
arg_2 = rand_strided((1, 120, 1, 1), (120, 1, 120, 120), device='cuda:0', dtype=torch.float32)
arg_3 = rand_strided((120,), (1,), device='cuda:0', dtype=torch.float32)
arg_4 = rand_strided((120,), (1,), device='cuda:0', dtype=torch.float32)
arg_5 = rand_strided((128, 120, 56, 56), (376320, 3136, 56, 1), device='cuda:0', dtype=torch.float16)
arg_6 = rand_strided((128, 120, 56, 56), (376320, 3136, 56, 1), device='cuda:0', dtype=torch.bool)
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__native_batch_norm_legit_functional_relu_threshold_backward_43.run(*args, 48168960, grid=grid(48168960), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__native_batch_norm_legit_functional_relu_threshold_backward_43.benchmark_all_configs(*args, 48168960, grid=grid(48168960))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/pe/cpea2gtzlc2kbhqh3fj4j3lqcnuxq36vpr46nhhxpvtecppidlkn.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_30
triton_poi_fused__to_copy_44 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[2048], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_44(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 1200
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((20, 60, 1, 1), (60, 1, 1, 1), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((20, 60, 1, 1), (60, 1, 1, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_44.run(*args, 1200, grid=grid(1200), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_44.benchmark_all_configs(*args, 1200, grid=grid(1200))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/nm/cnmhyqb22hbicwkw553bytawf6yat3hnc5u573hgyyuw5vi2ohc2.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.add
# aten._native_batch_norm_legit_functional => add_42, add_45, convert_element_type_32, convert_element_type_33, mul_56, mul_62, rsqrt_8, sub_8, var_mean_8
# aten.add => add_46
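# BN epilogue without activation, fused with the skip-connection add (in_ptr5)
# that closes this block.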
triton_poi_fused__native_batch_norm_legit_functional_add_45 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[16777216], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp16', 6: '*fp16', 7: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7), equal_to_1=())]})
@triton.jit
def triton_poi_fused__native_batch_norm_legit_functional_add_45(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 16056320
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x3 = xindex
x1 = (xindex // 3136) % 40
tmp0 = tl.load(in_ptr0 + (x3), None).to(tl.float32)
tmp2 = tl.load(in_ptr1 + (x1), None)
tmp4 = tl.load(in_ptr2 + (x1), None)
tmp11 = tl.load(in_ptr3 + (x1), None)
tmp13 = tl.load(in_ptr4 + (x1), None)
tmp16 = tl.load(in_ptr5 + (x3), None).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
tmp3 = tmp1 - tmp2
tmp5 = 401408.0
tmp6 = tmp4 / tmp5
tmp7 = 1e-05
tmp8 = tmp6 + tmp7
tmp9 = tl.math.rsqrt(tmp8)
tmp10 = tmp3 * tmp9
tmp12 = tmp10 * tmp11
tmp14 = tmp12 + tmp13
tmp15 = tmp14.to(tl.float32)
tmp17 = tmp15 + tmp16
tl.store(out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp17, None)
def get_args():
arg_0 = rand_strided((128, 40, 56, 56), (125440, 3136, 56, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((1, 40, 1, 1), (40, 1, 40, 40), device='cuda:0', dtype=torch.float32)
arg_2 = rand_strided((1, 40, 1, 1), (40, 1, 40, 40), device='cuda:0', dtype=torch.float32)
arg_3 = rand_strided((40,), (1,), device='cuda:0', dtype=torch.float32)
arg_4 = rand_strided((40,), (1,), device='cuda:0', dtype=torch.float32)
arg_5 = rand_strided((128, 40, 56, 56), (125440, 3136, 56, 1), device='cuda:0', dtype=torch.float16)
arg_6 = rand_strided((128, 40, 56, 56), (125440, 3136, 56, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__native_batch_norm_legit_functional_add_45.run(*args, 16056320, grid=grid(16056320), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__native_batch_norm_legit_functional_add_45.benchmark_all_configs(*args, 16056320, grid=grid(16056320))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/d6/cd6uk6iyi5u7h7amewzycxkyxriny4cknptmukvtlpxi4jwlerzf.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_34
triton_poi_fused__to_copy_46 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[16384], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_46(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 9600
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((240, 40, 1, 1), (40, 1, 1, 1), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((240, 40, 1, 1), (40, 1, 1, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_46.run(*args, 9600, grid=grid(9600), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_46.benchmark_all_configs(*args, 9600, grid=grid(9600))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/p3/cp3dwfqdvlpfdvfvxrfm2lik34mpytuijuqsswm7efzikdczro7d.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => add_48, add_49, add_50, convert_element_type_35, mul_64, mul_65, mul_66, mul_67, mul_68, rsqrt_9, squeeze_28, var_mean_9
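# Single-kernel BN statistics over 128*3136 = 401408 elements per channel: the
# first reduction loop produces the mean and running-mean update, the second the
# variance, rsqrt(var + eps), and running-variance update. A rough eager-mode
# sketch of the same computation (assumed equivalent, momentum 0.1, N = 401408):
#   mean = x.float().mean(dim=(0, 2, 3))
#   var = x.float().var(dim=(0, 2, 3), unbiased=False)
#   rstd = (var + 1e-05).rsqrt()
#   running_mean = 0.1 * mean + 0.9 * running_mean
#   running_var = 0.1 * var * (N / (N - 1)) + 0.9 * running_var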
triton_red_fused__native_batch_norm_legit_functional_47 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@reduction(
size_hints=[256, 524288],
reduction_hint=ReductionHint.INNER,
filename=__file__,
meta={'signature': {0: '*fp32', 1: '*fp16', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: 'i32', 9: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_47(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, out_ptr0, out_ptr1, out_ptr2, out_ptr3, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
xnumel = 240
rnumel = 401408
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
xmask = xindex < xnumel
rbase = tl.arange(0, RBLOCK)[None, :]
x0 = xindex
_tmp2 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
for roffset in range(0, rnumel, RBLOCK):
rindex = roffset + rbase
rmask = rindex < rnumel
r1 = rindex % 3136
r2 = (rindex // 3136)
tmp0 = tl.load(in_ptr0 + (r1 + (3136*x0) + (752640*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
_tmp2 = tl.where(rmask & xmask, _tmp2 + tmp1, _tmp2)
tmp2 = tl.sum(_tmp2, 1)[:, None]
tmp7 = tl.load(in_ptr1 + (x0), xmask)
tmp3 = 401408.0
tmp4 = tmp2 / tmp3
tmp5 = 0.1
tmp6 = tmp4 * tmp5
tmp8 = 0.9
tmp9 = tmp7 * tmp8
tmp10 = tmp6 + tmp9
tl.store(in_out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp4, xmask)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp10, xmask)
_tmp15 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
for roffset in range(0, rnumel, RBLOCK):
rindex = roffset + rbase
rmask = rindex < rnumel
r1 = rindex % 3136
r2 = (rindex // 3136)
tmp11 = tl.load(in_ptr0 + (r1 + (3136*x0) + (752640*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp12 = tmp11.to(tl.float32)
tmp13 = tmp12 - tmp4
tmp14 = tmp13 * tmp13
_tmp15 = tl.where(rmask & xmask, _tmp15 + tmp14, _tmp15)
tmp15 = tl.sum(_tmp15, 1)[:, None]
tl.store(out_ptr1 + x0, tmp15, xmask)
tmp25 = tl.load(in_ptr2 + (x0), xmask)
tmp16 = 401408.0
tmp17 = tmp15 / tmp16
tmp18 = 1e-05
tmp19 = tmp17 + tmp18
tmp20 = tl.math.rsqrt(tmp19)
tmp21 = 1.0000024912370735
tmp22 = tmp17 * tmp21
tmp23 = 0.1
tmp24 = tmp22 * tmp23
tmp26 = 0.9
tmp27 = tmp25 * tmp26
tmp28 = tmp24 + tmp27
tl.store(out_ptr2 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp20, xmask)
tl.store(out_ptr3 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp28, xmask)
def get_args():
arg_0 = rand_strided((1, 240, 1, 1), (240, 1, 240, 240), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((128, 240, 56, 56), (752640, 3136, 56, 1), device='cuda:0', dtype=torch.float16)
arg_2 = rand_strided((240,), (1,), device='cuda:0', dtype=torch.float32)
arg_3 = rand_strided((240,), (1,), device='cuda:0', dtype=torch.float32)
arg_4 = rand_strided((240,), (1,), device='cuda:0', dtype=torch.float32)
arg_5 = rand_strided((1, 240, 1, 1), (240, 1, 240, 240), device='cuda:0', dtype=torch.float32)
arg_6 = rand_strided((240,), (1,), device='cuda:0', dtype=torch.float32)
arg_7 = rand_strided((240,), (1,), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_red_fused__native_batch_norm_legit_functional_47.run(*args, 240, 401408, grid=grid(240), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_red_fused__native_batch_norm_legit_functional_47.benchmark_all_configs(*args, 240, 401408, grid=grid(240))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/bn/cbnwrcxmnryz6qd2vzr5qnxb2q3vepfvhupqz2awsu4zkworteie.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.add, aten.clone, aten.fill, aten.mul, aten.sigmoid, aten.sub
# aten._native_batch_norm_legit_functional => add_48, add_51, convert_element_type_35, convert_element_type_36, mul_63, mul_69, rsqrt_9, sub_9, var_mean_9
# aten.add => add_379
# aten.clone => clone
# aten.fill => full_like_47
# aten.mul => mul_1107, mul_1108
# aten.sigmoid => sigmoid_111
# aten.sub => sub_313
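# BN forward plus backward prep: out_ptr0 receives the fp16 BN output, and
# out_ptr1 caches sigmoid(y) * (1 + y * (1 - sigmoid(y))), the derivative of
# SiLU(y) = y * sigmoid(y), for reuse in the backward graph (hence the
# mul/sub/fill ops listed above).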
triton_poi_fused__native_batch_norm_legit_functional_add_clone_fill_mul_sigmoid_sub_48 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[134217728], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp16', 6: '*fp16', 7: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7), equal_to_1=())]})
@triton.jit
def triton_poi_fused__native_batch_norm_legit_functional_add_clone_fill_mul_sigmoid_sub_48(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, out_ptr0, out_ptr1, xnumel, XBLOCK : tl.constexpr):
xnumel = 96337920
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x3 = xindex
x1 = (xindex // 3136) % 240
tmp0 = tl.load(in_ptr0 + (x3), None).to(tl.float32)
tmp2 = tl.load(in_ptr1 + (x1), None)
tmp4 = tl.load(in_ptr2 + (x1), None)
tmp11 = tl.load(in_ptr3 + (x1), None)
tmp13 = tl.load(in_ptr4 + (x1), None)
tmp1 = tmp0.to(tl.float32)
tmp3 = tmp1 - tmp2
tmp5 = 401408.0
tmp6 = tmp4 / tmp5
tmp7 = 1e-05
tmp8 = tmp6 + tmp7
tmp9 = tl.math.rsqrt(tmp8)
tmp10 = tmp3 * tmp9
tmp12 = tmp10 * tmp11
tmp14 = tmp12 + tmp13
tmp15 = tmp14.to(tl.float32)
tmp16 = tl.sigmoid(tmp15)
tmp17 = 1.0
tmp18 = tmp17 - tmp16
tmp19 = tmp15 * tmp18
tmp20 = tmp19 + tmp17
tmp21 = tmp16 * tmp20
tl.store(out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp15, None)
tl.store(out_ptr1 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp21, None)
def get_args():
arg_0 = rand_strided((128, 240, 56, 56), (752640, 3136, 56, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((1, 240, 1, 1), (240, 1, 240, 240), device='cuda:0', dtype=torch.float32)
arg_2 = rand_strided((1, 240, 1, 1), (240, 1, 240, 240), device='cuda:0', dtype=torch.float32)
arg_3 = rand_strided((240,), (1,), device='cuda:0', dtype=torch.float32)
arg_4 = rand_strided((240,), (1,), device='cuda:0', dtype=torch.float32)
arg_5 = rand_strided((128, 240, 56, 56), (752640, 3136, 56, 1), device='cuda:0', dtype=torch.float16)
arg_6 = rand_strided((128, 240, 56, 56), (752640, 3136, 56, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__native_batch_norm_legit_functional_add_clone_fill_mul_sigmoid_sub_48.run(*args, 96337920, grid=grid(96337920), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__native_batch_norm_legit_functional_add_clone_fill_mul_sigmoid_sub_48.benchmark_all_configs(*args, 96337920, grid=grid(96337920))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/tr/ctrlm7wxwgyqu6igydzh2qy3dgfyvkbxyf6ed5dmhs5ac43nkywg.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_39
triton_poi_fused__to_copy_49 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[1024], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_49(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 540
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((60, 1, 3, 3), (9, 9, 3, 1), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((60, 1, 3, 3), (9, 9, 3, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_49.run(*args, 540, grid=grid(540), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_49.benchmark_all_configs(*args, 540, grid=grid(540))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/62/c62vp5bgz7fqdu2x3oxuvrnwwvzztezlji4zzzxqegy77wuqckqo.py
# Original ATen: aten.split_with_sizes
# aten.split_with_sizes => getitem_52
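# Despite the split_with_sizes name, this fuses the channel split with SiLU:
# it reads channels [0, 60) of the (128, 240, 56, 56) activation (188160 =
# 60*3136 elements per image) and writes x * sigmoid(x) for that slice.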
triton_poi_fused_split_with_sizes_50 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[33554432], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_split_with_sizes_50(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 24084480
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex % 188160
x1 = (xindex // 188160)
x2 = xindex
tmp0 = tl.load(in_ptr0 + (x0 + (752640*x1)), None).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
tmp2 = tl.sigmoid(tmp1)
tmp3 = tmp1 * tmp2
tmp4 = tmp3.to(tl.float32)
tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp4, None)
def get_args():
arg_0 = rand_strided((128, 240, 56, 56), (752640, 3136, 56, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 60, 56, 56), (188160, 3136, 56, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_split_with_sizes_50.run(*args, 24084480, grid=grid(24084480), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_split_with_sizes_50.benchmark_all_configs(*args, 24084480, grid=grid(24084480))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/5k/c5k2puyq6twbirobmys3yfadgomf23ufkkjsshzn37upfh5g7erl.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_40
triton_poi_fused__to_copy_51 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[2048], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_51(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 1500
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((60, 1, 5, 5), (25, 25, 5, 1), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((60, 1, 5, 5), (25, 25, 5, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_51.run(*args, 1500, grid=grid(1500), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_51.benchmark_all_configs(*args, 1500, grid=grid(1500))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/qq/cqqmootvvozbchj4wyxl2cmg4iebq6hs2v2baytcez4tdix7pywu.py
# Original ATen: aten.split_with_sizes
# aten.split_with_sizes => getitem_57
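# Same split + SiLU pattern as kernel 50, reading channels [60, 120) (offset 188160).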
triton_poi_fused_split_with_sizes_52 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[33554432], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_split_with_sizes_52(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 24084480
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex % 188160
x1 = (xindex // 188160)
x2 = xindex
tmp0 = tl.load(in_ptr0 + (188160 + x0 + (752640*x1)), None).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
tmp2 = tl.sigmoid(tmp1)
tmp3 = tmp1 * tmp2
tmp4 = tmp3.to(tl.float32)
tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp4, None)
def get_args():
arg_0 = rand_strided((128, 240, 56, 56), (752640, 3136, 56, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 60, 56, 56), (188160, 3136, 56, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_split_with_sizes_52.run(*args, 24084480, grid=grid(24084480), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_split_with_sizes_52.benchmark_all_configs(*args, 24084480, grid=grid(24084480))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/hn/chn7fi5d6pb7lt74jnnjikqswlxeu3nbv3mpu3aidnxkwdaw4fmq.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_41
triton_poi_fused__to_copy_53 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4096], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_53(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 2940
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((60, 1, 7, 7), (49, 49, 7, 1), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((60, 1, 7, 7), (49, 49, 7, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_53.run(*args, 2940, grid=grid(2940), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_53.benchmark_all_configs(*args, 2940, grid=grid(2940))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/7d/c7d6lruapdg7jujcavuay77hawcbzn2rk3uqz6opu54tj6rxd3dc.py
# Original ATen: aten.split_with_sizes
# aten.split_with_sizes => getitem_62
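# Same split + SiLU pattern, reading channels [120, 180) (offset 376320).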
triton_poi_fused_split_with_sizes_54 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[33554432], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_split_with_sizes_54(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 24084480
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex % 188160
x1 = (xindex // 188160)
x2 = xindex
tmp0 = tl.load(in_ptr0 + (376320 + x0 + (752640*x1)), None).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
tmp2 = tl.sigmoid(tmp1)
tmp3 = tmp1 * tmp2
tmp4 = tmp3.to(tl.float32)
tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp4, None)
def get_args():
arg_0 = rand_strided((128, 240, 56, 56), (752640, 3136, 56, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 60, 56, 56), (188160, 3136, 56, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_split_with_sizes_54.run(*args, 24084480, grid=grid(24084480), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_split_with_sizes_54.benchmark_all_configs(*args, 24084480, grid=grid(24084480))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/ih/cihdtphelycdnabls3bvxcjqdwyigw5444zlobgvmbwdwzxq7zdy.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_42
triton_poi_fused__to_copy_55 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[8192], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_55(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 4860
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((60, 1, 9, 9), (81, 81, 9, 1), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((60, 1, 9, 9), (81, 81, 9, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_55.run(*args, 4860, grid=grid(4860), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_55.benchmark_all_configs(*args, 4860, grid=grid(4860))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/oq/coqx7tfdhgzsd3kzw3ueczpktm7gxo5z67ciilgh24ffay7herao.py
# Original ATen: aten.split_with_sizes
# aten.split_with_sizes => getitem_67
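# Same split + SiLU pattern, reading channels [180, 240) (offset 564480).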
triton_poi_fused_split_with_sizes_56 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[33554432], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_split_with_sizes_56(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 24084480
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex % 188160
x1 = (xindex // 188160)
x2 = xindex
tmp0 = tl.load(in_ptr0 + (564480 + x0 + (752640*x1)), None).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
tmp2 = tl.sigmoid(tmp1)
tmp3 = tmp1 * tmp2
tmp4 = tmp3.to(tl.float32)
tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp4, None)
def get_args():
arg_0 = rand_strided((128, 240, 56, 56), (752640, 3136, 56, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 60, 56, 56), (188160, 3136, 56, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_split_with_sizes_56.run(*args, 24084480, grid=grid(24084480), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_split_with_sizes_56.benchmark_all_configs(*args, 24084480, grid=grid(24084480))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/n6/cn6lwm2ninzuud2fuu3omrklbeewaxqtkv3v6gdaxmikjvyoi525.py
# Original ATen: aten.cat
# aten.cat => cat_5
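# One branch of the channel concat: copies a contiguous (128, 60, 28, 28) tensor
# into the first 60-channel slice of the (128, 240, 28, 28) output buffer;
# presumably the other concat inputs are written by matching kernels.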
triton_poi_fused_cat_57 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[8388608], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_cat_57(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 6021120
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x2 = xindex
x0 = xindex % 47040
x1 = (xindex // 47040)
tmp0 = tl.load(in_ptr0 + (x2), None).to(tl.float32)
tl.store(out_ptr0 + (x0 + (188160*x1) + tl.zeros([XBLOCK], tl.int32)), tmp0, None)
def get_args():
arg_0 = rand_strided((128, 60, 28, 28), (47040, 784, 28, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 60, 28, 28), (188160, 784, 28, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_cat_57.run(*args, 6021120, grid=grid(6021120), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_cat_57.benchmark_all_configs(*args, 6021120, grid=grid(6021120))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/tm/ctmfxwnpwpks3pe3wok54jftmfykafzupq5buoissf32gfcrrq6d.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => add_53, add_54, add_55, convert_element_type_43, mul_72, mul_73, mul_74, mul_75, mul_76, rsqrt_10, squeeze_31, var_mean_10
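# Same single-kernel BN statistics as kernel 47, here at 28x28 resolution
# (100352 = 128*784 elements per channel).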
triton_red_fused__native_batch_norm_legit_functional_58 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@reduction(
size_hints=[256, 131072],
reduction_hint=ReductionHint.INNER,
filename=__file__,
meta={'signature': {0: '*fp32', 1: '*fp16', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: 'i32', 9: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_58(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, out_ptr0, out_ptr1, out_ptr2, out_ptr3, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
xnumel = 240
rnumel = 100352
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
xmask = xindex < xnumel
rbase = tl.arange(0, RBLOCK)[None, :]
x0 = xindex
_tmp2 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
for roffset in range(0, rnumel, RBLOCK):
rindex = roffset + rbase
rmask = rindex < rnumel
r1 = rindex % 784
r2 = (rindex // 784)
tmp0 = tl.load(in_ptr0 + (r1 + (784*x0) + (188160*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
_tmp2 = tl.where(rmask & xmask, _tmp2 + tmp1, _tmp2)
tmp2 = tl.sum(_tmp2, 1)[:, None]
tmp7 = tl.load(in_ptr1 + (x0), xmask)
tmp3 = 100352.0
tmp4 = tmp2 / tmp3
tmp5 = 0.1
tmp6 = tmp4 * tmp5
tmp8 = 0.9
tmp9 = tmp7 * tmp8
tmp10 = tmp6 + tmp9
tl.store(in_out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp4, xmask)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp10, xmask)
_tmp15 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
for roffset in range(0, rnumel, RBLOCK):
rindex = roffset + rbase
rmask = rindex < rnumel
r1 = rindex % 784
r2 = (rindex // 784)
tmp11 = tl.load(in_ptr0 + (r1 + (784*x0) + (188160*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp12 = tmp11.to(tl.float32)
tmp13 = tmp12 - tmp4
tmp14 = tmp13 * tmp13
_tmp15 = tl.where(rmask & xmask, _tmp15 + tmp14, _tmp15)
tmp15 = tl.sum(_tmp15, 1)[:, None]
tl.store(out_ptr1 + x0, tmp15, xmask)
tmp25 = tl.load(in_ptr2 + (x0), xmask)
tmp16 = 100352.0
tmp17 = tmp15 / tmp16
tmp18 = 1e-05
tmp19 = tmp17 + tmp18
tmp20 = tl.math.rsqrt(tmp19)
tmp21 = 1.00000996502277  # 100352 / 100351, unbiased-variance correction
tmp22 = tmp17 * tmp21
tmp23 = 0.1
tmp24 = tmp22 * tmp23
tmp26 = 0.9
tmp27 = tmp25 * tmp26
tmp28 = tmp24 + tmp27
tl.store(out_ptr2 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp20, xmask)
tl.store(out_ptr3 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp28, xmask)
def get_args():
arg_0 = rand_strided((1, 240, 1, 1), (240, 1, 240, 240), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((128, 240, 28, 28), (188160, 784, 28, 1), device='cuda:0', dtype=torch.float16)
arg_2 = rand_strided((240,), (1,), device='cuda:0', dtype=torch.float32)
arg_3 = rand_strided((240,), (1,), device='cuda:0', dtype=torch.float32)
arg_4 = rand_strided((240,), (1,), device='cuda:0', dtype=torch.float32)
arg_5 = rand_strided((1, 240, 1, 1), (240, 1, 240, 240), device='cuda:0', dtype=torch.float32)
arg_6 = rand_strided((240,), (1,), device='cuda:0', dtype=torch.float32)
arg_7 = rand_strided((240,), (1,), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_red_fused__native_batch_norm_legit_functional_58.run(*args, 240, 100352, grid=grid(240), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_red_fused__native_batch_norm_legit_functional_58.benchmark_all_configs(*args, 240, 100352, grid=grid(240))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/tf/ctf52h6qjyfnbwd7tzysnt6o3dvwaedcis63siipfnbkcfkukqa7.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.mean, aten.silu
# aten._native_batch_norm_legit_functional => add_53, add_56, convert_element_type_43, convert_element_type_44, mul_71, mul_77, rsqrt_10, sub_10, var_mean_10
# aten.mean => mean
# aten.silu => convert_element_type_45, convert_element_type_46, mul_78, sigmoid_1
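# Persistent reduction fusing three steps per (image, channel) pair: the BN
# epilogue (fp16 result stored to out_ptr0), SiLU, and a mean over the 784
# spatial positions, yielding a (128, 240, 1, 1) pooled tensor (presumably the
# squeeze input of a squeeze-and-excitation block). Roughly, assuming eager
# equivalence:
#   se_in = F.silu(bn_out).mean(dim=(2, 3), keepdim=True)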
triton_per_fused__native_batch_norm_legit_functional_mean_silu_59 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import persistent_reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@persistent_reduction(
size_hints=[32768, 1024],
reduction_hint=ReductionHint.INNER,
filename=__file__,
meta={'signature': {0: '*fp16', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp16', 6: '*fp16', 7: 'i32', 8: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8), equal_to_1=())]}
)
@triton.jit
def triton_per_fused__native_batch_norm_legit_functional_mean_silu_59(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, out_ptr0, out_ptr2, xnumel, rnumel, XBLOCK : tl.constexpr):
xnumel = 30720
rnumel = 784
RBLOCK: tl.constexpr = 1024
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
xmask = xindex < xnumel
rindex = tl.arange(0, RBLOCK)[None, :]
rmask = rindex < rnumel
r2 = rindex
x3 = xindex
x0 = xindex % 240
tmp0 = tl.load(in_ptr0 + (r2 + (784*x3)), rmask, other=0).to(tl.float32)
tmp2 = tl.load(in_ptr1 + (x0), None)
tmp4 = tl.load(in_ptr2 + (x0), None)
tmp11 = tl.load(in_ptr3 + (x0), None)
tmp13 = tl.load(in_ptr4 + (x0), None)
tmp1 = tmp0.to(tl.float32)
tmp3 = tmp1 - tmp2
tmp5 = 100352.0
tmp6 = tmp4 / tmp5
tmp7 = 1e-05
tmp8 = tmp6 + tmp7
tmp9 = tl.math.rsqrt(tmp8)
tmp10 = tmp3 * tmp9
tmp12 = tmp10 * tmp11
tmp14 = tmp12 + tmp13
tmp15 = tmp14.to(tl.float32)
tmp16 = tmp15.to(tl.float32)
tmp17 = tl.sigmoid(tmp16)
tmp18 = tmp16 * tmp17
tmp19 = tmp18.to(tl.float32)
tmp20 = tmp19.to(tl.float32)
tmp22 = tl.where(rmask, tmp20, 0)
tmp23 = tl.sum(tmp22, 1)[:, None]
tmp24 = 784.0
tmp25 = tmp23 / tmp24
tmp26 = tmp25.to(tl.float32)
tl.store(out_ptr0 + (r2 + (784*x3) + tl.zeros([XBLOCK, RBLOCK], tl.int32)), tmp15, rmask)
tl.store(out_ptr2 + (x3 + tl.zeros([XBLOCK, 1], tl.int32)), tmp26, None)
def get_args():
arg_0 = rand_strided((128, 240, 28, 28), (188160, 784, 28, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((1, 240, 1, 1), (240, 1, 240, 240), device='cuda:0', dtype=torch.float32)
arg_2 = rand_strided((1, 240, 1, 1), (240, 1, 240, 240), device='cuda:0', dtype=torch.float32)
arg_3 = rand_strided((240,), (1,), device='cuda:0', dtype=torch.float32)
arg_4 = rand_strided((240,), (1,), device='cuda:0', dtype=torch.float32)
arg_5 = rand_strided((128, 240, 28, 28), (188160, 784, 28, 1), device='cuda:0', dtype=torch.float16)
arg_6 = rand_strided((128, 240, 1, 1), (240, 1, 1, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_per_fused__native_batch_norm_legit_functional_mean_silu_59.run(*args, 30720, 784, grid=grid(30720), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_per_fused__native_batch_norm_legit_functional_mean_silu_59.benchmark_all_configs(*args, 30720, 784, grid=grid(30720))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
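# The kernel above fuses batch-norm normalization, SiLU, and the per-(N, C)
# spatial mean that feeds the squeeze-and-excite block; the paired
# `.to(tl.float32)` casts mirror the convert_element_type fp16 <-> fp32 nodes
# listed in the fused-op comment. A minimal eager-mode sketch (helper name
# and signature are ours, not part of the generated module), equivalent up
# to fp16 rounding:
def _ref_bn_silu_mean_59(x, mean, var_sum, weight, bias, eps=1e-05):
    # x: (N, C, H, W) fp16; mean / var_sum broadcast as (1, C, 1, 1) fp32;
    # weight / bias: (C,) fp32.
    n, c, h, w = x.shape
    inv_std = torch.rsqrt(var_sum / (n * h * w) + eps)   # rsqrt(var + eps)
    y = ((x.float() - mean) * inv_std * weight.view(1, c, 1, 1)
         + bias.view(1, c, 1, 1)).half()                 # normalized output
    yf = y.float()
    pooled = (yf * torch.sigmoid(yf)).mean(dim=(2, 3), keepdim=True)
    return y, pooled.half()                              # (N, C, 1, 1) SE input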
# kernel path: /tmp/torchinductor_shunting/l4/cl43m5yr7mr6rofdxkde5hoinowoes7zmspib6uw4wfhtoztyd2j.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_48
triton_poi_fused__to_copy_60 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[8192], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_60(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 4800
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((20, 240, 1, 1), (240, 1, 1, 1), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((20, 240, 1, 1), (240, 1, 1, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_60.run(*args, 4800, grid=grid(4800), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_60.benchmark_all_configs(*args, 4800, grid=grid(4800))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
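# Kernels 60, 61, 63, 64, 67, 73, 77 and 79 below all share this shape: an
# elementwise fp32 -> fp16 cast of a convolution weight or bias
# (aten._to_copy), roughly `dst.copy_(src.to(torch.float16))`; only the
# element counts, size_hints and strides differ.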
# kernel path: /tmp/torchinductor_shunting/n3/cn3yrrvjz2rkdvv5cbgdyj5mk5wgjpljfxyga75evi5tqahwagdd.py
# Original ATen: aten._to_copy, aten.convolution
# aten._to_copy => convert_element_type_47
# aten.convolution => convolution_20
triton_poi_fused__to_copy_convolution_61 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[32], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_convolution_61(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 20
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((20,), (1,), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((20,), (1,), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_convolution_61.run(*args, 20, grid=grid(20), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_convolution_61.benchmark_all_configs(*args, 20, grid=grid(20))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/nz/cnz5cjjblhni4cibgggtjl3kn4ksihmqz6znyn5belgi4q43cncw.py
# Original ATen: aten._to_copy, aten.convolution, aten.silu
# aten._to_copy => convert_element_type_47
# aten.convolution => convolution_20
# aten.silu => convert_element_type_49, convert_element_type_50, mul_79, sigmoid_2
triton_poi_fused__to_copy_convolution_silu_62 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4096], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_convolution_silu_62(in_out_ptr0, in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 2560
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x2 = xindex
x0 = xindex % 20
tmp0 = tl.load(in_out_ptr0 + (x2), xmask).to(tl.float32)
tmp1 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
tmp2 = tmp0 + tmp1
tmp3 = tmp2.to(tl.float32)
tmp4 = tl.sigmoid(tmp3)
tmp5 = tmp3 * tmp4
tmp6 = tmp5.to(tl.float32)
tl.store(in_out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp2, xmask)
tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp6, xmask)
def get_args():
arg_0 = rand_strided((128, 20, 1, 1), (20, 1, 1, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((20,), (1,), device='cuda:0', dtype=torch.float16)
arg_2 = rand_strided((128, 20, 1, 1), (20, 1, 1, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1, arg_2,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_convolution_silu_62.run(*args, 2560, grid=grid(2560), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_convolution_silu_62.benchmark_all_configs(*args, 2560, grid=grid(2560))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
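# In-place bias add plus SiLU for the squeeze-and-excite reduce convolution;
# an illustrative eager equivalent (names are ours), with the sigmoid
# evaluated in fp32 as in the kernel:
def _ref_bias_silu_62(conv_out, bias):
    # conv_out: (N, 20, 1, 1) fp16, mutated like in_out_ptr0; bias: (20,) fp16.
    conv_out += bias.view(1, -1, 1, 1)
    z = conv_out.float()
    return (z * torch.sigmoid(z)).half()     # silu(x) = x * sigmoid(x)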
# kernel path: /tmp/torchinductor_shunting/bl/cblr4zz2jdn5lx7fyq6hto375tcressl73jxrxvozhqbubkoilua.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_52
triton_poi_fused__to_copy_63 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[8192], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_63(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 4800
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((240, 20, 1, 1), (20, 1, 1, 1), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((240, 20, 1, 1), (20, 1, 1, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_63.run(*args, 4800, grid=grid(4800), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_63.benchmark_all_configs(*args, 4800, grid=grid(4800))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/gq/cgqatezdistnqv4yyabd7t76wkyduditccoaqjd3ef4npoxk6vrv.py
# Original ATen: aten._to_copy, aten.convolution
# aten._to_copy => convert_element_type_51
# aten.convolution => convolution_21
triton_poi_fused__to_copy_convolution_64 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[256], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_convolution_64(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 240
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((240,), (1,), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((240,), (1,), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_convolution_64.run(*args, 240, grid=grid(240), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_convolution_64.benchmark_all_configs(*args, 240, grid=grid(240))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/qr/cqrlwq46vnra4itz3o4m3cvwrnyl7srsa7xlwywcqzni2rafclhf.py
# Original ATen: aten._to_copy, aten.convolution
# aten._to_copy => convert_element_type_51
# aten.convolution => convolution_21
triton_poi_fused__to_copy_convolution_65 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[32768], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_convolution_65(in_out_ptr0, in_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 30720
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x2 = xindex
x0 = xindex % 240
tmp0 = tl.load(in_out_ptr0 + (x2), None).to(tl.float32)
tmp1 = tl.load(in_ptr0 + (x0), None).to(tl.float32)
tmp2 = tmp0 + tmp1
tl.store(in_out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp2, None)
def get_args():
arg_0 = rand_strided((128, 240, 1, 1), (240, 1, 1, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((240,), (1,), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_convolution_65.run(*args, 30720, grid=grid(30720), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_convolution_65.benchmark_all_configs(*args, 30720, grid=grid(30720))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
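# Same in-place bias-add pattern as kernel 62, minus the SiLU: the expand
# convolution's 240-element bias is folded into its (128, 240, 1, 1) output,
# roughly `out += bias.view(1, -1, 1, 1)`.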
# kernel path: /tmp/torchinductor_shunting/ru/crubekjujrhilkb2spkvdoiiitif5a5cisikyju6vjbn7j3itjum.py
# Original ATen: aten.mul, aten.sigmoid, aten.silu
# aten.mul => mul_80
# aten.sigmoid => sigmoid_3
# aten.silu => convert_element_type_45, convert_element_type_46, mul_78, sigmoid_1
triton_poi_fused_mul_sigmoid_silu_66 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[33554432], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3), equal_to_1=())]})
@triton.jit
def triton_poi_fused_mul_sigmoid_silu_66(in_ptr0, in_ptr1, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 24084480
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x2 = xindex
x1 = (xindex // 784)
tmp0 = tl.load(in_ptr0 + (x2), None).to(tl.float32)
tmp5 = tl.load(in_ptr1 + (x1), None).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
tmp2 = tl.sigmoid(tmp1)
tmp3 = tmp1 * tmp2
tmp4 = tmp3.to(tl.float32)
tmp6 = tl.sigmoid(tmp5)
tmp7 = tmp4 * tmp6
tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp7, None)
def get_args():
arg_0 = rand_strided((128, 240, 28, 28), (188160, 784, 28, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 240, 1, 1), (240, 1, 1, 1), device='cuda:0', dtype=torch.float16)
arg_2 = rand_strided((128, 240, 28, 28), (188160, 784, 28, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1, arg_2,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_mul_sigmoid_silu_66.run(*args, 24084480, grid=grid(24084480), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_mul_sigmoid_silu_66.benchmark_all_configs(*args, 24084480, grid=grid(24084480))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
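# Squeeze-and-excite gating: SiLU of the main (128, 240, 28, 28) branch is
# scaled by the sigmoid of the per-(N, C) excitation logits, broadcast over
# the 28x28 grid. A rough fp32 sketch (helper name is ours):
def _ref_se_gate_66(x, se_logits):
    xf, sf = x.float(), se_logits.float()
    return (xf * torch.sigmoid(xf) * torch.sigmoid(sf)).half()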
# kernel path: /tmp/torchinductor_shunting/va/cvacncosevkh7a4e4mjytw5gvejhihlmvkwma42dwu2nfhypnbzr.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_53
triton_poi_fused__to_copy_67 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[16384], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_67(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 13440
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((56, 240, 1, 1), (240, 1, 1, 1), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((56, 240, 1, 1), (240, 1, 1, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_67.run(*args, 13440, grid=grid(13440), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_67.benchmark_all_configs(*args, 13440, grid=grid(13440))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/hv/chvkvxc6xaoyzdfyhbbbolrujmg4wuzt7gkh6x4thkny6u2hr4gg.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => convert_element_type_54, var_mean_11
triton_red_fused__native_batch_norm_legit_functional_68 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@reduction(
size_hints=[1024, 8192],
reduction_hint=ReductionHint.INNER,
filename=__file__,
meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32', 3: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_68(in_ptr0, out_ptr0, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
xnumel = 728
rnumel = 7720
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
xmask = xindex < xnumel
rbase = tl.arange(0, RBLOCK)[None, :]
x1 = (xindex // 56)
x0 = xindex % 56
_tmp6 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
x3 = xindex
for roffset in range(0, rnumel, RBLOCK):
rindex = roffset + rbase
rmask = rindex < rnumel
r2 = rindex
tmp0 = r2 + (7720*x1)
tmp1 = 100352
tmp2 = tmp0 < tmp1
tmp3 = tl.load(in_ptr0 + ((784*x0) + (43904*(((r2 + (7720*x1)) // 784) % 128)) + ((r2 + (7720*x1)) % 784) + tl.zeros([XBLOCK, RBLOCK], tl.int32)), rmask & tmp2 & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp4 = tmp3.to(tl.float32)
tmp5 = tl.where(tmp2, tmp4, 0)
_tmp6 = tl.where(rmask & xmask, _tmp6 + tmp5, _tmp6)
tmp6 = tl.sum(_tmp6, 1)[:, None]
tl.store(out_ptr0 + x3, tmp6, xmask)
def get_args():
arg_0 = rand_strided((128, 56, 28, 28), (43904, 784, 28, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((1, 56, 1, 1, 13), (728, 1, 728, 728, 56), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_red_fused__native_batch_norm_legit_functional_68.run(*args, 728, 7720, grid=grid(728), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_red_fused__native_batch_norm_legit_functional_68.benchmark_all_configs(*args, 728, 7720, grid=grid(728))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
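# First stage of a split batch-norm reduction over N*H*W = 128 * 784 = 100352
# elements per channel: the sum is split 13 ways, so each of the 728 = 56
# channels x 13 splits programs accumulates 7720 elements, and the
# `tmp0 < 100352` guard masks the 8-element tail of the last split
# (13 * 7720 = 100360). A sketch of the same partial sums (layout here is
# (C, 13) rather than the kernel's channel-fastest (1, 56, 1, 1, 13)):
def _ref_split_sum_68(x, splits=13, split_len=7720):
    c = x.size(1)
    flat = x.float().transpose(0, 1).reshape(c, -1)      # (C, N*H*W)
    pad = splits * split_len - flat.size(1)              # 8 zeros at the tail
    flat = torch.nn.functional.pad(flat, (0, pad))
    return flat.reshape(c, splits, split_len).sum(-1)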
# kernel path: /tmp/torchinductor_shunting/ih/cihgiw3fxfncw6wmlie2fnq4egrnxl6aq2aj37mt4r5kpoemk2yw.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => add_59, convert_element_type_54, mul_82, mul_83, var_mean_11
triton_per_fused__native_batch_norm_legit_functional_69 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import persistent_reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@persistent_reduction(
size_hints=[64, 16],
reduction_hint=ReductionHint.INNER,
filename=__file__,
meta={'signature': {0: '*fp32', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: 'i32', 5: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3), equal_to_1=())]}
)
@triton.jit
def triton_per_fused__native_batch_norm_legit_functional_69(in_out_ptr0, in_ptr0, in_ptr1, out_ptr0, xnumel, rnumel, XBLOCK : tl.constexpr):
xnumel = 56
rnumel = 13
RBLOCK: tl.constexpr = 16
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
xmask = xindex < xnumel
rindex = tl.arange(0, RBLOCK)[None, :]
rmask = rindex < rnumel
r1 = rindex
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0 + (56*r1)), rmask & xmask, other=0)
tmp8 = tl.load(in_ptr1 + (x0), xmask)
tmp2 = tl.where(rmask & xmask, tmp0, 0)
tmp3 = tl.sum(tmp2, 1)[:, None]
tmp4 = 100352.0
tmp5 = tmp3 / tmp4
tmp6 = 0.1
tmp7 = tmp5 * tmp6
tmp9 = 0.9
tmp10 = tmp8 * tmp9
tmp11 = tmp7 + tmp10
tl.store(in_out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp5, xmask)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp11, xmask)
def get_args():
arg_0 = rand_strided((1, 56, 1, 1), (56, 1, 56, 56), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((1, 56, 1, 1, 13), (728, 1, 728, 728, 56), device='cuda:0', dtype=torch.float32)
arg_2 = rand_strided((56,), (1,), device='cuda:0', dtype=torch.float32)
arg_3 = rand_strided((56,), (1,), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1, arg_2, arg_3,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_per_fused__native_batch_norm_legit_functional_69.run(*args, 56, 13, grid=grid(56), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_per_fused__native_batch_norm_legit_functional_69.benchmark_all_configs(*args, 56, 13, grid=grid(56))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
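# Second stage: the 13 partials per channel collapse to the batch mean
# (sum / 100352), stored back in place, and the running mean is updated with
# momentum 0.1, i.e. running = 0.1 * batch_mean + 0.9 * running. In plain
# PyTorch (names are ours):
def _ref_running_mean_69(partials, running_mean, n=100352.0, momentum=0.1):
    batch_mean = partials.sum(-1) / n
    return batch_mean, momentum * batch_mean + (1.0 - momentum) * running_mean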
# kernel path: /tmp/torchinductor_shunting/n3/cn3m3v5dwzlicmheht7t2sqzrzgtbufmioizlp6yemcka6kewuok.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => convert_element_type_54, var_mean_11
triton_red_fused__native_batch_norm_legit_functional_70 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@reduction(
size_hints=[1024, 8192],
reduction_hint=ReductionHint.INNER,
filename=__file__,
meta={'signature': {0: '*fp16', 1: '*fp32', 2: '*fp32', 3: 'i32', 4: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_70(in_ptr0, in_ptr1, out_ptr0, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
xnumel = 728
rnumel = 7720
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
xmask = xindex < xnumel
rbase = tl.arange(0, RBLOCK)[None, :]
x1 = (xindex // 56)
x0 = xindex % 56
_tmp9 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
x3 = xindex
for roffset in range(0, rnumel, RBLOCK):
rindex = roffset + rbase
rmask = rindex < rnumel
r2 = rindex
tmp0 = r2 + (7720*x1)
tmp1 = 100352
tmp2 = tmp0 < tmp1
tmp3 = tl.load(in_ptr0 + ((784*x0) + (43904*(((r2 + (7720*x1)) // 784) % 128)) + ((r2 + (7720*x1)) % 784) + tl.zeros([XBLOCK, RBLOCK], tl.int32)), rmask & tmp2 & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp4 = tmp3.to(tl.float32)
tmp5 = tl.load(in_ptr1 + (x0 + tl.zeros([XBLOCK, RBLOCK], tl.int32)), rmask & tmp2 & xmask, eviction_policy='evict_last', other=0)
tmp6 = tmp4 - tmp5
tmp7 = tmp6 * tmp6
tmp8 = tl.where(tmp2, tmp7, 0)
_tmp9 = tl.where(rmask & xmask, _tmp9 + tmp8, _tmp9)
tmp9 = tl.sum(_tmp9, 1)[:, None]
tl.store(out_ptr0 + x3, tmp9, xmask)
def get_args():
arg_0 = rand_strided((128, 56, 28, 28), (43904, 784, 28, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((1, 56, 1, 1), (56, 1, 56, 56), device='cuda:0', dtype=torch.float32)
arg_2 = rand_strided((1, 56, 1, 1, 13), (728, 1, 728, 728, 56), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1, arg_2,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_red_fused__native_batch_norm_legit_functional_70.run(*args, 728, 7720, grid=grid(728), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_red_fused__native_batch_norm_legit_functional_70.benchmark_all_configs(*args, 728, 7720, grid=grid(728))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
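# Variance counterpart of kernel 68: the same 13-way split over 100352
# elements per channel, but each program accumulates squared deviations
# (x - batch_mean)^2 against the mean produced by kernel 69.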
# kernel path: /tmp/torchinductor_shunting/xv/cxvq5i34k2bihobv4ebll2eevqva3mot7bow45ecoiyqxocbuesz.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => add_58, add_60, convert_element_type_54, mul_84, mul_85, mul_86, rsqrt_11, squeeze_34, var_mean_11
triton_per_fused__native_batch_norm_legit_functional_71 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import persistent_reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@persistent_reduction(
size_hints=[64, 16],
reduction_hint=ReductionHint.INNER,
filename=__file__,
meta={'signature': {0: '*fp32', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: 'i32', 6: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4), equal_to_1=())]}
)
@triton.jit
def triton_per_fused__native_batch_norm_legit_functional_71(in_ptr0, in_ptr1, out_ptr0, out_ptr1, out_ptr2, xnumel, rnumel, XBLOCK : tl.constexpr):
xnumel = 56
rnumel = 13
RBLOCK: tl.constexpr = 16
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
xmask = xindex < xnumel
rindex = tl.arange(0, RBLOCK)[None, :]
rmask = rindex < rnumel
r1 = rindex
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0 + (56*r1)), rmask & xmask, other=0)
tmp13 = tl.load(in_ptr1 + (x0), xmask)
tmp2 = tl.where(rmask & xmask, tmp0, 0)
tmp3 = tl.sum(tmp2, 1)[:, None]
tmp4 = 100352.0
tmp5 = tmp3 / tmp4
tmp6 = 1e-05
tmp7 = tmp5 + tmp6
tmp8 = tl.math.rsqrt(tmp7)
tmp9 = 1.00000996502277
tmp10 = tmp5 * tmp9
tmp11 = 0.1
tmp12 = tmp10 * tmp11
tmp14 = 0.9
tmp15 = tmp13 * tmp14
tmp16 = tmp12 + tmp15
tl.store(out_ptr1 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp8, xmask)
tl.store(out_ptr2 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp16, xmask)
tl.store(out_ptr0 + x0, tmp3, xmask)
def get_args():
arg_0 = rand_strided((1, 56, 1, 1, 13), (728, 1, 728, 728, 56), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((56,), (1,), device='cuda:0', dtype=torch.float32)
arg_2 = rand_strided((1, 56, 1, 1), (56, 1, 56, 56), device='cuda:0', dtype=torch.float32)
arg_3 = rand_strided((56,), (1,), device='cuda:0', dtype=torch.float32)
arg_4 = rand_strided((56,), (1,), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1, arg_2, arg_3, arg_4,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_per_fused__native_batch_norm_legit_functional_71.run(*args, 56, 13, grid=grid(56), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_per_fused__native_batch_norm_legit_functional_71.benchmark_all_configs(*args, 56, 13, grid=grid(56))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
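# Final stats stage: biased variance = sum / 100352, invstd =
# rsqrt(variance + 1e-05), and the running variance is updated from the
# unbiased estimate, hence the constant 1.00000996502277 = 100352 / 100351
# (Bessel's correction n / (n - 1)):
#   running_var = 0.1 * (var * n / (n - 1)) + 0.9 * running_var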
# kernel path: /tmp/torchinductor_shunting/jw/cjwfycxx2mrgx2fsw6tfls7yyigmo7hmavgpkr2ob23aopbzsnjc.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => add_58, add_61, convert_element_type_54, convert_element_type_55, mul_81, mul_87, rsqrt_11, sub_11, var_mean_11
triton_poi_fused__native_batch_norm_legit_functional_72 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[8388608], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp16', 6: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6), equal_to_1=())]})
@triton.jit
def triton_poi_fused__native_batch_norm_legit_functional_72(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 5619712
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x3 = xindex
x1 = (xindex // 784) % 56
tmp0 = tl.load(in_ptr0 + (x3), None).to(tl.float32)
tmp2 = tl.load(in_ptr1 + (x1), None)
tmp4 = tl.load(in_ptr2 + (x1), None)
tmp11 = tl.load(in_ptr3 + (x1), None)
tmp13 = tl.load(in_ptr4 + (x1), None)
tmp1 = tmp0.to(tl.float32)
tmp3 = tmp1 - tmp2
tmp5 = 100352.0
tmp6 = tmp4 / tmp5
tmp7 = 1e-05
tmp8 = tmp6 + tmp7
tmp9 = tl.math.rsqrt(tmp8)
tmp10 = tmp3 * tmp9
tmp12 = tmp10 * tmp11
tmp14 = tmp12 + tmp13
tmp15 = tmp14.to(tl.float32)
tl.store(out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp15, None)
def get_args():
arg_0 = rand_strided((128, 56, 28, 28), (43904, 784, 28, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((1, 56, 1, 1), (56, 1, 56, 56), device='cuda:0', dtype=torch.float32)
arg_2 = rand_strided((1, 56, 1, 1), (56, 1, 56, 56), device='cuda:0', dtype=torch.float32)
arg_3 = rand_strided((56,), (1,), device='cuda:0', dtype=torch.float32)
arg_4 = rand_strided((56,), (1,), device='cuda:0', dtype=torch.float32)
arg_5 = rand_strided((128, 56, 28, 28), (43904, 784, 28, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__native_batch_norm_legit_functional_72.run(*args, 5619712, grid=grid(5619712), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__native_batch_norm_legit_functional_72.benchmark_all_configs(*args, 5619712, grid=grid(5619712))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
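# Elementwise application of the statistics from kernels 68-71 to the
# (128, 56, 28, 28) activation (no activation function follows this
# projection). An approximate eager form (helper is ours):
def _ref_bn_apply_72(x, mean, var_sum, weight, bias, eps=1e-05, n=100352):
    inv_std = torch.rsqrt(var_sum / n + eps)
    return ((x.float() - mean) * inv_std * weight.view(1, -1, 1, 1)
            + bias.view(1, -1, 1, 1)).half()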
# kernel path: /tmp/torchinductor_shunting/7e/c7eyndtglwpd6eg2g74cubgx2m45rn7sikd7mb4t4ucitntos24j.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_56
triton_poi_fused__to_copy_73 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[8192], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_73(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 4704
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((168, 28, 1, 1), (28, 1, 1, 1), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((168, 28, 1, 1), (28, 1, 1, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_73.run(*args, 4704, grid=grid(4704), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_73.benchmark_all_configs(*args, 4704, grid=grid(4704))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/g7/cg7zj3gxcfnefxp4hvh24f2ahem4qdf4cwmaspxgokny23kr7rdc.py
# Original ATen: aten.cat
# aten.cat => cat_6
triton_poi_fused_cat_74 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[33554432], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_cat_74(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 16859136
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x2 = xindex
x0 = xindex % 131712
x1 = (xindex // 131712)
tmp0 = tl.load(in_ptr0 + (x2), None).to(tl.float32)
tl.store(out_ptr0 + (x0 + (263424*x1) + tl.zeros([XBLOCK], tl.int32)), tmp0, None)
def get_args():
arg_0 = rand_strided((128, 168, 28, 28), (131712, 784, 28, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 168, 28, 28), (263424, 784, 28, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_cat_74.run(*args, 16859136, grid=grid(16859136), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_cat_74.benchmark_all_configs(*args, 16859136, grid=grid(16859136))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
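# Despite the aten.cat label, this is only the copy half of the concat: the
# 168-channel tensor lands in channels [0, 168) of a preallocated 336-channel
# buffer (destination batch stride 263424 = 2 * 131712); a sibling kernel
# fills channels [168, 336).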
# kernel path: /tmp/torchinductor_shunting/ea/cea2p3bterhsxcizlwjuqgqdcdagptpjutjfoisqqtcxx4bo46r2.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => add_63, add_64, add_65, convert_element_type_58, mul_89, mul_90, mul_91, mul_92, mul_93, rsqrt_12, squeeze_37, var_mean_12
triton_red_fused__native_batch_norm_legit_functional_75 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@reduction(
size_hints=[512, 131072],
reduction_hint=ReductionHint.INNER,
filename=__file__,
meta={'signature': {0: '*fp32', 1: '*fp16', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: 'i32', 9: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_75(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, out_ptr0, out_ptr1, out_ptr2, out_ptr3, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
xnumel = 336
rnumel = 100352
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
xmask = xindex < xnumel
rbase = tl.arange(0, RBLOCK)[None, :]
x0 = xindex
_tmp2 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
for roffset in range(0, rnumel, RBLOCK):
rindex = roffset + rbase
rmask = rindex < rnumel
r1 = rindex % 784
r2 = (rindex // 784)
tmp0 = tl.load(in_ptr0 + (r1 + (784*x0) + (263424*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
_tmp2 = tl.where(rmask & xmask, _tmp2 + tmp1, _tmp2)
tmp2 = tl.sum(_tmp2, 1)[:, None]
tmp7 = tl.load(in_ptr1 + (x0), xmask)
tmp3 = 100352.0
tmp4 = tmp2 / tmp3
tmp5 = 0.1
tmp6 = tmp4 * tmp5
tmp8 = 0.9
tmp9 = tmp7 * tmp8
tmp10 = tmp6 + tmp9
tl.store(in_out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp4, xmask)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp10, xmask)
_tmp15 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
for roffset in range(0, rnumel, RBLOCK):
rindex = roffset + rbase
rmask = rindex < rnumel
r1 = rindex % 784
r2 = (rindex // 784)
tmp11 = tl.load(in_ptr0 + (r1 + (784*x0) + (263424*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp12 = tmp11.to(tl.float32)
tmp13 = tmp12 - tmp4
tmp14 = tmp13 * tmp13
_tmp15 = tl.where(rmask & xmask, _tmp15 + tmp14, _tmp15)
tmp15 = tl.sum(_tmp15, 1)[:, None]
tl.store(out_ptr1 + x0, tmp15, xmask)
tmp25 = tl.load(in_ptr2 + (x0), xmask)
tmp16 = 100352.0
tmp17 = tmp15 / tmp16
tmp18 = 1e-05
tmp19 = tmp17 + tmp18
tmp20 = tl.math.rsqrt(tmp19)
tmp21 = 1.00000996502277
tmp22 = tmp17 * tmp21
tmp23 = 0.1
tmp24 = tmp22 * tmp23
tmp26 = 0.9
tmp27 = tmp25 * tmp26
tmp28 = tmp24 + tmp27
tl.store(out_ptr2 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp20, xmask)
tl.store(out_ptr3 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp28, xmask)
def get_args():
arg_0 = rand_strided((1, 336, 1, 1), (336, 1, 336, 336), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((128, 336, 28, 28), (263424, 784, 28, 1), device='cuda:0', dtype=torch.float16)
arg_2 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32)
arg_3 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32)
arg_4 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32)
arg_5 = rand_strided((1, 336, 1, 1), (336, 1, 336, 336), device='cuda:0', dtype=torch.float32)
arg_6 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32)
arg_7 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_red_fused__native_batch_norm_legit_functional_75.run(*args, 336, 100352, grid=grid(336), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_red_fused__native_batch_norm_legit_functional_75.benchmark_all_configs(*args, 336, 100352, grid=grid(336))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
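# Unlike the 68/69/70/71 pipeline, this kernel produces the full batch-norm
# statistics for the 336-channel tensor in a single launch: the first loop
# over all 100352 elements per channel accumulates the sum (mean and
# running-mean update), and the second loop re-reads the input to accumulate
# squared deviations (variance, invstd, and the running-var update with the
# same 100352 / 100351 Bessel factor). One program per channel, so no split
# stage is required.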
# kernel path: /tmp/torchinductor_shunting/3o/c3odnwpkqtnjm6osw5x7eu5soiuoiv27n2m7cqobgwum2f4c5rgk.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.add, aten.clone, aten.fill, aten.mul, aten.sigmoid, aten.sub
# aten._native_batch_norm_legit_functional => add_63, add_66, convert_element_type_58, convert_element_type_59, mul_88, mul_94, rsqrt_12, sub_12, var_mean_12
# aten.add => add_374
# aten.clone => clone_3
# aten.fill => full_like_44
# aten.mul => mul_1067, mul_1068
# aten.sigmoid => sigmoid_108
# aten.sub => sub_297
triton_poi_fused__native_batch_norm_legit_functional_add_clone_fill_mul_sigmoid_sub_76 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[67108864], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp16', 6: '*fp16', 7: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7), equal_to_1=())]})
@triton.jit
def triton_poi_fused__native_batch_norm_legit_functional_add_clone_fill_mul_sigmoid_sub_76(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, out_ptr0, out_ptr1, xnumel, XBLOCK : tl.constexpr):
xnumel = 33718272
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x3 = xindex
x1 = (xindex // 784) % 336
tmp0 = tl.load(in_ptr0 + (x3), None).to(tl.float32)
tmp2 = tl.load(in_ptr1 + (x1), None)
tmp4 = tl.load(in_ptr2 + (x1), None)
tmp11 = tl.load(in_ptr3 + (x1), None)
tmp13 = tl.load(in_ptr4 + (x1), None)
tmp1 = tmp0.to(tl.float32)
tmp3 = tmp1 - tmp2
tmp5 = 100352.0
tmp6 = tmp4 / tmp5
tmp7 = 1e-05
tmp8 = tmp6 + tmp7
tmp9 = tl.math.rsqrt(tmp8)
tmp10 = tmp3 * tmp9
tmp12 = tmp10 * tmp11
tmp14 = tmp12 + tmp13
tmp15 = tmp14.to(tl.float32)
tmp16 = tl.sigmoid(tmp15)
tmp17 = 1.0
tmp18 = tmp17 - tmp16
tmp19 = tmp15 * tmp18
tmp20 = tmp19 + tmp17
tmp21 = tmp16 * tmp20
tl.store(out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp15, None)
tl.store(out_ptr1 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp21, None)
def get_args():
arg_0 = rand_strided((128, 336, 28, 28), (263424, 784, 28, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((1, 336, 1, 1), (336, 1, 336, 336), device='cuda:0', dtype=torch.float32)
arg_2 = rand_strided((1, 336, 1, 1), (336, 1, 336, 336), device='cuda:0', dtype=torch.float32)
arg_3 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32)
arg_4 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32)
arg_5 = rand_strided((128, 336, 28, 28), (263424, 784, 28, 1), device='cuda:0', dtype=torch.float16)
arg_6 = rand_strided((128, 336, 28, 28), (263424, 784, 28, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__native_batch_norm_legit_functional_add_clone_fill_mul_sigmoid_sub_76.run(*args, 33718272, grid=grid(33718272), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__native_batch_norm_legit_functional_add_clone_fill_mul_sigmoid_sub_76.benchmark_all_configs(*args, 33718272, grid=grid(33718272))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
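# Besides the normalized output, this kernel precomputes the SiLU derivative
# needed by the backward pass (the clone/fill/mul/sigmoid/sub ops in the
# fused-node comment): with s = sigmoid(y),
#   d/dy silu(y) = s + y * s * (1 - s) = s * (1 + y * (1 - s)),
# which is exactly the tmp21 = tmp16 * tmp20 chain above.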
# kernel path: /tmp/torchinductor_shunting/nw/cnwbbjysyivhwwej3cvtzavdj6jvinxuti5ndhcg3qzuf77glcrt.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_62
triton_poi_fused__to_copy_77 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[2048], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_77(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 1512
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((168, 1, 3, 3), (9, 9, 3, 1), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((168, 1, 3, 3), (9, 9, 3, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_77.run(*args, 1512, grid=grid(1512), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_77.benchmark_all_configs(*args, 1512, grid=grid(1512))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/fz/cfzmnx3o626lf5s63xg32x6is4tmygw4nxdhdigpofuif52vxrzm.py
# Original ATen: aten.split_with_sizes
# aten.split_with_sizes => getitem_78
triton_poi_fused_split_with_sizes_78 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[33554432], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_split_with_sizes_78(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 16859136
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex % 131712
x1 = (xindex // 131712)
x2 = xindex
tmp0 = tl.load(in_ptr0 + (x0 + (263424*x1)), None).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
tmp2 = tl.sigmoid(tmp1)
tmp3 = tmp1 * tmp2
tmp4 = tmp3.to(tl.float32)
tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp4, None)
def get_args():
arg_0 = rand_strided((128, 336, 28, 28), (263424, 784, 28, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 168, 28, 28), (131712, 784, 28, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_split_with_sizes_78.run(*args, 16859136, grid=grid(16859136), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_split_with_sizes_78.benchmark_all_configs(*args, 16859136, grid=grid(16859136))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
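# Inverse of the copy in kernel 74: reads channels [0, 168) of the
# 336-channel tensor (offset 0 within the 263424 per-sample stride) and
# applies SiLU on the way out; kernel 80 below does the same for channels
# [168, 336), starting at offset 131712.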
# kernel path: /tmp/torchinductor_shunting/2r/c2rv7mzu7my2opxvcm4z3utkh7llovacnk6pwuzns4ehc3sgey2e.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_63
triton_poi_fused__to_copy_79 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[8192], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_79(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 4200
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((168, 1, 5, 5), (25, 25, 5, 1), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((168, 1, 5, 5), (25, 25, 5, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_79.run(*args, 4200, grid=grid(4200), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_79.benchmark_all_configs(*args, 4200, grid=grid(4200))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/pt/cptqmg7k6n2us7i5wywz2bwg6fjwqyaknishdkved5rkdpml5nw5.py
# Original ATen: aten.split_with_sizes
# aten.split_with_sizes => getitem_81
triton_poi_fused_split_with_sizes_80 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[33554432], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_split_with_sizes_80(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 16859136
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex % 131712
x1 = (xindex // 131712)
x2 = xindex
    tmp0 = tl.load(in_ptr0 + (131712 + x0 + (263424*x1)), None).to(tl.float32)  # offset 131712 = 168 * 784: second chunk of the channel split
tmp1 = tmp0.to(tl.float32)
tmp2 = tl.sigmoid(tmp1)
tmp3 = tmp1 * tmp2
tmp4 = tmp3.to(tl.float32)
tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp4, None)
def get_args():
arg_0 = rand_strided((128, 336, 28, 28), (263424, 784, 28, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 168, 28, 28), (131712, 784, 28, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_split_with_sizes_80.run(*args, 16859136, grid=grid(16859136), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_split_with_sizes_80.benchmark_all_configs(*args, 16859136, grid=grid(16859136))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
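# NOTE (editorial): a hedged eager-mode sketch of triton_poi_fused_split_with_sizes_80.
# The load offset 131712 equals 168 * 28 * 28, so the kernel reads the second
# 168-channel chunk of a (128, 336, 28, 28) activation and fuses SiLU into the copy:
def _sketch_split_silu(x):
    import torch.nn.functional as F
    _, hi = x.split([168, 168], dim=1)    # getitem_81 of the split_with_sizes
    return F.silu(hi).contiguous()        # silu(x) = x * sigmoid(x), materialized contiguously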
# kernel path: /tmp/torchinductor_shunting/jj/cjj6wuzjfwt7xigucunb4hdetjt4mr4ldxzq5ab4kxddbetbjdu4.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.mean, aten.silu
# aten._native_batch_norm_legit_functional => add_68, add_71, convert_element_type_64, convert_element_type_65, mul_102, mul_96, rsqrt_13, sub_13, var_mean_13
# aten.mean => mean_1
# aten.silu => convert_element_type_66, convert_element_type_67, mul_103, sigmoid_5
triton_per_fused__native_batch_norm_legit_functional_mean_silu_81 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import persistent_reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@persistent_reduction(
size_hints=[65536, 1024],
reduction_hint=ReductionHint.INNER,
filename=__file__,
meta={'signature': {0: '*fp16', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp16', 6: '*fp16', 7: 'i32', 8: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8), equal_to_1=())]}
)
@triton.jit
def triton_per_fused__native_batch_norm_legit_functional_mean_silu_81(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, out_ptr0, out_ptr2, xnumel, rnumel, XBLOCK : tl.constexpr):
xnumel = 43008
rnumel = 784
RBLOCK: tl.constexpr = 1024
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
xmask = xindex < xnumel
rindex = tl.arange(0, RBLOCK)[None, :]
rmask = rindex < rnumel
r2 = rindex
x3 = xindex
x0 = xindex % 336
tmp0 = tl.load(in_ptr0 + (r2 + (784*x3)), rmask, other=0).to(tl.float32)
tmp2 = tl.load(in_ptr1 + (x0), None)
tmp4 = tl.load(in_ptr2 + (x0), None)
tmp11 = tl.load(in_ptr3 + (x0), None)
tmp13 = tl.load(in_ptr4 + (x0), None)
tmp1 = tmp0.to(tl.float32)
tmp3 = tmp1 - tmp2
    tmp5 = 100352.0  # 128 * 28 * 28: elements per channel in the variance reduction
tmp6 = tmp4 / tmp5
tmp7 = 1e-05
tmp8 = tmp6 + tmp7
tmp9 = tl.math.rsqrt(tmp8)
tmp10 = tmp3 * tmp9
tmp12 = tmp10 * tmp11
tmp14 = tmp12 + tmp13
tmp15 = tmp14.to(tl.float32)
tmp16 = tmp15.to(tl.float32)
tmp17 = tl.sigmoid(tmp16)
tmp18 = tmp16 * tmp17
tmp19 = tmp18.to(tl.float32)
tmp20 = tmp19.to(tl.float32)
tmp22 = tl.where(rmask, tmp20, 0)
tmp23 = tl.sum(tmp22, 1)[:, None]
    tmp24 = 784.0  # 28 * 28: spatial positions in the per-(n, c) mean
tmp25 = tmp23 / tmp24
tmp26 = tmp25.to(tl.float32)
tl.store(out_ptr0 + (r2 + (784*x3) + tl.zeros([XBLOCK, RBLOCK], tl.int32)), tmp15, rmask)
tl.store(out_ptr2 + (x3 + tl.zeros([XBLOCK, 1], tl.int32)), tmp26, None)
def get_args():
arg_0 = rand_strided((128, 336, 28, 28), (263424, 784, 28, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((1, 336, 1, 1), (336, 1, 336, 336), device='cuda:0', dtype=torch.float32)
arg_2 = rand_strided((1, 336, 1, 1), (336, 1, 336, 336), device='cuda:0', dtype=torch.float32)
arg_3 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32)
arg_4 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32)
arg_5 = rand_strided((128, 336, 28, 28), (263424, 784, 28, 1), device='cuda:0', dtype=torch.float16)
arg_6 = rand_strided((128, 336, 1, 1), (336, 1, 1, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_per_fused__native_batch_norm_legit_functional_mean_silu_81.run(*args, 43008, 784, grid=grid(43008), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_per_fused__native_batch_norm_legit_functional_mean_silu_81.benchmark_all_configs(*args, 43008, 784, grid=grid(43008))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
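# NOTE (editorial): triton_per_fused__native_batch_norm_legit_functional_mean_silu_81
# normalizes with batch statistics produced by an earlier reduction kernel, writes the
# fp16 batch-norm output, and in the same pass computes the spatial mean of its SiLU
# (judging by the shapes, the global-average pool feeding a squeeze-and-excitation
# branch). A hedged sketch with names of our choosing:
def _sketch_bn_silu_mean(x, mean, var_sum, weight, bias):
    import torch.nn.functional as F
    var = var_sum / (128 * 28 * 28)                      # summed squared deviations -> variance
    y = (x.float() - mean) * torch.rsqrt(var + 1e-05)    # normalize
    y = (y * weight.view(1, -1, 1, 1) + bias.view(1, -1, 1, 1)).half()   # out_ptr0
    pooled = F.silu(y.float()).mean(dim=(2, 3), keepdim=True).half()     # out_ptr2
    return y, pooled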
# kernel path: /tmp/torchinductor_shunting/qi/cqigsniw4k2557tsg6szy4sxc76yhclqkbczo3sje6z3gkbkxim2.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_69
triton_poi_fused__to_copy_82 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[16384], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_82(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 9408
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((28, 336, 1, 1), (336, 1, 1, 1), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((28, 336, 1, 1), (336, 1, 1, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_82.run(*args, 9408, grid=grid(9408), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_82.benchmark_all_configs(*args, 9408, grid=grid(9408))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/qr/cqrzuzcrjaui22653nognmjx7khvfhrb54plybvtrgjuggwhd7ca.py
# Original ATen: aten._to_copy, aten.convolution
# aten._to_copy => convert_element_type_68
# aten.convolution => convolution_27
triton_poi_fused__to_copy_convolution_83 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[32], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_convolution_83(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 28
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((28,), (1,), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((28,), (1,), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_convolution_83.run(*args, 28, grid=grid(28), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_convolution_83.benchmark_all_configs(*args, 28, grid=grid(28))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/5j/c5jenet4bcjhmp6nmplvispvr7rwhrt4fonprb4atbj2oqpb6yhc.py
# Original ATen: aten._to_copy, aten.convolution, aten.silu
# aten._to_copy => convert_element_type_68
# aten.convolution => convolution_27
# aten.silu => convert_element_type_70, convert_element_type_71, mul_104, sigmoid_6
triton_poi_fused__to_copy_convolution_silu_84 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4096], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_convolution_silu_84(in_out_ptr0, in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 3584
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x2 = xindex
x0 = xindex % 28
tmp0 = tl.load(in_out_ptr0 + (x2), xmask).to(tl.float32)
tmp1 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
tmp2 = tmp0 + tmp1
tmp3 = tmp2.to(tl.float32)
tmp4 = tl.sigmoid(tmp3)
tmp5 = tmp3 * tmp4
tmp6 = tmp5.to(tl.float32)
tl.store(in_out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp2, xmask)
tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp6, xmask)
def get_args():
arg_0 = rand_strided((128, 28, 1, 1), (28, 1, 1, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((28,), (1,), device='cuda:0', dtype=torch.float16)
arg_2 = rand_strided((128, 28, 1, 1), (28, 1, 1, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1, arg_2,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_convolution_silu_84.run(*args, 3584, grid=grid(3584), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_convolution_silu_84.benchmark_all_configs(*args, 3584, grid=grid(3584))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
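# NOTE (editorial): triton_poi_fused__to_copy_convolution_silu_84 is a convolution
# epilogue: it folds the bias add into the buffer in place and also emits SiLU of the
# result (the shapes suggest the 336 -> 28 squeeze step of a squeeze-and-excitation
# block). A hedged sketch (names ours):
def _sketch_bias_silu(conv_out, bias):
    import torch.nn.functional as F
    conv_out += bias.view(1, -1, 1, 1)    # in_out_ptr0 is mutated in place
    return conv_out, F.silu(conv_out)     # (updated buffer, out_ptr0)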
# kernel path: /tmp/torchinductor_shunting/6q/c6qndeyorjgk72ipb7s7xexoyu3w64lakoglh4m45doekpaj4mk5.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_73
triton_poi_fused__to_copy_85 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[16384], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_85(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 9408
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((336, 28, 1, 1), (28, 1, 1, 1), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((336, 28, 1, 1), (28, 1, 1, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_85.run(*args, 9408, grid=grid(9408), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_85.benchmark_all_configs(*args, 9408, grid=grid(9408))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/e4/ce4gppgjp566e4o7bpfsqwx6ehxfdqd5kcqjzpnjkl7lictvy34j.py
# Original ATen: aten._to_copy, aten.convolution
# aten._to_copy => convert_element_type_72
# aten.convolution => convolution_28
triton_poi_fused__to_copy_convolution_86 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[512], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_convolution_86(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 336
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_convolution_86.run(*args, 336, grid=grid(336), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_convolution_86.benchmark_all_configs(*args, 336, grid=grid(336))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/6n/c6nqtrrohcuy7rc3a6jzkibcrswmbwqy32tjb2t44xhm5lv3c7ca.py
# Original ATen: aten._to_copy, aten.convolution
# aten._to_copy => convert_element_type_72
# aten.convolution => convolution_28
triton_poi_fused__to_copy_convolution_87 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[65536], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_convolution_87(in_out_ptr0, in_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 43008
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x2 = xindex
x0 = xindex % 336
tmp0 = tl.load(in_out_ptr0 + (x2), None).to(tl.float32)
tmp1 = tl.load(in_ptr0 + (x0), None).to(tl.float32)
tmp2 = tmp0 + tmp1
tl.store(in_out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp2, None)
def get_args():
arg_0 = rand_strided((128, 336, 1, 1), (336, 1, 1, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_convolution_87.run(*args, 43008, grid=grid(43008), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_convolution_87.benchmark_all_configs(*args, 43008, grid=grid(43008))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/tg/ctg6frnekhgugkbeugokpbetskj6nj7dyov4tmhfv6akngrmr3zz.py
# Original ATen: aten.mul, aten.sigmoid, aten.silu
# aten.mul => mul_105
# aten.sigmoid => sigmoid_7
# aten.silu => convert_element_type_66, convert_element_type_67, mul_103, sigmoid_5
triton_poi_fused_mul_sigmoid_silu_88 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[67108864], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3), equal_to_1=())]})
@triton.jit
def triton_poi_fused_mul_sigmoid_silu_88(in_ptr0, in_ptr1, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 33718272
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x2 = xindex
x1 = (xindex // 784)
tmp0 = tl.load(in_ptr0 + (x2), None).to(tl.float32)
tmp5 = tl.load(in_ptr1 + (x1), None).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
tmp2 = tl.sigmoid(tmp1)
tmp3 = tmp1 * tmp2
tmp4 = tmp3.to(tl.float32)
tmp6 = tl.sigmoid(tmp5)
tmp7 = tmp4 * tmp6
tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp7, None)
def get_args():
arg_0 = rand_strided((128, 336, 28, 28), (263424, 784, 28, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 336, 1, 1), (336, 1, 1, 1), device='cuda:0', dtype=torch.float16)
arg_2 = rand_strided((128, 336, 28, 28), (263424, 784, 28, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1, arg_2,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_mul_sigmoid_silu_88.run(*args, 33718272, grid=grid(33718272), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_mul_sigmoid_silu_88.benchmark_all_configs(*args, 33718272, grid=grid(33718272))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
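# NOTE (editorial): triton_poi_fused_mul_sigmoid_silu_88 recomputes SiLU of the main
# (128, 336, 28, 28) activation and scales it by the sigmoid of the per-channel
# excitation, i.e. the squeeze-and-excitation gating multiply. A hedged sketch
# (name ours):
def _sketch_se_gate(x, se):
    import torch.nn.functional as F
    # se has shape (128, 336, 1, 1) and broadcasts over the spatial dims
    return F.silu(x.float()).half() * torch.sigmoid(se)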
# kernel path: /tmp/torchinductor_shunting/uj/cujd4wpttqmud7zdg64vlcs4ef2kzahlsvrue6dsbdgqyeobae26.py
# Original ATen: aten.split_with_sizes
# aten.split_with_sizes => getitem_84
triton_poi_fused_split_with_sizes_89 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[33554432], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_split_with_sizes_89(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 16859136
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex % 131712
x1 = (xindex // 131712)
x2 = xindex
tmp0 = tl.load(in_ptr0 + (x0 + (263424*x1)), None).to(tl.float32)
tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp0, None)
def get_args():
arg_0 = rand_strided((128, 336, 28, 28), (263424, 784, 28, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 168, 28, 28), (131712, 784, 28, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_split_with_sizes_89.run(*args, 16859136, grid=grid(16859136), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_split_with_sizes_89.benchmark_all_configs(*args, 16859136, grid=grid(16859136))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/f3/cf3fhfcwg6xxca6ynhdijefpawpp34qudotfwxc7l4ytcq3zjpuo.py
# Original ATen: aten.split_with_sizes
# aten.split_with_sizes => getitem_85
triton_poi_fused_split_with_sizes_90 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[33554432], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_split_with_sizes_90(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 16859136
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex % 131712
x1 = (xindex // 131712)
x2 = xindex
tmp0 = tl.load(in_ptr0 + (131712 + x0 + (263424*x1)), None).to(tl.float32)
tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp0, None)
def get_args():
arg_0 = rand_strided((128, 336, 28, 28), (263424, 784, 28, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 168, 28, 28), (131712, 784, 28, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_split_with_sizes_90.run(*args, 16859136, grid=grid(16859136), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_split_with_sizes_90.benchmark_all_configs(*args, 16859136, grid=grid(16859136))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
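# NOTE (editorial): kernels 89 and 90 are the two branches of the same channel split;
# unlike kernel 80 there is no fused activation, only a strided copy that makes each
# half contiguous. A hedged sketch (name ours):
def _sketch_split_copy(x):
    a, b = x.split([168, 168], dim=1)        # getitem_84 / getitem_85
    return a.contiguous(), b.contiguous()    # each (128, 168, 28, 28)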
# kernel path: /tmp/torchinductor_shunting/mu/cmu55cboklppuar2bfoeffop2b46e55c3whe4q3ppvxcrz2secws.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_74
triton_poi_fused__to_copy_91 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[8192], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_91(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 4704
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((28, 168, 1, 1), (168, 1, 1, 1), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((28, 168, 1, 1), (168, 1, 1, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_91.run(*args, 4704, grid=grid(4704), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_91.benchmark_all_configs(*args, 4704, grid=grid(4704))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/p7/cp7uk2n6iplc4okgdyrcwjh6klja4hwa7hip53lwkxv2iafhns22.py
# Original ATen: aten.cat
# aten.cat => cat_8
triton_poi_fused_cat_92 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_cat_92(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 2809856
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x2 = xindex
x0 = xindex % 21952
x1 = (xindex // 21952)
tmp0 = tl.load(in_ptr0 + (x2), None).to(tl.float32)
tl.store(out_ptr0 + (x0 + (43904*x1) + tl.zeros([XBLOCK], tl.int32)), tmp0, None)
def get_args():
arg_0 = rand_strided((128, 28, 28, 28), (21952, 784, 28, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 28, 28, 28), (43904, 784, 28, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_cat_92.run(*args, 2809856, grid=grid(2809856), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_cat_92.benchmark_all_configs(*args, 2809856, grid=grid(2809856))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
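# NOTE (editorial): triton_poi_fused_cat_92 is one branch of a channel concat: it
# copies a 28-channel tensor into a destination whose batch stride is
# 43904 = 56 * 28 * 28, i.e. the cat is realized as strided writes into a
# preallocated 56-channel buffer. A hedged sketch (name ours):
def _sketch_cat_branch(src, out):
    # src: (128, 28, 28, 28); out: (128, 56, 28, 28)
    out[:, :28].copy_(src)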
# kernel path: /tmp/torchinductor_shunting/go/cgo47t53ri3q3acehvndov22nzujsisq6mopaip7tcxdzexe45pe.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.add
# aten._native_batch_norm_legit_functional => add_73, add_76, convert_element_type_76, convert_element_type_77, mul_106, mul_112, rsqrt_14, sub_14, var_mean_14
# aten.add => add_77
triton_poi_fused__native_batch_norm_legit_functional_add_93 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[8388608], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp16', 6: '*fp16', 7: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7), equal_to_1=())]})
@triton.jit
def triton_poi_fused__native_batch_norm_legit_functional_add_93(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 5619712
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x3 = xindex
x1 = (xindex // 784) % 56
tmp0 = tl.load(in_ptr0 + (x3), None).to(tl.float32)
tmp2 = tl.load(in_ptr1 + (x1), None)
tmp4 = tl.load(in_ptr2 + (x1), None)
tmp11 = tl.load(in_ptr3 + (x1), None)
tmp13 = tl.load(in_ptr4 + (x1), None)
tmp16 = tl.load(in_ptr5 + (x3), None).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
tmp3 = tmp1 - tmp2
tmp5 = 100352.0
tmp6 = tmp4 / tmp5
tmp7 = 1e-05
tmp8 = tmp6 + tmp7
tmp9 = tl.math.rsqrt(tmp8)
tmp10 = tmp3 * tmp9
tmp12 = tmp10 * tmp11
tmp14 = tmp12 + tmp13
tmp15 = tmp14.to(tl.float32)
tmp17 = tmp15 + tmp16
tl.store(out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp17, None)
def get_args():
arg_0 = rand_strided((128, 56, 28, 28), (43904, 784, 28, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((1, 56, 1, 1), (56, 1, 56, 56), device='cuda:0', dtype=torch.float32)
arg_2 = rand_strided((1, 56, 1, 1), (56, 1, 56, 56), device='cuda:0', dtype=torch.float32)
arg_3 = rand_strided((56,), (1,), device='cuda:0', dtype=torch.float32)
arg_4 = rand_strided((56,), (1,), device='cuda:0', dtype=torch.float32)
arg_5 = rand_strided((128, 56, 28, 28), (43904, 784, 28, 1), device='cuda:0', dtype=torch.float16)
arg_6 = rand_strided((128, 56, 28, 28), (43904, 784, 28, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__native_batch_norm_legit_functional_add_93.run(*args, 5619712, grid=grid(5619712), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__native_batch_norm_legit_functional_add_93.benchmark_all_configs(*args, 5619712, grid=grid(5619712))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
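# NOTE (editorial): triton_poi_fused__native_batch_norm_legit_functional_add_93 fuses
# the projection batch norm with the residual shortcut add. A hedged sketch
# (names ours):
def _sketch_bn_residual(x, mean, var_sum, weight, bias, shortcut):
    var = var_sum / (128 * 28 * 28)
    y = (x.float() - mean) * torch.rsqrt(var + 1e-05)
    y = y * weight.view(1, -1, 1, 1) + bias.view(1, -1, 1, 1)
    return y.half() + shortcut    # out_ptr0, fp16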
# kernel path: /tmp/torchinductor_shunting/cm/ccmet4jdw3a22ehivtutado4ilowri7xe4wvz55tt3jrgkttrcxr.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_122
triton_poi_fused__to_copy_94 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[32768], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_94(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 18816
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((336, 56, 1, 1), (56, 1, 1, 1), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((336, 56, 1, 1), (56, 1, 1, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_94.run(*args, 18816, grid=grid(18816), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_94.benchmark_all_configs(*args, 18816, grid=grid(18816))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/2z/c2zspsupbpoyi6v25rww6qizjecactzymkuzhmgwbksaajrrvebj.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_127
triton_poi_fused__to_copy_95 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[1024], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_95(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 1008
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((112, 1, 3, 3), (9, 9, 3, 1), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((112, 1, 3, 3), (9, 9, 3, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_95.run(*args, 1008, grid=grid(1008), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_95.benchmark_all_configs(*args, 1008, grid=grid(1008))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/vt/cvthhjpoxq63zz4mggqx2vqnhtco6vgp67jmycwwem5t4nuguc3d.py
# Original ATen: aten.split_with_sizes
# aten.split_with_sizes => getitem_125
triton_poi_fused_split_with_sizes_96 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[16777216], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_split_with_sizes_96(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 11239424
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex % 87808
x1 = (xindex // 87808)
x2 = xindex
tmp0 = tl.load(in_ptr0 + (x0 + (263424*x1)), None).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
tmp2 = tl.sigmoid(tmp1)
tmp3 = tmp1 * tmp2
tmp4 = tmp3.to(tl.float32)
tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp4, None)
def get_args():
arg_0 = rand_strided((128, 336, 28, 28), (263424, 784, 28, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 112, 28, 28), (87808, 784, 28, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_split_with_sizes_96.run(*args, 11239424, grid=grid(11239424), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_split_with_sizes_96.benchmark_all_configs(*args, 11239424, grid=grid(11239424))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/67/c67f43otxwovlhoxcmgghbvuamebcise6w4hyt6kxev6rjigyz6t.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_128
triton_poi_fused__to_copy_97 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4096], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_97(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 2800
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((112, 1, 5, 5), (25, 25, 5, 1), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((112, 1, 5, 5), (25, 25, 5, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_97.run(*args, 2800, grid=grid(2800), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_97.benchmark_all_configs(*args, 2800, grid=grid(2800))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/mp/cmpup3qxitd5cnywi6sc5ctsik3jls3ax4u6bwwg7l22muv3kqac.py
# Original ATen: aten.split_with_sizes
# aten.split_with_sizes => getitem_129
triton_poi_fused_split_with_sizes_98 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[16777216], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_split_with_sizes_98(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 11239424
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex % 87808
x1 = (xindex // 87808)
x2 = xindex
tmp0 = tl.load(in_ptr0 + (87808 + x0 + (263424*x1)), None).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
tmp2 = tl.sigmoid(tmp1)
tmp3 = tmp1 * tmp2
tmp4 = tmp3.to(tl.float32)
tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp4, None)
def get_args():
arg_0 = rand_strided((128, 336, 28, 28), (263424, 784, 28, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 112, 28, 28), (87808, 784, 28, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_split_with_sizes_98.run(*args, 11239424, grid=grid(11239424), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_split_with_sizes_98.benchmark_all_configs(*args, 11239424, grid=grid(11239424))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/hj/chjntoqpyuayr43gwoduaxa3dcaerfocp4evoj27g5z63hwjt7yd.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_129
triton_poi_fused__to_copy_99 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[8192], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_99(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 5488
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((112, 1, 7, 7), (49, 49, 7, 1), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((112, 1, 7, 7), (49, 49, 7, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_99.run(*args, 5488, grid=grid(5488), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_99.benchmark_all_configs(*args, 5488, grid=grid(5488))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/5y/c5yntu5zez2ni5e2qpl6ynioug5zthjmijm3yvywigtt4blzf6l6.py
# Original ATen: aten.split_with_sizes
# aten.split_with_sizes => getitem_133
triton_poi_fused_split_with_sizes_100 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[16777216], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_split_with_sizes_100(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 11239424
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex % 87808
x1 = (xindex // 87808)
x2 = xindex
tmp0 = tl.load(in_ptr0 + (175616 + x0 + (263424*x1)), None).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
tmp2 = tl.sigmoid(tmp1)
tmp3 = tmp1 * tmp2
tmp4 = tmp3.to(tl.float32)
tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp4, None)
def get_args():
arg_0 = rand_strided((128, 336, 28, 28), (263424, 784, 28, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 112, 28, 28), (87808, 784, 28, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_split_with_sizes_100.run(*args, 11239424, grid=grid(11239424), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_split_with_sizes_100.benchmark_all_configs(*args, 11239424, grid=grid(11239424))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
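# NOTE (editorial): kernels 96, 98 and 100 are the three branches of a
# split_with_sizes([112, 112, 112], dim=1) on a (128, 336, 28, 28) activation, each
# fused with SiLU (load offsets 0, 87808 and 175616, where 87808 = 112 * 28 * 28).
# A hedged sketch (name ours):
def _sketch_three_way_split_silu(x):
    import torch.nn.functional as F
    return tuple(F.silu(c).contiguous() for c in x.split([112, 112, 112], dim=1))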
# kernel path: /tmp/torchinductor_shunting/hi/chiefsb3dple2qr2zd5ph5efd2rhpwu722ib7qhnwdnwgxusl6o7.py
# Original ATen: aten.cat
# aten.cat => cat_15
triton_poi_fused_cat_101 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_cat_101(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 2809856
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x2 = xindex
x0 = xindex % 21952
x1 = (xindex // 21952)
tmp0 = tl.load(in_ptr0 + (x2), None).to(tl.float32)
tl.store(out_ptr0 + (x0 + (65856*x1) + tl.zeros([XBLOCK], tl.int32)), tmp0, None)
def get_args():
arg_0 = rand_strided((128, 112, 14, 14), (21952, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 112, 14, 14), (65856, 196, 14, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_cat_101.run(*args, 2809856, grid=grid(2809856), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_cat_101.benchmark_all_configs(*args, 2809856, grid=grid(2809856))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/hj/chj56tyq633bknygfo23skutbtotcilbxvxls3o3esvqkeosnqzu.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => add_116, add_117, add_118, convert_element_type_130, mul_172, mul_173, mul_174, mul_175, mul_176, rsqrt_22, squeeze_67, var_mean_22
triton_red_fused__native_batch_norm_legit_functional_102 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@reduction(
size_hints=[512, 32768],
reduction_hint=ReductionHint.INNER,
filename=__file__,
meta={'signature': {0: '*fp32', 1: '*fp16', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: 'i32', 9: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_102(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, out_ptr0, out_ptr1, out_ptr2, out_ptr3, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
xnumel = 336
rnumel = 25088
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
xmask = xindex < xnumel
rbase = tl.arange(0, RBLOCK)[None, :]
x0 = xindex
    _tmp2 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0  # accumulator for the per-channel sum (pass 1)
for roffset in range(0, rnumel, RBLOCK):
rindex = roffset + rbase
rmask = rindex < rnumel
r1 = rindex % 196
r2 = (rindex // 196)
tmp0 = tl.load(in_ptr0 + (r1 + (196*x0) + (65856*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
_tmp2 = tl.where(rmask & xmask, _tmp2 + tmp1, _tmp2)
tmp2 = tl.sum(_tmp2, 1)[:, None]
tmp7 = tl.load(in_ptr1 + (x0), xmask)
    tmp3 = 25088.0  # 128 * 14 * 14: elements per channel
    tmp4 = tmp2 / tmp3
    tmp5 = 0.1  # BN momentum: running = 0.1 * batch + 0.9 * running
tmp6 = tmp4 * tmp5
tmp8 = 0.9
tmp9 = tmp7 * tmp8
tmp10 = tmp6 + tmp9
tl.store(in_out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp4, xmask)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp10, xmask)
    _tmp15 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0  # accumulator for the squared deviations (pass 2)
for roffset in range(0, rnumel, RBLOCK):
rindex = roffset + rbase
rmask = rindex < rnumel
r1 = rindex % 196
r2 = (rindex // 196)
tmp11 = tl.load(in_ptr0 + (r1 + (196*x0) + (65856*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp12 = tmp11.to(tl.float32)
tmp13 = tmp12 - tmp4
tmp14 = tmp13 * tmp13
_tmp15 = tl.where(rmask & xmask, _tmp15 + tmp14, _tmp15)
tmp15 = tl.sum(_tmp15, 1)[:, None]
tl.store(out_ptr1 + x0, tmp15, xmask)
tmp25 = tl.load(in_ptr2 + (x0), xmask)
tmp16 = 25088.0
tmp17 = tmp15 / tmp16
tmp18 = 1e-05
tmp19 = tmp17 + tmp18
tmp20 = tl.math.rsqrt(tmp19)
    tmp21 = 1.0000398612827361  # 25088 / 25087: Bessel correction for the running variance
tmp22 = tmp17 * tmp21
tmp23 = 0.1
tmp24 = tmp22 * tmp23
tmp26 = 0.9
tmp27 = tmp25 * tmp26
tmp28 = tmp24 + tmp27
tl.store(out_ptr2 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp20, xmask)
tl.store(out_ptr3 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp28, xmask)
def get_args():
arg_0 = rand_strided((1, 336, 1, 1), (336, 1, 336, 336), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((128, 336, 14, 14), (65856, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_2 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32)
arg_3 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32)
arg_4 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32)
arg_5 = rand_strided((1, 336, 1, 1), (336, 1, 336, 336), device='cuda:0', dtype=torch.float32)
arg_6 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32)
arg_7 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_red_fused__native_batch_norm_legit_functional_102.run(*args, 336, 25088, grid=grid(336), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_red_fused__native_batch_norm_legit_functional_102.benchmark_all_configs(*args, 336, 25088, grid=grid(336))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
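# Editor's sketch (not generated output): an eager-mode equivalent of the fused
# batch-norm statistics kernel above, with illustrative names. It computes the
# per-channel mean, rsqrt(var + eps), and the momentum-0.1 running-stat updates
# that the kernel writes to in_out_ptr0 / out_ptr2 / out_ptr0 / out_ptr3.
def _sketch_bn_stats(x, running_mean, running_var, momentum=0.1, eps=1e-05):
    x = x.float()                               # stats are accumulated in fp32
    n = x.numel() // x.shape[1]                 # 128 * 14 * 14 = 25088
    mean = x.mean(dim=(0, 2, 3))
    var = x.var(dim=(0, 2, 3), unbiased=False)
    rstd = (var + eps).rsqrt()
    new_mean = momentum * mean + (1 - momentum) * running_mean
    new_var = momentum * var * (n / (n - 1)) + (1 - momentum) * running_var
    return mean, rstd, new_mean, new_var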
# kernel path: /tmp/torchinductor_shunting/zj/czji7shyj4kk6bl2rcdf5euq7ku4vpm3paub7sdz5rtqhc3o2xxw.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.mean, aten.silu
# aten._native_batch_norm_legit_functional => add_116, add_119, convert_element_type_130, convert_element_type_131, mul_171, mul_177, rsqrt_22, sub_22, var_mean_22
# aten.mean => mean_4
# aten.silu => convert_element_type_132, convert_element_type_133, mul_178, sigmoid_17
triton_per_fused__native_batch_norm_legit_functional_mean_silu_103 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import persistent_reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@persistent_reduction(
size_hints=[65536, 256],
reduction_hint=ReductionHint.INNER,
filename=__file__,
meta={'signature': {0: '*fp16', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp16', 6: '*fp16', 7: 'i32', 8: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7), equal_to_1=())]}
)
@triton.jit
def triton_per_fused__native_batch_norm_legit_functional_mean_silu_103(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, out_ptr0, out_ptr2, xnumel, rnumel, XBLOCK : tl.constexpr):
xnumel = 43008
rnumel = 196
RBLOCK: tl.constexpr = 256
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
xmask = xindex < xnumel
rindex = tl.arange(0, RBLOCK)[None, :]
rmask = rindex < rnumel
r2 = rindex
x3 = xindex
x0 = xindex % 336
tmp0 = tl.load(in_ptr0 + (r2 + (196*x3)), rmask, other=0).to(tl.float32)
tmp2 = tl.load(in_ptr1 + (x0), None)
tmp4 = tl.load(in_ptr2 + (x0), None)
tmp11 = tl.load(in_ptr3 + (x0), None)
tmp13 = tl.load(in_ptr4 + (x0), None)
tmp1 = tmp0.to(tl.float32)
tmp3 = tmp1 - tmp2
tmp5 = 25088.0
tmp6 = tmp4 / tmp5
tmp7 = 1e-05
tmp8 = tmp6 + tmp7
tmp9 = tl.math.rsqrt(tmp8)
tmp10 = tmp3 * tmp9
tmp12 = tmp10 * tmp11
tmp14 = tmp12 + tmp13
tmp15 = tmp14.to(tl.float32)
tmp16 = tmp15.to(tl.float32)
tmp17 = tl.sigmoid(tmp16)
tmp18 = tmp16 * tmp17
tmp19 = tmp18.to(tl.float32)
tmp20 = tmp19.to(tl.float32)
tmp22 = tl.where(rmask, tmp20, 0)
tmp23 = tl.sum(tmp22, 1)[:, None]
tmp24 = 196.0
tmp25 = tmp23 / tmp24
tmp26 = tmp25.to(tl.float32)
tl.store(out_ptr0 + (r2 + (196*x3) + tl.zeros([XBLOCK, RBLOCK], tl.int32)), tmp15, rmask)
tl.store(out_ptr2 + (x3 + tl.zeros([XBLOCK, 1], tl.int32)), tmp26, None)
def get_args():
arg_0 = rand_strided((128, 336, 14, 14), (65856, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((1, 336, 1, 1), (336, 1, 336, 336), device='cuda:0', dtype=torch.float32)
arg_2 = rand_strided((1, 336, 1, 1), (336, 1, 336, 336), device='cuda:0', dtype=torch.float32)
arg_3 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32)
arg_4 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32)
arg_5 = rand_strided((128, 336, 14, 14), (65856, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_6 = rand_strided((128, 336, 1, 1), (336, 1, 1, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_per_fused__native_batch_norm_legit_functional_mean_silu_103.run(*args, 43008, 196, grid=grid(43008), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_per_fused__native_batch_norm_legit_functional_mean_silu_103.benchmark_all_configs(*args, 43008, 196, grid=grid(43008))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
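# Editor's sketch: the persistent reduction above finishes batch norm with the
# stats from kernel 102, applies SiLU, and emits the per-(n, c) spatial average
# that feeds the squeeze-and-excitation branch. Hedged eager equivalent:
def _sketch_bn_silu_squeeze(x, mean, var_sum, gamma, beta, eps=1e-05):
    rstd = (var_sum / 25088.0 + eps).rsqrt()
    y = ((x.float() - mean) * rstd * gamma.view(1, -1, 1, 1)
         + beta.view(1, -1, 1, 1)).half()       # out_ptr0: normalized activation
    pooled = torch.nn.functional.silu(y.float()).mean(dim=(2, 3), keepdim=True)
    return y, pooled.half()                     # out_ptr2: SE "squeeze" input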
# kernel path: /tmp/torchinductor_shunting/vd/cvdkhtrpiolmsts2zro46mftpapkihzk36p6zhnrps3p4cq3krmr.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_135
triton_poi_fused__to_copy_104 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[8192], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_104(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 4704
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((14, 336, 1, 1), (336, 1, 1, 1), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((14, 336, 1, 1), (336, 1, 1, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_104.run(*args, 4704, grid=grid(4704), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_104.benchmark_all_configs(*args, 4704, grid=grid(4704))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
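# Editor's note: this and the other *_to_copy kernels (105, 107, 109, 115, ...)
# are plain dtype casts generated for aten._to_copy; each reads an fp32 weight
# or bias and writes the fp16 copy consumed by the half-precision convolutions,
# i.e. the eager-mode `w.half()`.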
# kernel path: /tmp/torchinductor_shunting/tr/ctr74i6h7fbsohivhhoxxpx2ftnqmpxexofhzvpjxs3qwsbp6lt2.py
# Original ATen: aten._to_copy, aten.convolution
# aten._to_copy => convert_element_type_134
# aten.convolution => convolution_51
triton_poi_fused__to_copy_convolution_105 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[16], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_convolution_105(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 14
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((14,), (1,), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((14,), (1,), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_convolution_105.run(*args, 14, grid=grid(14), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_convolution_105.benchmark_all_configs(*args, 14, grid=grid(14))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/ko/ckojvusnljgriux3ampytpmwhilvjdtesa7avk7kubgd6p562p34.py
# Original ATen: aten._to_copy, aten.convolution, aten.silu
# aten._to_copy => convert_element_type_134
# aten.convolution => convolution_51
# aten.silu => convert_element_type_136, convert_element_type_137, mul_179, sigmoid_18
triton_poi_fused__to_copy_convolution_silu_106 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[2048], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_convolution_silu_106(in_out_ptr0, in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 1792
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x2 = xindex
x0 = xindex % 14
tmp0 = tl.load(in_out_ptr0 + (x2), xmask).to(tl.float32)
tmp1 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
tmp2 = tmp0 + tmp1
tmp3 = tmp2.to(tl.float32)
tmp4 = tl.sigmoid(tmp3)
tmp5 = tmp3 * tmp4
tmp6 = tmp5.to(tl.float32)
tl.store(in_out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp2, xmask)
tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp6, xmask)
def get_args():
arg_0 = rand_strided((128, 14, 1, 1), (14, 1, 1, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((14,), (1,), device='cuda:0', dtype=torch.float16)
arg_2 = rand_strided((128, 14, 1, 1), (14, 1, 1, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1, arg_2,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_convolution_silu_106.run(*args, 1792, grid=grid(1792), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_convolution_silu_106.benchmark_all_configs(*args, 1792, grid=grid(1792))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
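# Editor's sketch: kernel 106 fuses the bias add of the 1x1 squeeze-and-
# excitation "reduce" convolution (336 -> 14 channels) with SiLU; the biased
# sum is written back in place (in_out_ptr0) and the activation to out_ptr0.
def _sketch_se_reduce_act(conv_out, bias):
    z = conv_out + bias.view(1, -1, 1, 1)
    return z, torch.nn.functional.silu(z.float()).half()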
# kernel path: /tmp/torchinductor_shunting/wd/cwdr2zuaja73on5ahrizfe4fw5rmvg66qvnjezuugxyt6mvgktuo.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_139
triton_poi_fused__to_copy_107 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[8192], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_107(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 4704
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((336, 14, 1, 1), (14, 1, 1, 1), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((336, 14, 1, 1), (14, 1, 1, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_107.run(*args, 4704, grid=grid(4704), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_107.benchmark_all_configs(*args, 4704, grid=grid(4704))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/i4/ci4cud6xszhpzrrtirjqfbkilqlzpxjq4f4rcbjmyxesmegdihcj.py
# Original ATen: aten.mul, aten.sigmoid, aten.silu
# aten.mul => mul_180
# aten.sigmoid => sigmoid_19
# aten.silu => convert_element_type_132, convert_element_type_133, mul_178, sigmoid_17
triton_poi_fused_mul_sigmoid_silu_108 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[16777216], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3), equal_to_1=())]})
@triton.jit
def triton_poi_fused_mul_sigmoid_silu_108(in_ptr0, in_ptr1, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 8429568
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x2 = xindex
x1 = (xindex // 196)
tmp0 = tl.load(in_ptr0 + (x2), None).to(tl.float32)
tmp5 = tl.load(in_ptr1 + (x1), None).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
tmp2 = tl.sigmoid(tmp1)
tmp3 = tmp1 * tmp2
tmp4 = tmp3.to(tl.float32)
tmp6 = tl.sigmoid(tmp5)
tmp7 = tmp4 * tmp6
tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp7, None)
def get_args():
arg_0 = rand_strided((128, 336, 14, 14), (65856, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 336, 1, 1), (336, 1, 1, 1), device='cuda:0', dtype=torch.float16)
arg_2 = rand_strided((128, 336, 14, 14), (65856, 196, 14, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1, arg_2,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_mul_sigmoid_silu_108.run(*args, 8429568, grid=grid(8429568), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_mul_sigmoid_silu_108.benchmark_all_configs(*args, 8429568, grid=grid(8429568))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
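# Editor's sketch: kernel 108 is the squeeze-and-excitation "excite" step: it
# recomputes SiLU on the batch-norm output and scales it channel-wise by the
# sigmoid of the 14 -> 336 gate. Hedged equivalent:
def _sketch_se_scale(bn_out, gate):
    act = torch.nn.functional.silu(bn_out.float()).half()
    return act * torch.sigmoid(gate)            # gate broadcasts over H and W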
# kernel path: /tmp/torchinductor_shunting/am/camchutakzc7ufjmewydlcbxzwspii4jktvvytxzvdtkijmunoki.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_140
triton_poi_fused__to_copy_109 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[65536], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_109(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 34944
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((104, 336, 1, 1), (336, 1, 1, 1), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((104, 336, 1, 1), (336, 1, 1, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_109.run(*args, 34944, grid=grid(34944), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_109.benchmark_all_configs(*args, 34944, grid=grid(34944))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/yx/cyxdjlki3rk4s5ktbskifaxqivcqtfxhzou2fgeig544h2budbbn.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => convert_element_type_141, var_mean_23
triton_red_fused__native_batch_norm_legit_functional_110 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@reduction(
size_hints=[512, 8192],
reduction_hint=ReductionHint.INNER,
filename=__file__,
meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32', 3: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_110(in_ptr0, out_ptr0, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
xnumel = 416
rnumel = 6272
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
xmask = xindex < xnumel
rbase = tl.arange(0, RBLOCK)[None, :]
x0 = xindex % 104
x1 = (xindex // 104)
_tmp2 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
x3 = xindex
for roffset in range(0, rnumel, RBLOCK):
rindex = roffset + rbase
rmask = rindex < rnumel
r2 = rindex
tmp0 = tl.load(in_ptr0 + ((196*x0) + (20384*(r2 // 196)) + (652288*x1) + (r2 % 196)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
_tmp2 = tl.where(rmask & xmask, _tmp2 + tmp1, _tmp2)
tmp2 = tl.sum(_tmp2, 1)[:, None]
tl.store(out_ptr0 + x3, tmp2, xmask)
def get_args():
arg_0 = rand_strided((128, 104, 14, 14), (20384, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((1, 104, 1, 1, 4), (416, 1, 416, 416, 104), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_red_fused__native_batch_norm_legit_functional_110.run(*args, 416, 6272, grid=grid(416), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_red_fused__native_batch_norm_legit_functional_110.benchmark_all_configs(*args, 416, 6272, grid=grid(416))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
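# Editor's note: kernels 110-113 compute the same statistics as kernel 102 but
# as a two-stage split reduction: stage one (110 for the sums, 112 for the
# squared deviations) emits 4 partial results per channel (xnumel = 104 * 4),
# each covering 32 of the 128 samples; stage two (111/113) sums the partials
# and applies the momentum and rsqrt math. Hedged sketch of stage one:
def _sketch_partial_channel_sums(x):
    # (128, 104, 14, 14) -> (4, 104): one partial sum per 32-sample slice
    return x.float().reshape(4, 32, 104, 196).sum(dim=(1, 3))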
# kernel path: /tmp/torchinductor_shunting/ns/cnsyz4u2t4xdpizytxz7kuybbnglpm544k5kmwi5e6mw37eoeeyp.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => add_122, convert_element_type_141, mul_182, mul_183, var_mean_23
triton_per_fused__native_batch_norm_legit_functional_111 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import persistent_reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@persistent_reduction(
size_hints=[128, 4],
reduction_hint=ReductionHint.INNER,
filename=__file__,
meta={'signature': {0: '*fp32', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: 'i32', 5: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3), equal_to_1=())]}
)
@triton.jit
def triton_per_fused__native_batch_norm_legit_functional_111(in_out_ptr0, in_ptr0, in_ptr1, out_ptr0, xnumel, rnumel, XBLOCK : tl.constexpr):
xnumel = 104
rnumel = 4
RBLOCK: tl.constexpr = 4
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
xmask = xindex < xnumel
rindex = tl.arange(0, RBLOCK)[None, :]
rmask = rindex < rnumel
r1 = rindex
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0 + (104*r1)), rmask & xmask, other=0)
tmp8 = tl.load(in_ptr1 + (x0), xmask)
tmp2 = tl.where(rmask & xmask, tmp0, 0)
tmp3 = tl.sum(tmp2, 1)[:, None]
tmp4 = 25088.0
tmp5 = tmp3 / tmp4
tmp6 = 0.1
tmp7 = tmp5 * tmp6
tmp9 = 0.9
tmp10 = tmp8 * tmp9
tmp11 = tmp7 + tmp10
tl.store(in_out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp5, xmask)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp11, xmask)
def get_args():
arg_0 = rand_strided((1, 104, 1, 1), (104, 1, 104, 104), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((1, 104, 1, 1, 4), (416, 1, 416, 416, 104), device='cuda:0', dtype=torch.float32)
arg_2 = rand_strided((104,), (1,), device='cuda:0', dtype=torch.float32)
arg_3 = rand_strided((104,), (1,), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1, arg_2, arg_3,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_per_fused__native_batch_norm_legit_functional_111.run(*args, 104, 4, grid=grid(104), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_per_fused__native_batch_norm_legit_functional_111.benchmark_all_configs(*args, 104, 4, grid=grid(104))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/vx/cvx6izhz5yqxyjo6cfzkguw47brvifvvxn3jp6fmmqpamdr6rdo7.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => convert_element_type_141, var_mean_23
triton_red_fused__native_batch_norm_legit_functional_112 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@reduction(
size_hints=[512, 8192],
reduction_hint=ReductionHint.INNER,
filename=__file__,
meta={'signature': {0: '*fp16', 1: '*fp32', 2: '*fp32', 3: 'i32', 4: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_112(in_ptr0, in_ptr1, out_ptr0, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
xnumel = 416
rnumel = 6272
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
xmask = xindex < xnumel
rbase = tl.arange(0, RBLOCK)[None, :]
x0 = xindex % 104
x1 = (xindex // 104)
tmp2 = tl.load(in_ptr1 + (x0), xmask)
_tmp5 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
x3 = xindex
for roffset in range(0, rnumel, RBLOCK):
rindex = roffset + rbase
rmask = rindex < rnumel
r2 = rindex
tmp0 = tl.load(in_ptr0 + ((196*x0) + (20384*(r2 // 196)) + (652288*x1) + (r2 % 196)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
tmp3 = tmp1 - tmp2
tmp4 = tmp3 * tmp3
_tmp5 = tl.where(rmask & xmask, _tmp5 + tmp4, _tmp5)
tmp5 = tl.sum(_tmp5, 1)[:, None]
tl.store(out_ptr0 + x3, tmp5, xmask)
def get_args():
arg_0 = rand_strided((128, 104, 14, 14), (20384, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((1, 104, 1, 1), (104, 1, 104, 104), device='cuda:0', dtype=torch.float32)
arg_2 = rand_strided((1, 104, 1, 1, 4), (416, 1, 416, 416, 104), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1, arg_2,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_red_fused__native_batch_norm_legit_functional_112.run(*args, 416, 6272, grid=grid(416), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_red_fused__native_batch_norm_legit_functional_112.benchmark_all_configs(*args, 416, 6272, grid=grid(416))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/cy/ccy7dhumtl6nx7vjrgetby6qbjqdcsh4fen2e264olgvvilit6pl.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => add_121, add_123, convert_element_type_141, mul_184, mul_185, mul_186, rsqrt_23, squeeze_70, var_mean_23
triton_per_fused__native_batch_norm_legit_functional_113 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import persistent_reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@persistent_reduction(
size_hints=[128, 4],
reduction_hint=ReductionHint.INNER,
filename=__file__,
meta={'signature': {0: '*fp32', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: 'i32', 6: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4), equal_to_1=())]}
)
@triton.jit
def triton_per_fused__native_batch_norm_legit_functional_113(in_ptr0, in_ptr1, out_ptr0, out_ptr1, out_ptr2, xnumel, rnumel, XBLOCK : tl.constexpr):
xnumel = 104
rnumel = 4
RBLOCK: tl.constexpr = 4
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
xmask = xindex < xnumel
rindex = tl.arange(0, RBLOCK)[None, :]
rmask = rindex < rnumel
r1 = rindex
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0 + (104*r1)), rmask & xmask, other=0)
tmp13 = tl.load(in_ptr1 + (x0), xmask)
tmp2 = tl.where(rmask & xmask, tmp0, 0)
tmp3 = tl.sum(tmp2, 1)[:, None]
tmp4 = 25088.0
tmp5 = tmp3 / tmp4
tmp6 = 1e-05
tmp7 = tmp5 + tmp6
tmp8 = tl.math.rsqrt(tmp7)
tmp9 = 1.0000398612827361
tmp10 = tmp5 * tmp9
tmp11 = 0.1
tmp12 = tmp10 * tmp11
tmp14 = 0.9
tmp15 = tmp13 * tmp14
tmp16 = tmp12 + tmp15
tl.store(out_ptr1 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp8, xmask)
tl.store(out_ptr2 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp16, xmask)
tl.store(out_ptr0 + x0, tmp3, xmask)
def get_args():
arg_0 = rand_strided((1, 104, 1, 1, 4), (416, 1, 416, 416, 104), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((104,), (1,), device='cuda:0', dtype=torch.float32)
arg_2 = rand_strided((1, 104, 1, 1), (104, 1, 104, 104), device='cuda:0', dtype=torch.float32)
arg_3 = rand_strided((104,), (1,), device='cuda:0', dtype=torch.float32)
arg_4 = rand_strided((104,), (1,), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1, arg_2, arg_3, arg_4,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_per_fused__native_batch_norm_legit_functional_113.run(*args, 104, 4, grid=grid(104), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_per_fused__native_batch_norm_legit_functional_113.benchmark_all_configs(*args, 104, 4, grid=grid(104))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/xg/cxgqusw3xzpsm5ih6qyxh7qdzmvcnjj2evihvlzncrorcyi7altv.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => add_121, add_124, convert_element_type_141, convert_element_type_142, mul_181, mul_187, rsqrt_23, sub_23, var_mean_23
triton_poi_fused__native_batch_norm_legit_functional_114 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp16', 6: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6), equal_to_1=())]})
@triton.jit
def triton_poi_fused__native_batch_norm_legit_functional_114(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 2609152
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x3 = xindex
x1 = (xindex // 196) % 104
tmp0 = tl.load(in_ptr0 + (x3), None).to(tl.float32)
tmp2 = tl.load(in_ptr1 + (x1), None)
tmp4 = tl.load(in_ptr2 + (x1), None)
tmp11 = tl.load(in_ptr3 + (x1), None)
tmp13 = tl.load(in_ptr4 + (x1), None)
tmp1 = tmp0.to(tl.float32)
tmp3 = tmp1 - tmp2
tmp5 = 25088.0
tmp6 = tmp4 / tmp5
tmp7 = 1e-05
tmp8 = tmp6 + tmp7
tmp9 = tl.math.rsqrt(tmp8)
tmp10 = tmp3 * tmp9
tmp12 = tmp10 * tmp11
tmp14 = tmp12 + tmp13
tmp15 = tmp14.to(tl.float32)
tl.store(out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp15, None)
def get_args():
arg_0 = rand_strided((128, 104, 14, 14), (20384, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((1, 104, 1, 1), (104, 1, 104, 104), device='cuda:0', dtype=torch.float32)
arg_2 = rand_strided((1, 104, 1, 1), (104, 1, 104, 104), device='cuda:0', dtype=torch.float32)
arg_3 = rand_strided((104,), (1,), device='cuda:0', dtype=torch.float32)
arg_4 = rand_strided((104,), (1,), device='cuda:0', dtype=torch.float32)
arg_5 = rand_strided((128, 104, 14, 14), (20384, 196, 14, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__native_batch_norm_legit_functional_114.run(*args, 2609152, grid=grid(2609152), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__native_batch_norm_legit_functional_114.benchmark_all_configs(*args, 2609152, grid=grid(2609152))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
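# Editor's sketch: kernel 114 is the pointwise "apply" half of batch norm,
# consuming the stats produced by kernels 110-113. Hedged equivalent:
def _sketch_bn_apply(x, mean, var_sum, gamma, beta, eps=1e-05):
    rstd = (var_sum / 25088.0 + eps).rsqrt()
    return ((x.float() - mean) * rstd * gamma.view(1, -1, 1, 1)
            + beta.view(1, -1, 1, 1)).half()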
# kernel path: /tmp/torchinductor_shunting/hr/chrygikuwep25dgozsipotfnih6lajyye2txuo4r7put7ywkdk3t.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_143
triton_poi_fused__to_copy_115 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[16384], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_115(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 16224
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((312, 52, 1, 1), (52, 1, 1, 1), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((312, 52, 1, 1), (52, 1, 1, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_115.run(*args, 16224, grid=grid(16224), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_115.benchmark_all_configs(*args, 16224, grid=grid(16224))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/ca/ccaahgmwlqiraz4ivxpktecmbqhvntosz4cdfe5ghfjm4pe6pws6.py
# Original ATen: aten.cat
# aten.cat => cat_16
triton_poi_fused_cat_116 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[8388608], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_cat_116(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 7827456
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x2 = xindex
x0 = xindex % 61152
x1 = (xindex // 61152)
tmp0 = tl.load(in_ptr0 + (x2), None).to(tl.float32)
tl.store(out_ptr0 + (x0 + (122304*x1) + tl.zeros([XBLOCK], tl.int32)), tmp0, None)
def get_args():
arg_0 = rand_strided((128, 312, 14, 14), (61152, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 312, 14, 14), (122304, 196, 14, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_cat_116.run(*args, 7827456, grid=grid(7827456), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_cat_116.benchmark_all_configs(*args, 7827456, grid=grid(7827456))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
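# Editor's note: kernel 116 is one half of aten.cat along the channel axis: it
# copies a (128, 312, 14, 14) tensor into channels 0:312 of the 624-channel
# destination (row stride 122304 = 624 * 196); the remaining channels are
# presumably filled by a matching copy elsewhere in the file.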
# kernel path: /tmp/torchinductor_shunting/j3/cj3tf4apq2n25yzonidfbwyd3cn3jiyqodrw5hjo73xovhn5czvu.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => add_126, add_127, add_128, convert_element_type_145, mul_189, mul_190, mul_191, mul_192, mul_193, rsqrt_24, squeeze_73, var_mean_24
triton_red_fused__native_batch_norm_legit_functional_117 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@reduction(
size_hints=[1024, 32768],
reduction_hint=ReductionHint.INNER,
filename=__file__,
meta={'signature': {0: '*fp32', 1: '*fp16', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: 'i32', 9: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_117(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, out_ptr0, out_ptr1, out_ptr2, out_ptr3, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
xnumel = 624
rnumel = 25088
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
xmask = xindex < xnumel
rbase = tl.arange(0, RBLOCK)[None, :]
x0 = xindex
_tmp2 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
for roffset in range(0, rnumel, RBLOCK):
rindex = roffset + rbase
rmask = rindex < rnumel
r1 = rindex % 196
r2 = (rindex // 196)
tmp0 = tl.load(in_ptr0 + (r1 + (196*x0) + (122304*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
_tmp2 = tl.where(rmask & xmask, _tmp2 + tmp1, _tmp2)
tmp2 = tl.sum(_tmp2, 1)[:, None]
tmp7 = tl.load(in_ptr1 + (x0), xmask)
tmp3 = 25088.0
tmp4 = tmp2 / tmp3
tmp5 = 0.1
tmp6 = tmp4 * tmp5
tmp8 = 0.9
tmp9 = tmp7 * tmp8
tmp10 = tmp6 + tmp9
tl.store(in_out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp4, xmask)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp10, xmask)
_tmp15 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
for roffset in range(0, rnumel, RBLOCK):
rindex = roffset + rbase
rmask = rindex < rnumel
r1 = rindex % 196
r2 = (rindex // 196)
tmp11 = tl.load(in_ptr0 + (r1 + (196*x0) + (122304*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp12 = tmp11.to(tl.float32)
tmp13 = tmp12 - tmp4
tmp14 = tmp13 * tmp13
_tmp15 = tl.where(rmask & xmask, _tmp15 + tmp14, _tmp15)
tmp15 = tl.sum(_tmp15, 1)[:, None]
tl.store(out_ptr1 + x0, tmp15, xmask)
tmp25 = tl.load(in_ptr2 + (x0), xmask)
tmp16 = 25088.0
tmp17 = tmp15 / tmp16
tmp18 = 1e-05
tmp19 = tmp17 + tmp18
tmp20 = tl.math.rsqrt(tmp19)
tmp21 = 1.0000398612827361
tmp22 = tmp17 * tmp21
tmp23 = 0.1
tmp24 = tmp22 * tmp23
tmp26 = 0.9
tmp27 = tmp25 * tmp26
tmp28 = tmp24 + tmp27
tl.store(out_ptr2 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp20, xmask)
tl.store(out_ptr3 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp28, xmask)
def get_args():
arg_0 = rand_strided((1, 624, 1, 1), (624, 1, 624, 624), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_2 = rand_strided((624,), (1,), device='cuda:0', dtype=torch.float32)
arg_3 = rand_strided((624,), (1,), device='cuda:0', dtype=torch.float32)
arg_4 = rand_strided((624,), (1,), device='cuda:0', dtype=torch.float32)
arg_5 = rand_strided((1, 624, 1, 1), (624, 1, 624, 624), device='cuda:0', dtype=torch.float32)
arg_6 = rand_strided((624,), (1,), device='cuda:0', dtype=torch.float32)
arg_7 = rand_strided((624,), (1,), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_red_fused__native_batch_norm_legit_functional_117.run(*args, 624, 25088, grid=grid(624), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_red_fused__native_batch_norm_legit_functional_117.benchmark_all_configs(*args, 624, 25088, grid=grid(624))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
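# Editor's note: kernel 117 repeats the kernel-102 stats pattern (mean,
# variance, rsqrt and momentum-0.1 running updates in one pass) for the
# 624-channel concatenated activation.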
# kernel path: /tmp/torchinductor_shunting/yk/cyk2njvgthyqkaax3hrogqqjnq7t4z4h2bdza6wc2nyqhlzwiqur.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.add, aten.clone, aten.fill, aten.mul, aten.sigmoid, aten.sub
# aten._native_batch_norm_legit_functional => add_126, add_129, convert_element_type_145, convert_element_type_146, mul_188, mul_194, rsqrt_24, sub_24, var_mean_24
# aten.add => add_355
# aten.clone => clone_15
# aten.fill => full_like_32
# aten.mul => mul_907, mul_908
# aten.sigmoid => sigmoid_96
# aten.sub => sub_233
triton_poi_fused__native_batch_norm_legit_functional_add_clone_fill_mul_sigmoid_sub_118 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[16777216], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp16', 6: '*fp16', 7: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7), equal_to_1=())]})
@triton.jit
def triton_poi_fused__native_batch_norm_legit_functional_add_clone_fill_mul_sigmoid_sub_118(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, out_ptr0, out_ptr1, xnumel, XBLOCK : tl.constexpr):
xnumel = 15654912
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x3 = xindex
x1 = (xindex // 196) % 624
tmp0 = tl.load(in_ptr0 + (x3), None).to(tl.float32)
tmp2 = tl.load(in_ptr1 + (x1), None)
tmp4 = tl.load(in_ptr2 + (x1), None)
tmp11 = tl.load(in_ptr3 + (x1), None)
tmp13 = tl.load(in_ptr4 + (x1), None)
tmp1 = tmp0.to(tl.float32)
tmp3 = tmp1 - tmp2
tmp5 = 25088.0
tmp6 = tmp4 / tmp5
tmp7 = 1e-05
tmp8 = tmp6 + tmp7
tmp9 = tl.math.rsqrt(tmp8)
tmp10 = tmp3 * tmp9
tmp12 = tmp10 * tmp11
tmp14 = tmp12 + tmp13
tmp15 = tmp14.to(tl.float32)
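# the block below evaluates sigmoid(x) * (1 + x * (1 - sigmoid(x))), i.e.
# d/dx silu(x), saved for the backward pass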
tmp16 = tl.sigmoid(tmp15)
tmp17 = 1.0
tmp18 = tmp17 - tmp16
tmp19 = tmp15 * tmp18
tmp20 = tmp19 + tmp17
tmp21 = tmp16 * tmp20
tl.store(out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp15, None)
tl.store(out_ptr1 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp21, None)
def get_args():
arg_0 = rand_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((1, 624, 1, 1), (624, 1, 624, 624), device='cuda:0', dtype=torch.float32)
arg_2 = rand_strided((1, 624, 1, 1), (624, 1, 624, 624), device='cuda:0', dtype=torch.float32)
arg_3 = rand_strided((624,), (1,), device='cuda:0', dtype=torch.float32)
arg_4 = rand_strided((624,), (1,), device='cuda:0', dtype=torch.float32)
arg_5 = rand_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_6 = rand_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__native_batch_norm_legit_functional_add_clone_fill_mul_sigmoid_sub_118.run(*args, 15654912, grid=grid(15654912), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__native_batch_norm_legit_functional_add_clone_fill_mul_sigmoid_sub_118.benchmark_all_configs(*args, 15654912, grid=grid(15654912))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
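# Editor's sketch: kernel 118 fuses the batch-norm "apply" step with the SiLU
# derivative needed later: out_ptr0 receives the normalized pre-activation and
# out_ptr1 its silu'(x). Hedged equivalent of the derivative:
def _sketch_silu_grad(x_hat):
    s = torch.sigmoid(x_hat)
    return s * (1 + x_hat * (1 - s))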
# kernel path: /tmp/torchinductor_shunting/h4/ch4fzflvelcyupsdemcjv4coogzqeqjolfo3rcegkby4xe2z7cgp.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_149
triton_poi_fused__to_copy_119 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[2048], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_119(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 1404
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((156, 1, 3, 3), (9, 9, 3, 1), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((156, 1, 3, 3), (9, 9, 3, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_119.run(*args, 1404, grid=grid(1404), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_119.benchmark_all_configs(*args, 1404, grid=grid(1404))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/ee/ceehri7sldlkmfyc53ikeryr64g77qkjawr2ddxdwntxstm4glbe.py
# Original ATen: aten.split_with_sizes
# aten.split_with_sizes => getitem_146
triton_poi_fused_split_with_sizes_120 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_split_with_sizes_120(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 3913728
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex % 30576
x1 = (xindex // 30576)
x2 = xindex
tmp0 = tl.load(in_ptr0 + (x0 + (122304*x1)), None).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
tmp2 = tl.sigmoid(tmp1)
tmp3 = tmp1 * tmp2
tmp4 = tmp3.to(tl.float32)
tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp4, None)
def get_args():
arg_0 = rand_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 156, 14, 14), (30576, 196, 14, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_split_with_sizes_120.run(*args, 3913728, grid=grid(3913728), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_split_with_sizes_120.benchmark_all_configs(*args, 3913728, grid=grid(3913728))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
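# Editor's note: kernels 120, 122 and 124 are aten.split_with_sizes fused with
# SiLU: the 624-channel activation is carved into 156-channel groups (read
# offsets 0, 30576 = 156*196, 61152 = 312*196) and each group gets its own
# depthwise convolution (the 3x3 / 5x5 / 7x7 weight casts in kernels 119, 121
# and 123 - a MixConv-style block). Hedged eager equivalent:
def _sketch_split_silu(x):
    groups = torch.split(x, 156, dim=1)         # four 156-channel groups
    return [torch.nn.functional.silu(g.float()).half() for g in groups]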
# kernel path: /tmp/torchinductor_shunting/3q/c3qy34jaqk4nnzsuou7s5dwcbuoi5kctwalhcspkfthqj2nvhjzq.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_150
triton_poi_fused__to_copy_121 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4096], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_121(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 3900
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((156, 1, 5, 5), (25, 25, 5, 1), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((156, 1, 5, 5), (25, 25, 5, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_121.run(*args, 3900, grid=grid(3900), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_121.benchmark_all_configs(*args, 3900, grid=grid(3900))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/d2/cd2noiyayhlfce4vrflvnr5n745imwlcczcywnnle3qtso27avlv.py
# Original ATen: aten.split_with_sizes
# aten.split_with_sizes => getitem_151
triton_poi_fused_split_with_sizes_122 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_split_with_sizes_122(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 3913728
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex % 30576
x1 = (xindex // 30576)
x2 = xindex
tmp0 = tl.load(in_ptr0 + (30576 + x0 + (122304*x1)), None).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
tmp2 = tl.sigmoid(tmp1)
tmp3 = tmp1 * tmp2
tmp4 = tmp3.to(tl.float32)
tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp4, None)
def get_args():
arg_0 = rand_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 156, 14, 14), (30576, 196, 14, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_split_with_sizes_122.run(*args, 3913728, grid=grid(3913728), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_split_with_sizes_122.benchmark_all_configs(*args, 3913728, grid=grid(3913728))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/on/conrh3mxhtsmg5nvtksxdme55y6bhfvwrrnpp6mhmwmulosn5uyv.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_151
triton_poi_fused__to_copy_123 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[8192], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_123(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 7644
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((156, 1, 7, 7), (49, 49, 7, 1), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((156, 1, 7, 7), (49, 49, 7, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_123.run(*args, 7644, grid=grid(7644), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_123.benchmark_all_configs(*args, 7644, grid=grid(7644))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/nc/cncwy3tqtyzgd3muyfflcfni7qls4ttqyb4sac6cmg6znll2bcqu.py
# Original ATen: aten.split_with_sizes
# aten.split_with_sizes => getitem_156
triton_poi_fused_split_with_sizes_124 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_split_with_sizes_124(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 3913728
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex % 30576
x1 = (xindex // 30576)
x2 = xindex
tmp0 = tl.load(in_ptr0 + (61152 + x0 + (122304*x1)), None).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
tmp2 = tl.sigmoid(tmp1)
tmp3 = tmp1 * tmp2
tmp4 = tmp3.to(tl.float32)
tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp4, None)
def get_args():
arg_0 = rand_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 156, 14, 14), (30576, 196, 14, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_split_with_sizes_124.run(*args, 3913728, grid=grid(3913728), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_split_with_sizes_124.benchmark_all_configs(*args, 3913728, grid=grid(3913728))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/yy/cyyxm3mybqiv5z5gofz4uhjttc6in5hwbrcbzt4xlp4z5lqyiqvr.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_152
triton_poi_fused__to_copy_125 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[16384], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_125(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 12636
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((156, 1, 9, 9), (81, 81, 9, 1), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((156, 1, 9, 9), (81, 81, 9, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_125.run(*args, 12636, grid=grid(12636), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_125.benchmark_all_configs(*args, 12636, grid=grid(12636))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/ta/ctar2nhvp2hnx6l3xel6gxwicmmw5nlhbt33l63xfadahaihh7qo.py
# Original ATen: aten.split_with_sizes
# aten.split_with_sizes => getitem_161
triton_poi_fused_split_with_sizes_126 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_split_with_sizes_126(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 3913728
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex % 30576
x1 = (xindex // 30576)
x2 = xindex
tmp0 = tl.load(in_ptr0 + (91728 + x0 + (122304*x1)), None).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
tmp2 = tl.sigmoid(tmp1)
tmp3 = tmp1 * tmp2
tmp4 = tmp3.to(tl.float32)
tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp4, None)
def get_args():
arg_0 = rand_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 156, 14, 14), (30576, 196, 14, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_split_with_sizes_126.run(*args, 3913728, grid=grid(3913728), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_split_with_sizes_126.benchmark_all_configs(*args, 3913728, grid=grid(3913728))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/aa/caaej5ipewhg7u3hng5mw3omnmn72k6eam7aoq4mbk5yqdeq7wwb.py
# Original ATen: aten.cat
# aten.cat => cat_17
triton_poi_fused_cat_127 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_cat_127(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 3913728
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x2 = xindex
x0 = xindex % 30576
x1 = (xindex // 30576)
tmp0 = tl.load(in_ptr0 + (x2), None).to(tl.float32)
tl.store(out_ptr0 + (x0 + (122304*x1) + tl.zeros([XBLOCK], tl.int32)), tmp0, None)
def get_args():
arg_0 = rand_strided((128, 156, 14, 14), (30576, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 156, 14, 14), (122304, 196, 14, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_cat_127.run(*args, 3913728, grid=grid(3913728), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_cat_127.benchmark_all_configs(*args, 3913728, grid=grid(3913728))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
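# Hand-written illustration (not Inductor output): the cat kernels carry no
# arithmetic; each copies a contiguous (128, 156, 14, 14) chunk into its slot
# of a (128, 624, 14, 14) buffer (note the output batch stride 122304 =
# 624*196 in get_args). Taken together, such copies are equivalent to:
def _reference_cat(chunks):
    # chunks: four (128, 156, 14, 14) fp16 tensors -> (128, 624, 14, 14)
    return torch.cat(chunks, dim=1)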
# kernel path: /tmp/torchinductor_shunting/xm/cxmjpqggq2cokpfycochmecokolhxsamegk6vtmem3x47oz5ss3r.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.mean, aten.silu
# aten._native_batch_norm_legit_functional => add_131, add_134, convert_element_type_153, convert_element_type_154, mul_196, mul_202, rsqrt_25, sub_25, var_mean_25
# aten.mean => mean_5
# aten.silu => convert_element_type_155, convert_element_type_156, mul_203, sigmoid_21
triton_per_fused__native_batch_norm_legit_functional_mean_silu_128 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import persistent_reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@persistent_reduction(
size_hints=[131072, 256],
reduction_hint=ReductionHint.INNER,
filename=__file__,
meta={'signature': {0: '*fp16', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp16', 6: '*fp16', 7: 'i32', 8: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7), equal_to_1=())]}
)
@triton.jit
def triton_per_fused__native_batch_norm_legit_functional_mean_silu_128(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, out_ptr0, out_ptr2, xnumel, rnumel, XBLOCK : tl.constexpr):
xnumel = 79872
rnumel = 196
RBLOCK: tl.constexpr = 256
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
xmask = xindex < xnumel
rindex = tl.arange(0, RBLOCK)[None, :]
rmask = rindex < rnumel
r2 = rindex
x3 = xindex
x0 = xindex % 624
tmp0 = tl.load(in_ptr0 + (r2 + (196*x3)), rmask, other=0).to(tl.float32)
tmp2 = tl.load(in_ptr1 + (x0), None)
tmp4 = tl.load(in_ptr2 + (x0), None)
tmp11 = tl.load(in_ptr3 + (x0), None)
tmp13 = tl.load(in_ptr4 + (x0), None)
tmp1 = tmp0.to(tl.float32)
tmp3 = tmp1 - tmp2
tmp5 = 25088.0
tmp6 = tmp4 / tmp5
tmp7 = 1e-05
tmp8 = tmp6 + tmp7
tmp9 = tl.math.rsqrt(tmp8)
tmp10 = tmp3 * tmp9
tmp12 = tmp10 * tmp11
tmp14 = tmp12 + tmp13
tmp15 = tmp14.to(tl.float32)
tmp16 = tmp15.to(tl.float32)
tmp17 = tl.sigmoid(tmp16)
tmp18 = tmp16 * tmp17
tmp19 = tmp18.to(tl.float32)
tmp20 = tmp19.to(tl.float32)
tmp22 = tl.where(rmask, tmp20, 0)
tmp23 = tl.sum(tmp22, 1)[:, None]
tmp24 = 196.0
tmp25 = tmp23 / tmp24
tmp26 = tmp25.to(tl.float32)
tl.store(out_ptr0 + (r2 + (196*x3) + tl.zeros([XBLOCK, RBLOCK], tl.int32)), tmp15, rmask)
tl.store(out_ptr2 + (x3 + tl.zeros([XBLOCK, 1], tl.int32)), tmp26, None)
def get_args():
arg_0 = rand_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((1, 624, 1, 1), (624, 1, 624, 624), device='cuda:0', dtype=torch.float32)
arg_2 = rand_strided((1, 624, 1, 1), (624, 1, 624, 624), device='cuda:0', dtype=torch.float32)
arg_3 = rand_strided((624,), (1,), device='cuda:0', dtype=torch.float32)
arg_4 = rand_strided((624,), (1,), device='cuda:0', dtype=torch.float32)
arg_5 = rand_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_6 = rand_strided((128, 624, 1, 1), (624, 1, 1, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_per_fused__native_batch_norm_legit_functional_mean_silu_128.run(*args, 79872, 196, grid=grid(79872), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_per_fused__native_batch_norm_legit_functional_mean_silu_128.benchmark_all_configs(*args, 79872, 196, grid=grid(79872))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
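# Hand-written illustration (not Inductor output): this persistent reduction
# normalizes with a precomputed per-channel mean and a variance *sum* (hence
# the division by 25088 = 128*14*14), applies the affine transform, and also
# emits the spatial mean of the SiLU activation, which the squeeze-and-excite
# branch below consumes. A minimal sketch under those assumptions:
def _reference_bn_silu_mean(x, mean, var_sum, weight, bias, eps=1e-5):
    # x: (128, 624, 14, 14) fp16; mean/var_sum: (1, 624, 1, 1) fp32;
    # weight/bias: (624,) fp32
    inv_std = torch.rsqrt(var_sum / 25088.0 + eps)
    y = (x.float() - mean) * inv_std * weight.view(1, -1, 1, 1) + bias.view(1, -1, 1, 1)
    y = y.to(torch.float16)                      # stored to out_ptr0
    pooled = torch.nn.functional.silu(y.float()).mean(dim=(2, 3), keepdim=True)
    return y, pooled.to(torch.float16)           # stored to out_ptr2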
# kernel path: /tmp/torchinductor_shunting/5n/c5nuz3wivd2kux327s5ffdifgf3i65ukistqypmzli4ipcs5fhup.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_158
triton_poi_fused__to_copy_129 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[16384], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_129(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 16224
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((26, 624, 1, 1), (624, 1, 1, 1), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((26, 624, 1, 1), (624, 1, 1, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_129.run(*args, 16224, grid=grid(16224), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_129.benchmark_all_configs(*args, 16224, grid=grid(16224))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/7u/c7uwxkw3ipfjfdehvzykp4imfsjz53vfoikfvt4ys7uwsknqbbb2.py
# Original ATen: aten._to_copy, aten.convolution
# aten._to_copy => convert_element_type_157
# aten.convolution => convolution_60
triton_poi_fused__to_copy_convolution_130 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[32], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_convolution_130(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 26
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((26,), (1,), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((26,), (1,), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_convolution_130.run(*args, 26, grid=grid(26), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_convolution_130.benchmark_all_configs(*args, 26, grid=grid(26))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/qj/cqjs4oas7ss654pge5hzaweuxapq6ge5y5fvo2ygxf7r6qylld6m.py
# Original ATen: aten._to_copy, aten.convolution, aten.silu
# aten._to_copy => convert_element_type_157
# aten.convolution => convolution_60
# aten.silu => convert_element_type_159, convert_element_type_160, mul_204, sigmoid_22
triton_poi_fused__to_copy_convolution_silu_131 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4096], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_convolution_silu_131(in_out_ptr0, in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 3328
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x2 = xindex
x0 = xindex % 26
tmp0 = tl.load(in_out_ptr0 + (x2), xmask).to(tl.float32)
tmp1 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
tmp2 = tmp0 + tmp1
tmp3 = tmp2.to(tl.float32)
tmp4 = tl.sigmoid(tmp3)
tmp5 = tmp3 * tmp4
tmp6 = tmp5.to(tl.float32)
tl.store(in_out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp2, xmask)
tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp6, xmask)
def get_args():
arg_0 = rand_strided((128, 26, 1, 1), (26, 1, 1, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((26,), (1,), device='cuda:0', dtype=torch.float16)
arg_2 = rand_strided((128, 26, 1, 1), (26, 1, 1, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1, arg_2,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_convolution_silu_131.run(*args, 3328, grid=grid(3328), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_convolution_silu_131.benchmark_all_configs(*args, 3328, grid=grid(3328))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
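# Hand-written illustration (not Inductor output): this kernel finishes the
# 624 -> 26 squeeze convolution by adding the bias (written back in place,
# matching in_out_ptr0) and storing SiLU of the result for the expand conv.
def _reference_se_reduce_act(conv_out, bias):
    # conv_out: (128, 26, 1, 1) fp16 (mutated), bias: (26,) fp16
    pre = conv_out.add_(bias.view(1, -1, 1, 1))
    return torch.nn.functional.silu(pre.float()).to(torch.float16)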
# kernel path: /tmp/torchinductor_shunting/af/cafnwdqexuqbr3u7uonjmdp2h6gms65ppr5ersr7levxxzbxsags.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_162
triton_poi_fused__to_copy_132 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[16384], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_132(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 16224
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((624, 26, 1, 1), (26, 1, 1, 1), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((624, 26, 1, 1), (26, 1, 1, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_132.run(*args, 16224, grid=grid(16224), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_132.benchmark_all_configs(*args, 16224, grid=grid(16224))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/qt/cqtzzxfgsd6elacosxssusc2qroqxqbum3szv62gh4gr6s5vzwmp.py
# Original ATen: aten._to_copy, aten.convolution
# aten._to_copy => convert_element_type_161
# aten.convolution => convolution_61
triton_poi_fused__to_copy_convolution_133 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[1024], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_convolution_133(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 624
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((624,), (1,), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((624,), (1,), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_convolution_133.run(*args, 624, grid=grid(624), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_convolution_133.benchmark_all_configs(*args, 624, grid=grid(624))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/hd/chd7xmae5qozjrly7porkx7hggrm6fh6klnsko3rzc52ohjqiyou.py
# Original ATen: aten._to_copy, aten.convolution
# aten._to_copy => convert_element_type_161
# aten.convolution => convolution_61
triton_poi_fused__to_copy_convolution_134 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[131072], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_convolution_134(in_out_ptr0, in_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 79872
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x2 = xindex
x0 = xindex % 624
tmp0 = tl.load(in_out_ptr0 + (x2), None).to(tl.float32)
tmp1 = tl.load(in_ptr0 + (x0), None).to(tl.float32)
tmp2 = tmp0 + tmp1
tl.store(in_out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp2, None)
def get_args():
arg_0 = rand_strided((128, 624, 1, 1), (624, 1, 1, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((624,), (1,), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_convolution_134.run(*args, 79872, grid=grid(79872), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_convolution_134.benchmark_all_configs(*args, 79872, grid=grid(79872))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
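# Hand-written illustration (not Inductor output): the 26 -> 624 expand side
# only folds the bias in here; the sigmoid gate is deferred to the multiply
# kernel that follows. A minimal in-place sketch:
def _reference_se_expand_bias(conv_out, bias):
    # conv_out: (128, 624, 1, 1) fp16 (mutated), bias: (624,) fp16
    return conv_out.add_(bias.view(1, -1, 1, 1))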
# kernel path: /tmp/torchinductor_shunting/mn/cmnvsfbxocmtd6hcxhi5n75ltfyzsnegxzpgv7o57n6ksfhmrkay.py
# Original ATen: aten.mul, aten.sigmoid, aten.silu
# aten.mul => mul_205
# aten.sigmoid => sigmoid_23
# aten.silu => convert_element_type_155, convert_element_type_156, mul_203, sigmoid_21
triton_poi_fused_mul_sigmoid_silu_135 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[16777216], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3), equal_to_1=())]})
@triton.jit
def triton_poi_fused_mul_sigmoid_silu_135(in_ptr0, in_ptr1, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 15654912
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x2 = xindex
x1 = (xindex // 196)
tmp0 = tl.load(in_ptr0 + (x2), None).to(tl.float32)
tmp5 = tl.load(in_ptr1 + (x1), None).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
tmp2 = tl.sigmoid(tmp1)
tmp3 = tmp1 * tmp2
tmp4 = tmp3.to(tl.float32)
tmp6 = tl.sigmoid(tmp5)
tmp7 = tmp4 * tmp6
tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp7, None)
def get_args():
arg_0 = rand_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 624, 1, 1), (624, 1, 1, 1), device='cuda:0', dtype=torch.float16)
arg_2 = rand_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1, arg_2,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_mul_sigmoid_silu_135.run(*args, 15654912, grid=grid(15654912), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_mul_sigmoid_silu_135.benchmark_all_configs(*args, 15654912, grid=grid(15654912))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
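# Hand-written illustration (not Inductor output): the excitation multiply
# recomputes SiLU of the batchnorm output on the fly rather than reloading a
# saved activation, then scales by the sigmoid of the SE logits. A sketch:
def _reference_se_excite(bn_out, se_logits):
    # bn_out: (128, 624, 14, 14) fp16, se_logits: (128, 624, 1, 1) fp16
    act = torch.nn.functional.silu(bn_out.float()).to(torch.float16)
    return act * torch.sigmoid(se_logits)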
# kernel path: /tmp/torchinductor_shunting/x6/cx65e4iuztiipxmj3kt6pdmtbqk6eud6jbchpllpc6tinjqjiko7.py
# Original ATen: aten.split_with_sizes
# aten.split_with_sizes => getitem_164
triton_poi_fused_split_with_sizes_136 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[8388608], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_split_with_sizes_136(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 7827456
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex % 61152
x1 = (xindex // 61152)
x2 = xindex
tmp0 = tl.load(in_ptr0 + (x0 + (122304*x1)), None).to(tl.float32)
tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp0, None)
def get_args():
arg_0 = rand_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 312, 14, 14), (61152, 196, 14, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_split_with_sizes_136.run(*args, 7827456, grid=grid(7827456), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_split_with_sizes_136.benchmark_all_configs(*args, 7827456, grid=grid(7827456))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/la/cla6pjsd7zhhqofhjak5ak67vzu4puphrosszybe7he2qayusdf7.py
# Original ATen: aten.split_with_sizes
# aten.split_with_sizes => getitem_165
triton_poi_fused_split_with_sizes_137 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[8388608], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_split_with_sizes_137(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 7827456
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex % 61152
x1 = (xindex // 61152)
x2 = xindex
tmp0 = tl.load(in_ptr0 + (61152 + x0 + (122304*x1)), None).to(tl.float32)
tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp0, None)
def get_args():
arg_0 = rand_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 312, 14, 14), (61152, 196, 14, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_split_with_sizes_137.run(*args, 7827456, grid=grid(7827456), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_split_with_sizes_137.benchmark_all_configs(*args, 7827456, grid=grid(7827456))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
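# Hand-written illustration (not Inductor output): kernels 136 and 137 copy
# the two 312-channel halves of the activation (offsets 0 and 61152 = 312*196)
# into contiguous buffers for the grouped 1x1 convolutions; no arithmetic is
# fused. Equivalent eager form:
def _reference_split_halves(x):
    # x: (128, 624, 14, 14) fp16 -> two contiguous (128, 312, 14, 14) copies
    lo, hi = torch.split(x, [312, 312], dim=1)
    return lo.contiguous(), hi.contiguous()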
# kernel path: /tmp/torchinductor_shunting/hz/chz3fw3o5fqe2cnzuiuggnyrnh3xzhhrw266gsgpwo3xgo4buev6.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_163
triton_poi_fused__to_copy_138 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[16384], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_138(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 16224
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((52, 312, 1, 1), (312, 1, 1, 1), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((52, 312, 1, 1), (312, 1, 1, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_138.run(*args, 16224, grid=grid(16224), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_138.benchmark_all_configs(*args, 16224, grid=grid(16224))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/vk/cvkwpn3gas36vom5boko6m7yjnsu4h3jn3s2dj74g4puyptio4dc.py
# Original ATen: aten.cat
# aten.cat => cat_18
triton_poi_fused_cat_139 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[2097152], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_cat_139(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 1304576
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x2 = xindex
x0 = xindex % 10192
x1 = (xindex // 10192)
tmp0 = tl.load(in_ptr0 + (x2), None).to(tl.float32)
tl.store(out_ptr0 + (x0 + (20384*x1) + tl.zeros([XBLOCK], tl.int32)), tmp0, None)
def get_args():
arg_0 = rand_strided((128, 52, 14, 14), (10192, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 52, 14, 14), (20384, 196, 14, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_cat_139.run(*args, 1304576, grid=grid(1304576), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_cat_139.benchmark_all_configs(*args, 1304576, grid=grid(1304576))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/qp/cqptss63kr7w22hqbaitci52teddi4haqxu6zws6bzlk7k3r4p5w.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.add
# aten._native_batch_norm_legit_functional => add_136, add_139, convert_element_type_165, convert_element_type_166, mul_206, mul_212, rsqrt_26, sub_26, var_mean_26
# aten.add => add_140
triton_poi_fused__native_batch_norm_legit_functional_add_140 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp16', 6: '*fp16', 7: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7), equal_to_1=())]})
@triton.jit
def triton_poi_fused__native_batch_norm_legit_functional_add_140(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 2609152
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x3 = xindex
x1 = (xindex // 196) % 104
tmp0 = tl.load(in_ptr0 + (x3), None).to(tl.float32)
tmp2 = tl.load(in_ptr1 + (x1), None)
tmp4 = tl.load(in_ptr2 + (x1), None)
tmp11 = tl.load(in_ptr3 + (x1), None)
tmp13 = tl.load(in_ptr4 + (x1), None)
tmp16 = tl.load(in_ptr5 + (x3), None).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
tmp3 = tmp1 - tmp2
tmp5 = 25088.0
tmp6 = tmp4 / tmp5
tmp7 = 1e-05
tmp8 = tmp6 + tmp7
tmp9 = tl.math.rsqrt(tmp8)
tmp10 = tmp3 * tmp9
tmp12 = tmp10 * tmp11
tmp14 = tmp12 + tmp13
tmp15 = tmp14.to(tl.float32)
tmp17 = tmp15 + tmp16
tl.store(out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp17, None)
def get_args():
arg_0 = rand_strided((128, 104, 14, 14), (20384, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((1, 104, 1, 1), (104, 1, 104, 104), device='cuda:0', dtype=torch.float32)
arg_2 = rand_strided((1, 104, 1, 1), (104, 1, 104, 104), device='cuda:0', dtype=torch.float32)
arg_3 = rand_strided((104,), (1,), device='cuda:0', dtype=torch.float32)
arg_4 = rand_strided((104,), (1,), device='cuda:0', dtype=torch.float32)
arg_5 = rand_strided((128, 104, 14, 14), (20384, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_6 = rand_strided((128, 104, 14, 14), (20384, 196, 14, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__native_batch_norm_legit_functional_add_140.run(*args, 2609152, grid=grid(2609152), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__native_batch_norm_legit_functional_add_140.benchmark_all_configs(*args, 2609152, grid=grid(2609152))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
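# Hand-written illustration (not Inductor output): the projection batchnorm
# and the block's residual add are fused into one pass over the
# (128, 104, 14, 14) output. A minimal sketch, using the same statistics
# convention as the batchnorm sketch above:
def _reference_bn_add(x, mean, var_sum, weight, bias, residual, eps=1e-5):
    inv_std = torch.rsqrt(var_sum / 25088.0 + eps)
    y = (x.float() - mean) * inv_std * weight.view(1, -1, 1, 1) + bias.view(1, -1, 1, 1)
    return y.to(torch.float16) + residual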
# kernel path: /tmp/torchinductor_shunting/zf/czfuu6dquzuudpwuu3bwtsd76hxju2ps43qrqh5npmmaayzjihi2.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_215
triton_poi_fused__to_copy_141 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[65536], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_141(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 64896
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((624, 104, 1, 1), (104, 1, 1, 1), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((624, 104, 1, 1), (104, 1, 1, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_141.run(*args, 64896, grid=grid(64896), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_141.benchmark_all_configs(*args, 64896, grid=grid(64896))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/hj/chjvw2ozl3ipjwmejxog5ggykkaa6pfst7j432f3sbhnt2kvth4p.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.add, aten.clone, aten.fill, aten.mul, aten.sigmoid, aten.silu, aten.sub
# aten._native_batch_norm_legit_functional => add_174, add_177, convert_element_type_216, convert_element_type_217, mul_263, mul_269, rsqrt_33, sub_33, var_mean_33
# aten.add => add_341
# aten.clone => clone_24
# aten.fill => full_like_23
# aten.mul => mul_787, mul_788
# aten.sigmoid => sigmoid_87
# aten.silu => convert_element_type_218, convert_element_type_219, mul_270, sigmoid_32
# aten.sub => sub_185
triton_poi_fused__native_batch_norm_legit_functional_add_clone_fill_mul_sigmoid_silu_sub_142 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[16777216], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp16', 6: '*fp16', 7: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7), equal_to_1=())]})
@triton.jit
def triton_poi_fused__native_batch_norm_legit_functional_add_clone_fill_mul_sigmoid_silu_sub_142(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, out_ptr1, out_ptr2, xnumel, XBLOCK : tl.constexpr):
xnumel = 15654912
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x3 = xindex
x1 = (xindex // 196) % 624
tmp0 = tl.load(in_ptr0 + (x3), None).to(tl.float32)
tmp2 = tl.load(in_ptr1 + (x1), None)
tmp4 = tl.load(in_ptr2 + (x1), None)
tmp11 = tl.load(in_ptr3 + (x1), None)
tmp13 = tl.load(in_ptr4 + (x1), None)
tmp1 = tmp0.to(tl.float32)
tmp3 = tmp1 - tmp2
tmp5 = 25088.0
tmp6 = tmp4 / tmp5
tmp7 = 1e-05
tmp8 = tmp6 + tmp7
tmp9 = tl.math.rsqrt(tmp8)
tmp10 = tmp3 * tmp9
tmp12 = tmp10 * tmp11
tmp14 = tmp12 + tmp13
tmp15 = tmp14.to(tl.float32)
tmp16 = tmp15.to(tl.float32)
tmp17 = tl.sigmoid(tmp16)
tmp18 = tmp16 * tmp17
tmp19 = tmp18.to(tl.float32)
tmp20 = tl.sigmoid(tmp15)
tmp21 = 1.0
tmp22 = tmp21 - tmp20
tmp23 = tmp15 * tmp22
tmp24 = tmp23 + tmp21
tmp25 = tmp20 * tmp24
tl.store(out_ptr1 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp19, None)
tl.store(out_ptr2 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp25, None)
def get_args():
arg_0 = rand_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((1, 624, 1, 1), (624, 1, 624, 624), device='cuda:0', dtype=torch.float32)
arg_2 = rand_strided((1, 624, 1, 1), (624, 1, 624, 624), device='cuda:0', dtype=torch.float32)
arg_3 = rand_strided((624,), (1,), device='cuda:0', dtype=torch.float32)
arg_4 = rand_strided((624,), (1,), device='cuda:0', dtype=torch.float32)
arg_5 = rand_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_6 = rand_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__native_batch_norm_legit_functional_add_clone_fill_mul_sigmoid_silu_sub_142.run(*args, 15654912, grid=grid(15654912), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__native_batch_norm_legit_functional_add_clone_fill_mul_sigmoid_silu_sub_142.benchmark_all_configs(*args, 15654912, grid=grid(15654912))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
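# Hand-written illustration (not Inductor output): besides the usual
# batchnorm + SiLU forward (out_ptr1), this kernel stores
# sigmoid(y) * (1 + y * (1 - sigmoid(y))) to out_ptr2, which is exactly
# d/dy silu(y), saved so the backward pass can reuse it. A minimal sketch:
def _reference_bn_silu_with_grad(x, mean, var_sum, weight, bias, eps=1e-5):
    inv_std = torch.rsqrt(var_sum / 25088.0 + eps)
    y = (x.float() - mean) * inv_std * weight.view(1, -1, 1, 1) + bias.view(1, -1, 1, 1)
    y = y.to(torch.float16)
    act = torch.nn.functional.silu(y.float()).to(torch.float16)  # out_ptr1
    s = torch.sigmoid(y)
    return act, s * (1 + y * (1 - s))                            # out_ptr2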
# kernel path: /tmp/torchinductor_shunting/by/cbyy7pyyeky2s6spdugmwrwqdvrjfdr7qwcxwapwzhywwhng5ujh.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_220
triton_poi_fused__to_copy_143 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[8192], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_143(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 5616
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((624, 1, 3, 3), (9, 9, 3, 1), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((624, 1, 3, 3), (9, 9, 3, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_143.run(*args, 5616, grid=grid(5616), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_143.benchmark_all_configs(*args, 5616, grid=grid(5616))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/ji/cjiljcgh2wc2yighvqz76tkwzti3zjjxpno5dng2ammsgvfhwvv7.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_226
triton_poi_fused__to_copy_144 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[32768], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_144(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 32448
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((52, 624, 1, 1), (624, 1, 1, 1), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((52, 624, 1, 1), (624, 1, 1, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_144.run(*args, 32448, grid=grid(32448), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_144.benchmark_all_configs(*args, 32448, grid=grid(32448))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/tv/ctvzn4xpxjd5g7oyd7nntvpnuyovmntivt2fyxf53c3mblh6dshr.py
# Original ATen: aten._to_copy, aten.convolution
# aten._to_copy => convert_element_type_225
# aten.convolution => convolution_86
triton_poi_fused__to_copy_convolution_145 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[64], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_convolution_145(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 52
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((52,), (1,), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((52,), (1,), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_convolution_145.run(*args, 52, grid=grid(52), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_convolution_145.benchmark_all_configs(*args, 52, grid=grid(52))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/sf/csfcbn5kzqvvv6krkr7772b7z66pr6xsuxyhtbgglpb26gkfc5h3.py
# Original ATen: aten._to_copy, aten.convolution, aten.silu
# aten._to_copy => convert_element_type_225
# aten.convolution => convolution_86
# aten.silu => convert_element_type_227, convert_element_type_228, mul_279, sigmoid_34
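# Conv epilogue: add the fp16 bias to the 1x1-conv output in place, then apply
# SiLU in fp32 before casting back to fp16. A minimal eager-mode sketch
# (variable names are illustrative, not from the original graph):
#   y = conv_out + bias.view(1, -1, 1, 1)              # written back in place
#   out = (y.float() * torch.sigmoid(y.float())).half()
# The (128, 52, 1, 1) shape suggests this is the squeeze-excite bottleneck.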
triton_poi_fused__to_copy_convolution_silu_146 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[8192], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_convolution_silu_146(in_out_ptr0, in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 6656
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x2 = xindex
x0 = xindex % 52
tmp0 = tl.load(in_out_ptr0 + (x2), xmask).to(tl.float32)
tmp1 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
tmp2 = tmp0 + tmp1
tmp3 = tmp2.to(tl.float32)
tmp4 = tl.sigmoid(tmp3)
tmp5 = tmp3 * tmp4
tmp6 = tmp5.to(tl.float32)
tl.store(in_out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp2, xmask)
tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp6, xmask)
def get_args():
arg_0 = rand_strided((128, 52, 1, 1), (52, 1, 1, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((52,), (1,), device='cuda:0', dtype=torch.float16)
arg_2 = rand_strided((128, 52, 1, 1), (52, 1, 1, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1, arg_2,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_convolution_silu_146.run(*args, 6656, grid=grid(6656), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_convolution_silu_146.benchmark_all_configs(*args, 6656, grid=grid(6656))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/ik/cikbdagjlpyedk3af4it4ary3yx6g4gpfifz25vxc7r25q224d43.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_230
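# Weight cast: (624, 52, 1, 1) fp32 -> fp16, the expand projection back from
# the 52-channel bottleneck (judging by the shapes).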
triton_poi_fused__to_copy_147 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[32768], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_147(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 32448
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((624, 52, 1, 1), (52, 1, 1, 1), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((624, 52, 1, 1), (52, 1, 1, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_147.run(*args, 32448, grid=grid(32448), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_147.benchmark_all_configs(*args, 32448, grid=grid(32448))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/xb/cxb5poqew2wpvurnixpesahtvwzfiadakefnqud2hzokvcgnemog.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_231
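# Another weight cast: (160, 624, 1, 1) fp32 -> fp16, presumably the
# projection conv that follows the squeeze-excite block.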
triton_poi_fused__to_copy_148 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[131072], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_148(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 99840
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((160, 624, 1, 1), (624, 1, 1, 1), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((160, 624, 1, 1), (624, 1, 1, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_148.run(*args, 99840, grid=grid(99840), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_148.benchmark_all_configs(*args, 99840, grid=grid(99840))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/z7/cz7tf27fdbr7sfmkjktkou2imnr2mnlt7hicdhvs4chclhrw6j37.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => convert_element_type_232, var_mean_35
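# Batch-norm statistics, mean pass, stage 1: the grid is 640 = 160 channels
# x 4 reduction splits, and each program sums 6272 of the 25088 (= 128*14*14)
# values for its channel into a (1, 160, 1, 1, 4) partial-sum buffer. Together
# with stage 2 this computes `x.float().sum(dim=(0, 2, 3))` for the
# (128, 160, 14, 14) fp16 input.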
triton_red_fused__native_batch_norm_legit_functional_149 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@reduction(
size_hints=[1024, 8192],
reduction_hint=ReductionHint.INNER,
filename=__file__,
meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32', 3: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_149(in_ptr0, out_ptr0, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
xnumel = 640
rnumel = 6272
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
xmask = xindex < xnumel
rbase = tl.arange(0, RBLOCK)[None, :]
x0 = xindex % 160
x1 = (xindex // 160)
_tmp2 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
x3 = xindex
for roffset in range(0, rnumel, RBLOCK):
rindex = roffset + rbase
rmask = rindex < rnumel
r2 = rindex
tmp0 = tl.load(in_ptr0 + ((196*x0) + (31360*(r2 // 196)) + (1003520*x1) + (r2 % 196)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
_tmp2 = tl.where(rmask & xmask, _tmp2 + tmp1, _tmp2)
tmp2 = tl.sum(_tmp2, 1)[:, None]
tl.store(out_ptr0 + x3, tmp2, xmask)
def get_args():
arg_0 = rand_strided((128, 160, 14, 14), (31360, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((1, 160, 1, 1, 4), (640, 1, 640, 640, 160), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_red_fused__native_batch_norm_legit_functional_149.run(*args, 640, 6272, grid=grid(640), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_red_fused__native_batch_norm_legit_functional_149.benchmark_all_configs(*args, 640, 6272, grid=grid(640))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/hi/chiuub4omrsfimb4p75xk4vlbwv5mhljjinvhtfc3kqhmfnsyq4q.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => add_185, convert_element_type_232, mul_282, mul_283, var_mean_35
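# Mean pass, stage 2: reduce the 4 partial sums per channel, divide by 25088
# to get the batch mean, and fold it into the running mean with the usual
# momentum update `running = 0.1 * batch_mean + 0.9 * running`.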
triton_per_fused__native_batch_norm_legit_functional_150 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import persistent_reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@persistent_reduction(
size_hints=[256, 4],
reduction_hint=ReductionHint.INNER,
filename=__file__,
meta={'signature': {0: '*fp32', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: 'i32', 5: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4), equal_to_1=())]}
)
@triton.jit
def triton_per_fused__native_batch_norm_legit_functional_150(in_out_ptr0, in_ptr0, in_ptr1, out_ptr0, xnumel, rnumel, XBLOCK : tl.constexpr):
xnumel = 160
rnumel = 4
RBLOCK: tl.constexpr = 4
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
xmask = xindex < xnumel
rindex = tl.arange(0, RBLOCK)[None, :]
rmask = rindex < rnumel
r1 = rindex
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0 + (160*r1)), rmask & xmask, other=0)
tmp8 = tl.load(in_ptr1 + (x0), xmask)
tmp2 = tl.where(rmask & xmask, tmp0, 0)
tmp3 = tl.sum(tmp2, 1)[:, None]
tmp4 = 25088.0
tmp5 = tmp3 / tmp4
tmp6 = 0.1
tmp7 = tmp5 * tmp6
tmp9 = 0.9
tmp10 = tmp8 * tmp9
tmp11 = tmp7 + tmp10
tl.store(in_out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp5, xmask)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp11, xmask)
def get_args():
arg_0 = rand_strided((1, 160, 1, 1), (160, 1, 160, 160), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((1, 160, 1, 1, 4), (640, 1, 640, 640, 160), device='cuda:0', dtype=torch.float32)
arg_2 = rand_strided((160,), (1,), device='cuda:0', dtype=torch.float32)
arg_3 = rand_strided((160,), (1,), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1, arg_2, arg_3,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_per_fused__native_batch_norm_legit_functional_150.run(*args, 160, 4, grid=grid(160), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_per_fused__native_batch_norm_legit_functional_150.benchmark_all_configs(*args, 160, 4, grid=grid(160))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/4b/c4bvtzglf3eflssbrfazd6mjdgmrhoujx4xq6fve2tfmkwyeseru.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => convert_element_type_232, var_mean_35
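# Variance pass, stage 1 (a second sweep over the data): each program
# accumulates sum((x - mean)^2) for its (channel, split) pair, again into a
# (1, 160, 1, 1, 4) partial buffer.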
triton_red_fused__native_batch_norm_legit_functional_151 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@reduction(
size_hints=[1024, 8192],
reduction_hint=ReductionHint.INNER,
filename=__file__,
meta={'signature': {0: '*fp16', 1: '*fp32', 2: '*fp32', 3: 'i32', 4: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_151(in_ptr0, in_ptr1, out_ptr0, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
xnumel = 640
rnumel = 6272
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
xmask = xindex < xnumel
rbase = tl.arange(0, RBLOCK)[None, :]
x0 = xindex % 160
x1 = (xindex // 160)
tmp2 = tl.load(in_ptr1 + (x0), xmask)
_tmp5 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
x3 = xindex
for roffset in range(0, rnumel, RBLOCK):
rindex = roffset + rbase
rmask = rindex < rnumel
r2 = rindex
tmp0 = tl.load(in_ptr0 + ((196*x0) + (31360*(r2 // 196)) + (1003520*x1) + (r2 % 196)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
tmp3 = tmp1 - tmp2
tmp4 = tmp3 * tmp3
_tmp5 = tl.where(rmask & xmask, _tmp5 + tmp4, _tmp5)
tmp5 = tl.sum(_tmp5, 1)[:, None]
tl.store(out_ptr0 + x3, tmp5, xmask)
def get_args():
arg_0 = rand_strided((128, 160, 14, 14), (31360, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((1, 160, 1, 1), (160, 1, 160, 160), device='cuda:0', dtype=torch.float32)
arg_2 = rand_strided((1, 160, 1, 1, 4), (640, 1, 640, 640, 160), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1, arg_2,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_red_fused__native_batch_norm_legit_functional_151.run(*args, 640, 6272, grid=grid(640), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_red_fused__native_batch_norm_legit_functional_151.benchmark_all_configs(*args, 640, 6272, grid=grid(640))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/vo/cvo4hmhbgzgrpmknfykmtsywf2s2dd4n7xkbf6ll6hxoj5qx3zny.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => add_184, add_186, convert_element_type_232, mul_284, mul_285, mul_286, rsqrt_35, squeeze_106, var_mean_35
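# Variance pass, stage 2: combine the partials into the biased variance,
# emit rsqrt(var + 1e-05) for normalization, and update the running variance
# with the unbiased estimate (1.0000398612827361 is 25088/25087).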
triton_per_fused__native_batch_norm_legit_functional_152 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import persistent_reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@persistent_reduction(
size_hints=[256, 4],
reduction_hint=ReductionHint.INNER,
filename=__file__,
meta={'signature': {0: '*fp32', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: 'i32', 6: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5), equal_to_1=())]}
)
@triton.jit
def triton_per_fused__native_batch_norm_legit_functional_152(in_ptr0, in_ptr1, out_ptr0, out_ptr1, out_ptr2, xnumel, rnumel, XBLOCK : tl.constexpr):
xnumel = 160
rnumel = 4
RBLOCK: tl.constexpr = 4
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
xmask = xindex < xnumel
rindex = tl.arange(0, RBLOCK)[None, :]
rmask = rindex < rnumel
r1 = rindex
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0 + (160*r1)), rmask & xmask, other=0)
tmp13 = tl.load(in_ptr1 + (x0), xmask)
tmp2 = tl.where(rmask & xmask, tmp0, 0)
tmp3 = tl.sum(tmp2, 1)[:, None]
tmp4 = 25088.0
tmp5 = tmp3 / tmp4
tmp6 = 1e-05
tmp7 = tmp5 + tmp6
tmp8 = tl.math.rsqrt(tmp7)
tmp9 = 1.0000398612827361
tmp10 = tmp5 * tmp9
tmp11 = 0.1
tmp12 = tmp10 * tmp11
tmp14 = 0.9
tmp15 = tmp13 * tmp14
tmp16 = tmp12 + tmp15
tl.store(out_ptr1 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp8, xmask)
tl.store(out_ptr2 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp16, xmask)
tl.store(out_ptr0 + x0, tmp3, xmask)
def get_args():
arg_0 = rand_strided((1, 160, 1, 1, 4), (640, 1, 640, 640, 160), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((160,), (1,), device='cuda:0', dtype=torch.float32)
arg_2 = rand_strided((1, 160, 1, 1), (160, 1, 160, 160), device='cuda:0', dtype=torch.float32)
arg_3 = rand_strided((160,), (1,), device='cuda:0', dtype=torch.float32)
arg_4 = rand_strided((160,), (1,), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1, arg_2, arg_3, arg_4,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_per_fused__native_batch_norm_legit_functional_152.run(*args, 160, 4, grid=grid(160), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_per_fused__native_batch_norm_legit_functional_152.benchmark_all_configs(*args, 160, 4, grid=grid(160))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/7r/c7rznphkbqtbupvosh2bugs6qz6cx4gcbzlpwzvb44okdkqoc7am.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => add_184, add_187, convert_element_type_232, convert_element_type_233, mul_281, mul_287, rsqrt_35, sub_35, var_mean_35
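# With the statistics computed, this kernel applies the affine batch norm
# elementwise over all 4,014,080 values, reading the per-channel sum of
# squared deviations (in_ptr2) and recomputing rsqrt inline. Eager sketch:
#   out = ((x.float() - mean) * torch.rsqrt(var + 1e-5) * weight + bias).half()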
triton_poi_fused__native_batch_norm_legit_functional_153 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp16', 6: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6), equal_to_1=())]})
@triton.jit
def triton_poi_fused__native_batch_norm_legit_functional_153(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 4014080
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x3 = xindex
x1 = (xindex // 196) % 160
tmp0 = tl.load(in_ptr0 + (x3), None).to(tl.float32)
tmp2 = tl.load(in_ptr1 + (x1), None)
tmp4 = tl.load(in_ptr2 + (x1), None)
tmp11 = tl.load(in_ptr3 + (x1), None)
tmp13 = tl.load(in_ptr4 + (x1), None)
tmp1 = tmp0.to(tl.float32)
tmp3 = tmp1 - tmp2
tmp5 = 25088.0
tmp6 = tmp4 / tmp5
tmp7 = 1e-05
tmp8 = tmp6 + tmp7
tmp9 = tl.math.rsqrt(tmp8)
tmp10 = tmp3 * tmp9
tmp12 = tmp10 * tmp11
tmp14 = tmp12 + tmp13
tmp15 = tmp14.to(tl.float32)
tl.store(out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp15, None)
def get_args():
arg_0 = rand_strided((128, 160, 14, 14), (31360, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((1, 160, 1, 1), (160, 1, 160, 160), device='cuda:0', dtype=torch.float32)
arg_2 = rand_strided((1, 160, 1, 1), (160, 1, 160, 160), device='cuda:0', dtype=torch.float32)
arg_3 = rand_strided((160,), (1,), device='cuda:0', dtype=torch.float32)
arg_4 = rand_strided((160,), (1,), device='cuda:0', dtype=torch.float32)
arg_5 = rand_strided((128, 160, 14, 14), (31360, 196, 14, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__native_batch_norm_legit_functional_153.run(*args, 4014080, grid=grid(4014080), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__native_batch_norm_legit_functional_153.benchmark_all_configs(*args, 4014080, grid=grid(4014080))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/25/c25hctynyjlhient6ceczvttfs4npfgnfoty5zhk6pn4ckhqm3ka.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_234
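# Weight cast for the next block: (240, 80, 1, 1) fp32 -> fp16; the 80 -> 240
# channel jump suggests an inverted-residual expand conv.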
triton_poi_fused__to_copy_154 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[32768], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_154(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 19200
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((240, 80, 1, 1), (80, 1, 1, 1), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((240, 80, 1, 1), (80, 1, 1, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_154.run(*args, 19200, grid=grid(19200), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_154.benchmark_all_configs(*args, 19200, grid=grid(19200))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/yz/cyzjinfa2nwvymmtkqoovwkfypkotg6ek2sunwsuf3vfd35r7l3k.py
# Original ATen: aten.cat
# aten.cat => cat_25
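# The concat is implemented by copying each branch into a channel slice of
# the wider output: the 240-channel input is written into a
# (128, 480, 14, 14) buffer whose per-image stride is 94080 = 480*196, i.e.
# the first half of the channels. The other half is presumably filled by a
# sibling kernel writing at an offset base pointer.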
triton_poi_fused_cat_155 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[8388608], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_cat_155(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 6021120
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x2 = xindex
x0 = xindex % 47040
x1 = (xindex // 47040)
tmp0 = tl.load(in_ptr0 + (x2), None).to(tl.float32)
tl.store(out_ptr0 + (x0 + (94080*x1) + tl.zeros([XBLOCK], tl.int32)), tmp0, None)
def get_args():
arg_0 = rand_strided((128, 240, 14, 14), (47040, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 240, 14, 14), (94080, 196, 14, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_cat_155.run(*args, 6021120, grid=grid(6021120), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_cat_155.benchmark_all_configs(*args, 6021120, grid=grid(6021120))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/ks/cksikkrsggj4vc4g42rphhvyb3ox3tanqawsqpy2rpo5qbqc3tin.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => add_189, add_190, add_191, convert_element_type_236, mul_289, mul_290, mul_291, mul_292, mul_293, rsqrt_36, squeeze_109, var_mean_36
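# Unlike the split 4-kernel BN above, this 480-channel BN does everything in
# one kernel: the first loop over the 25088 reduction elements produces the
# mean (plus the running-mean update), the second loop produces the sum of
# squared deviations, and the epilogue emits rsqrt(var + eps) and the
# running-variance update.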
triton_red_fused__native_batch_norm_legit_functional_156 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@reduction(
size_hints=[512, 32768],
reduction_hint=ReductionHint.INNER,
filename=__file__,
meta={'signature': {0: '*fp32', 1: '*fp16', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: 'i32', 9: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_156(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, out_ptr0, out_ptr1, out_ptr2, out_ptr3, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
xnumel = 480
rnumel = 25088
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
xmask = xindex < xnumel
rbase = tl.arange(0, RBLOCK)[None, :]
x0 = xindex
_tmp2 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
for roffset in range(0, rnumel, RBLOCK):
rindex = roffset + rbase
rmask = rindex < rnumel
r1 = rindex % 196
r2 = (rindex // 196)
tmp0 = tl.load(in_ptr0 + (r1 + (196*x0) + (94080*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
_tmp2 = tl.where(rmask & xmask, _tmp2 + tmp1, _tmp2)
tmp2 = tl.sum(_tmp2, 1)[:, None]
tmp7 = tl.load(in_ptr1 + (x0), xmask)
tmp3 = 25088.0
tmp4 = tmp2 / tmp3
tmp5 = 0.1
tmp6 = tmp4 * tmp5
tmp8 = 0.9
tmp9 = tmp7 * tmp8
tmp10 = tmp6 + tmp9
tl.store(in_out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp4, xmask)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp10, xmask)
_tmp15 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
for roffset in range(0, rnumel, RBLOCK):
rindex = roffset + rbase
rmask = rindex < rnumel
r1 = rindex % 196
r2 = (rindex // 196)
tmp11 = tl.load(in_ptr0 + (r1 + (196*x0) + (94080*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp12 = tmp11.to(tl.float32)
tmp13 = tmp12 - tmp4
tmp14 = tmp13 * tmp13
_tmp15 = tl.where(rmask & xmask, _tmp15 + tmp14, _tmp15)
tmp15 = tl.sum(_tmp15, 1)[:, None]
tl.store(out_ptr1 + x0, tmp15, xmask)
tmp25 = tl.load(in_ptr2 + (x0), xmask)
tmp16 = 25088.0
tmp17 = tmp15 / tmp16
tmp18 = 1e-05
tmp19 = tmp17 + tmp18
tmp20 = tl.math.rsqrt(tmp19)
tmp21 = 1.0000398612827361
tmp22 = tmp17 * tmp21
tmp23 = 0.1
tmp24 = tmp22 * tmp23
tmp26 = 0.9
tmp27 = tmp25 * tmp26
tmp28 = tmp24 + tmp27
tl.store(out_ptr2 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp20, xmask)
tl.store(out_ptr3 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp28, xmask)
def get_args():
arg_0 = rand_strided((1, 480, 1, 1), (480, 1, 480, 480), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((128, 480, 14, 14), (94080, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_2 = rand_strided((480,), (1,), device='cuda:0', dtype=torch.float32)
arg_3 = rand_strided((480,), (1,), device='cuda:0', dtype=torch.float32)
arg_4 = rand_strided((480,), (1,), device='cuda:0', dtype=torch.float32)
arg_5 = rand_strided((1, 480, 1, 1), (480, 1, 480, 480), device='cuda:0', dtype=torch.float32)
arg_6 = rand_strided((480,), (1,), device='cuda:0', dtype=torch.float32)
arg_7 = rand_strided((480,), (1,), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_red_fused__native_batch_norm_legit_functional_156.run(*args, 480, 25088, grid=grid(480), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_red_fused__native_batch_norm_legit_functional_156.benchmark_all_configs(*args, 480, 25088, grid=grid(480))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/uq/cuq6a4x4agmkmfpm3znliwswlpgc26f3bnjc5ylnl5l6aeuugh6f.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.add, aten.clone, aten.fill, aten.mul, aten.sigmoid, aten.sub
# aten._native_batch_norm_legit_functional => add_189, add_192, convert_element_type_236, convert_element_type_237, mul_288, mul_294, rsqrt_36, sub_36, var_mean_36
# aten.add => add_336
# aten.clone => clone_27
# aten.fill => full_like_20
# aten.mul => mul_747, mul_748
# aten.sigmoid => sigmoid_84
# aten.sub => sub_169
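# This kernel fuses the BN application with the SiLU *derivative*: alongside
# the normalized activation x (out_ptr0) it stores
#   sigmoid(x) * (1 + x * (1 - sigmoid(x)))
# (out_ptr1), which is exactly d/dx[x * sigmoid(x)]. The fill/sub/mul/add ops
# in the ATen list above are that recomputed gradient factor, presumably
# saved for the backward pass.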
triton_poi_fused__native_batch_norm_legit_functional_add_clone_fill_mul_sigmoid_sub_157 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[16777216], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp16', 6: '*fp16', 7: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7), equal_to_1=())]})
@triton.jit
def triton_poi_fused__native_batch_norm_legit_functional_add_clone_fill_mul_sigmoid_sub_157(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, out_ptr0, out_ptr1, xnumel, XBLOCK : tl.constexpr):
xnumel = 12042240
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x3 = xindex
x1 = (xindex // 196) % 480
tmp0 = tl.load(in_ptr0 + (x3), None).to(tl.float32)
tmp2 = tl.load(in_ptr1 + (x1), None)
tmp4 = tl.load(in_ptr2 + (x1), None)
tmp11 = tl.load(in_ptr3 + (x1), None)
tmp13 = tl.load(in_ptr4 + (x1), None)
tmp1 = tmp0.to(tl.float32)
tmp3 = tmp1 - tmp2
tmp5 = 25088.0
tmp6 = tmp4 / tmp5
tmp7 = 1e-05
tmp8 = tmp6 + tmp7
tmp9 = tl.math.rsqrt(tmp8)
tmp10 = tmp3 * tmp9
tmp12 = tmp10 * tmp11
tmp14 = tmp12 + tmp13
tmp15 = tmp14.to(tl.float32)
tmp16 = tl.sigmoid(tmp15)
tmp17 = 1.0
tmp18 = tmp17 - tmp16
tmp19 = tmp15 * tmp18
tmp20 = tmp19 + tmp17
tmp21 = tmp16 * tmp20
tl.store(out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp15, None)
tl.store(out_ptr1 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp21, None)
def get_args():
arg_0 = rand_strided((128, 480, 14, 14), (94080, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((1, 480, 1, 1), (480, 1, 480, 480), device='cuda:0', dtype=torch.float32)
arg_2 = rand_strided((1, 480, 1, 1), (480, 1, 480, 480), device='cuda:0', dtype=torch.float32)
arg_3 = rand_strided((480,), (1,), device='cuda:0', dtype=torch.float32)
arg_4 = rand_strided((480,), (1,), device='cuda:0', dtype=torch.float32)
arg_5 = rand_strided((128, 480, 14, 14), (94080, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_6 = rand_strided((128, 480, 14, 14), (94080, 196, 14, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__native_batch_norm_legit_functional_add_clone_fill_mul_sigmoid_sub_157.run(*args, 12042240, grid=grid(12042240), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__native_batch_norm_legit_functional_add_clone_fill_mul_sigmoid_sub_157.benchmark_all_configs(*args, 12042240, grid=grid(12042240))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/hb/chbvxlmers2jwf43nczkvjub2nxsnhvu5csbgr2u6jwg4rlcdpzl.py
# Original ATen: aten.split_with_sizes
# aten.split_with_sizes => getitem_242
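# The 480 channels are split four ways, apparently for mixed depthwise convs
# of different kernel sizes (5x5, 7x7 and 9x9 weights are cast below). This
# kernel takes channels [0, 120) of the 94080-stride layout and applies SiLU
# in the same pass. Eager sketch:
#   out = torch.nn.functional.silu(x[:, 0:120].float()).half()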
triton_poi_fused_split_with_sizes_158 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_split_with_sizes_158(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 3010560
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex % 23520
x1 = (xindex // 23520)
x2 = xindex
tmp0 = tl.load(in_ptr0 + (x0 + (94080*x1)), None).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
tmp2 = tl.sigmoid(tmp1)
tmp3 = tmp1 * tmp2
tmp4 = tmp3.to(tl.float32)
tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp4, None)
def get_args():
arg_0 = rand_strided((128, 480, 14, 14), (94080, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 120, 14, 14), (23520, 196, 14, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_split_with_sizes_158.run(*args, 3010560, grid=grid(3010560), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_split_with_sizes_158.benchmark_all_configs(*args, 3010560, grid=grid(3010560))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/cu/ccuyxkbjwe7ff4mqowk3q7to3cbjg3o7g2sh6doygm5y2bnlvlfi.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_241
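# Depthwise weight cast: (120, 1, 5, 5) fp32 -> fp16 for the first mixed-conv
# branch.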
triton_poi_fused__to_copy_159 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4096], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_159(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 3000
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((120, 1, 5, 5), (25, 25, 5, 1), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((120, 1, 5, 5), (25, 25, 5, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_159.run(*args, 3000, grid=grid(3000), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_159.benchmark_all_configs(*args, 3000, grid=grid(3000))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/dg/cdglxs23pt7h2wls62uy6ewinjtjctcradtpw76t3qcrvnhso4kr.py
# Original ATen: aten.split_with_sizes
# aten.split_with_sizes => getitem_247
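# Same split + SiLU as kernel 158, but for channels [120, 240): note the
# 23520 = 120*196 base offset in the load.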
triton_poi_fused_split_with_sizes_160 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_split_with_sizes_160(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 3010560
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex % 23520
x1 = (xindex // 23520)
x2 = xindex
tmp0 = tl.load(in_ptr0 + (23520 + x0 + (94080*x1)), None).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
tmp2 = tl.sigmoid(tmp1)
tmp3 = tmp1 * tmp2
tmp4 = tmp3.to(tl.float32)
tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp4, None)
def get_args():
arg_0 = rand_strided((128, 480, 14, 14), (94080, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 120, 14, 14), (23520, 196, 14, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_split_with_sizes_160.run(*args, 3010560, grid=grid(3010560), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_split_with_sizes_160.benchmark_all_configs(*args, 3010560, grid=grid(3010560))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/v4/cv4uf7ojdb6edtvnqbvbxjvi624yvygrldwzdl6uy2gdzz3sghqp.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_242
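# Depthwise weight cast: (120, 1, 7, 7) fp32 -> fp16.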
triton_poi_fused__to_copy_161 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[8192], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_161(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 5880
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((120, 1, 7, 7), (49, 49, 7, 1), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((120, 1, 7, 7), (49, 49, 7, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_161.run(*args, 5880, grid=grid(5880), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_161.benchmark_all_configs(*args, 5880, grid=grid(5880))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/bn/cbnnjypiavc2wmccrce2li6mllqpp2n4naaetozpnboypgpizsqn.py
# Original ATen: aten.split_with_sizes
# aten.split_with_sizes => getitem_252
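# Split + SiLU for channels [240, 360) (base offset 47040 = 240*196).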
triton_poi_fused_split_with_sizes_162 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_split_with_sizes_162(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 3010560
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex % 23520
x1 = (xindex // 23520)
x2 = xindex
tmp0 = tl.load(in_ptr0 + (47040 + x0 + (94080*x1)), None).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
tmp2 = tl.sigmoid(tmp1)
tmp3 = tmp1 * tmp2
tmp4 = tmp3.to(tl.float32)
tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp4, None)
def get_args():
arg_0 = rand_strided((128, 480, 14, 14), (94080, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 120, 14, 14), (23520, 196, 14, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_split_with_sizes_162.run(*args, 3010560, grid=grid(3010560), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_split_with_sizes_162.benchmark_all_configs(*args, 3010560, grid=grid(3010560))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/4m/c4mgwvuur32h35cj53cazh6flbatygb5baks32qmyo4zqc7drw5k.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_243
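# Depthwise weight cast: (120, 1, 9, 9) fp32 -> fp16.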
triton_poi_fused__to_copy_163 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[16384], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_163(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 9720
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((120, 1, 9, 9), (81, 81, 9, 1), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((120, 1, 9, 9), (81, 81, 9, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_163.run(*args, 9720, grid=grid(9720), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_163.benchmark_all_configs(*args, 9720, grid=grid(9720))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/4m/c4mepa4uirogjqorspxpl4t5fx2rzmjdzatpwzpg64qrcyuvg7m4.py
# Original ATen: aten.split_with_sizes
# aten.split_with_sizes => getitem_257
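# Split + SiLU for the last slice, channels [360, 480) (base offset
# 70560 = 360*196).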
triton_poi_fused_split_with_sizes_164 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_split_with_sizes_164(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 3010560
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex % 23520
x1 = (xindex // 23520)
x2 = xindex
tmp0 = tl.load(in_ptr0 + (70560 + x0 + (94080*x1)), None).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
tmp2 = tl.sigmoid(tmp1)
tmp3 = tmp1 * tmp2
tmp4 = tmp3.to(tl.float32)
tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp4, None)
def get_args():
arg_0 = rand_strided((128, 480, 14, 14), (94080, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 120, 14, 14), (23520, 196, 14, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_split_with_sizes_164.run(*args, 3010560, grid=grid(3010560), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_split_with_sizes_164.benchmark_all_configs(*args, 3010560, grid=grid(3010560))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/qv/cqvo24surldsdqwccrhisp4chcnkg2otiq5qykrtymzijwmytnb7.py
# Original ATen: aten.cat
# aten.cat => cat_26
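# Concat of the depthwise-conv outputs: each 120-channel branch is copied
# into its slice of the shared 480-channel buffer (the x1*94080 stride leaves
# room for the other three branches); the wrapper presumably passes a base
# pointer offset per branch.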
triton_poi_fused_cat_165 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_cat_165(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 3010560
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x2 = xindex
x0 = xindex % 23520
x1 = (xindex // 23520)
tmp0 = tl.load(in_ptr0 + (x2), None).to(tl.float32)
tl.store(out_ptr0 + (x0 + (94080*x1) + tl.zeros([XBLOCK], tl.int32)), tmp0, None)
def get_args():
arg_0 = rand_strided((128, 120, 14, 14), (23520, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 120, 14, 14), (94080, 196, 14, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_cat_165.run(*args, 3010560, grid=grid(3010560), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_cat_165.benchmark_all_configs(*args, 3010560, grid=grid(3010560))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/25/c25wuycbavajx7kwcvwpcba6kxp6j5oofnjkbplrrxckdmxy3shz.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.mean, aten.silu
# aten._native_batch_norm_legit_functional => add_194, add_197, convert_element_type_244, convert_element_type_245, mul_296, mul_302, rsqrt_37, sub_37, var_mean_37
# aten.mean => mean_9
# aten.silu => convert_element_type_246, convert_element_type_247, mul_303, sigmoid_37
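# A three-way fusion: apply the 480-channel batch norm, run SiLU on the
# result, and reduce the 14x14 spatial grid to a per-(image, channel) mean,
# i.e. the global average pool that presumably feeds the squeeze-excite
# branch. Roughly:
#   y = bn(x); pooled = torch.nn.functional.silu(y).mean(dim=(2, 3), keepdim=True)
# The normalized (pre-SiLU) tensor goes to out_ptr0 and the pooled fp16
# means to out_ptr2.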
triton_per_fused__native_batch_norm_legit_functional_mean_silu_166 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import persistent_reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@persistent_reduction(
size_hints=[65536, 256],
reduction_hint=ReductionHint.INNER,
filename=__file__,
meta={'signature': {0: '*fp16', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp16', 6: '*fp16', 7: 'i32', 8: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7), equal_to_1=())]}
)
@triton.jit
def triton_per_fused__native_batch_norm_legit_functional_mean_silu_166(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, out_ptr0, out_ptr2, xnumel, rnumel, XBLOCK : tl.constexpr):
xnumel = 61440
rnumel = 196
RBLOCK: tl.constexpr = 256
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
xmask = xindex < xnumel
rindex = tl.arange(0, RBLOCK)[None, :]
rmask = rindex < rnumel
r2 = rindex
x3 = xindex
x0 = xindex % 480
tmp0 = tl.load(in_ptr0 + (r2 + (196*x3)), rmask, other=0).to(tl.float32)
tmp2 = tl.load(in_ptr1 + (x0), None)
tmp4 = tl.load(in_ptr2 + (x0), None)
tmp11 = tl.load(in_ptr3 + (x0), None)
tmp13 = tl.load(in_ptr4 + (x0), None)
tmp1 = tmp0.to(tl.float32)
tmp3 = tmp1 - tmp2
tmp5 = 25088.0
tmp6 = tmp4 / tmp5
tmp7 = 1e-05
tmp8 = tmp6 + tmp7
tmp9 = tl.math.rsqrt(tmp8)
tmp10 = tmp3 * tmp9
tmp12 = tmp10 * tmp11
tmp14 = tmp12 + tmp13
tmp15 = tmp14.to(tl.float32)
tmp16 = tmp15.to(tl.float32)
tmp17 = tl.sigmoid(tmp16)
tmp18 = tmp16 * tmp17
tmp19 = tmp18.to(tl.float32)
tmp20 = tmp19.to(tl.float32)
tmp22 = tl.where(rmask, tmp20, 0)
tmp23 = tl.sum(tmp22, 1)[:, None]
tmp24 = 196.0
tmp25 = tmp23 / tmp24
tmp26 = tmp25.to(tl.float32)
tl.store(out_ptr0 + (r2 + (196*x3) + tl.zeros([XBLOCK, RBLOCK], tl.int32)), tmp15, rmask)
tl.store(out_ptr2 + (x3 + tl.zeros([XBLOCK, 1], tl.int32)), tmp26, None)
def get_args():
arg_0 = rand_strided((128, 480, 14, 14), (94080, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((1, 480, 1, 1), (480, 1, 480, 480), device='cuda:0', dtype=torch.float32)
arg_2 = rand_strided((1, 480, 1, 1), (480, 1, 480, 480), device='cuda:0', dtype=torch.float32)
arg_3 = rand_strided((480,), (1,), device='cuda:0', dtype=torch.float32)
arg_4 = rand_strided((480,), (1,), device='cuda:0', dtype=torch.float32)
arg_5 = rand_strided((128, 480, 14, 14), (94080, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_6 = rand_strided((128, 480, 1, 1), (480, 1, 1, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_per_fused__native_batch_norm_legit_functional_mean_silu_166.run(*args, 61440, 196, grid=grid(61440), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_per_fused__native_batch_norm_legit_functional_mean_silu_166.benchmark_all_configs(*args, 61440, 196, grid=grid(61440))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
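# What the persistent reduction above computes, per (sample, channel) row of
# 196 spatial elements: batch-norm normalization with precomputed per-channel
# mean and variance sums (25088 = 128*14*14 elements per channel), then SiLU,
# then a spatial average, which feeds the global pool of what appears to be a
# squeeze-and-excitation block. A minimal eager-mode sketch, assuming the
# get_args shapes (the helper name is illustrative):
def _sketch_bn_silu_mean_166(x, mean, var_sum, weight, bias):
    import torch
    y = (x.float() - mean) * torch.rsqrt(var_sum / 25088.0 + 1e-5)
    y = (y * weight.view(1, -1, 1, 1) + bias.view(1, -1, 1, 1)).half()
    pooled = torch.nn.functional.silu(y.float()).mean(dim=(2, 3), keepdim=True).half()
    return y, pooled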
# kernel path: /tmp/torchinductor_shunting/vh/cvhn4ymz4o7zw5xdmhfhtnvjkm7lfirplpypnuvceox4nfhcy7wo.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_249
triton_poi_fused__to_copy_167 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[65536], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_167(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 38400
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((80, 480, 1, 1), (480, 1, 1, 1), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((80, 480, 1, 1), (480, 1, 1, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_167.run(*args, 38400, grid=grid(38400), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_167.benchmark_all_configs(*args, 38400, grid=grid(38400))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
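# triton_poi_fused__to_copy_167 is a pure dtype cast: it materializes an fp16
# copy of the fp32 (80, 480, 1, 1) pointwise-conv weight so the following
# convolution can run in half precision. Roughly equivalent eager-mode sketch
# (helper name illustrative):
def _sketch_to_copy_167(w_fp32):
    # Same values and strides, fp32 to fp16.
    return w_fp32.half()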
# kernel path: /tmp/torchinductor_shunting/uc/cuc5yc2wu6mlfc426p45ysrpjdvcf67n4fnoghr4zijaa4qc4jub.py
# Original ATen: aten._to_copy, aten.convolution
# aten._to_copy => convert_element_type_248
# aten.convolution => convolution_95
triton_poi_fused__to_copy_convolution_168 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[128], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_convolution_168(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 80
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((80,), (1,), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((80,), (1,), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_convolution_168.run(*args, 80, grid=grid(80), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_convolution_168.benchmark_all_configs(*args, 80, grid=grid(80))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/w4/cw4m6y7egnkzpotuw7yanoiegenydq7w76ehn3uqkz73dfsxrsvq.py
# Original ATen: aten._to_copy, aten.convolution, aten.silu
# aten._to_copy => convert_element_type_248
# aten.convolution => convolution_95
# aten.silu => convert_element_type_250, convert_element_type_251, mul_304, sigmoid_38
triton_poi_fused__to_copy_convolution_silu_169 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[16384], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_convolution_silu_169(in_out_ptr0, in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 10240
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x2 = xindex
x0 = xindex % 80
tmp0 = tl.load(in_out_ptr0 + (x2), None).to(tl.float32)
tmp1 = tl.load(in_ptr0 + (x0), None).to(tl.float32)
tmp2 = tmp0 + tmp1
tmp3 = tmp2.to(tl.float32)
tmp4 = tl.sigmoid(tmp3)
tmp5 = tmp3 * tmp4
tmp6 = tmp5.to(tl.float32)
tl.store(in_out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp2, None)
tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp6, None)
def get_args():
arg_0 = rand_strided((128, 80, 1, 1), (80, 1, 1, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((80,), (1,), device='cuda:0', dtype=torch.float16)
arg_2 = rand_strided((128, 80, 1, 1), (80, 1, 1, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1, arg_2,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_convolution_silu_169.run(*args, 10240, grid=grid(10240), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_convolution_silu_169.benchmark_all_configs(*args, 10240, grid=grid(10240))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
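# The kernel above fuses the bias add of convolution_95 with its SiLU
# activation on the (128, 80, 1, 1) bottleneck: the biased pre-activation is
# written back in place (in_out_ptr0) and the activated value goes to
# out_ptr0. A minimal eager-mode sketch (helper name illustrative):
def _sketch_bias_silu_169(conv_out, bias):
    import torch
    conv_out += bias.view(1, -1, 1, 1)   # in-place, mirrors in_out_ptr0
    act = torch.nn.functional.silu(conv_out.float()).half()
    return conv_out, act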
# kernel path: /tmp/torchinductor_shunting/5w/c5wu2byqoqpdpilokyxdylwqvzl3vlutvi3qed6ouxeny7v3ma4k.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_253
triton_poi_fused__to_copy_170 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[65536], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_170(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 38400
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((480, 80, 1, 1), (80, 1, 1, 1), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((480, 80, 1, 1), (80, 1, 1, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_170.run(*args, 38400, grid=grid(38400), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_170.benchmark_all_configs(*args, 38400, grid=grid(38400))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/t6/ct65ncuugfm3kp7u2z2xcftkozxdb7egjj3feed7rrbxlnc25mxn.py
# Original ATen: aten._to_copy, aten.convolution
# aten._to_copy => convert_element_type_252
# aten.convolution => convolution_96
triton_poi_fused__to_copy_convolution_171 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[512], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_convolution_171(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 480
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((480,), (1,), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((480,), (1,), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_convolution_171.run(*args, 480, grid=grid(480), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_convolution_171.benchmark_all_configs(*args, 480, grid=grid(480))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/q3/cq36nnpdlb4xwdvwsiktoah4r62ugwqlfeikgt4twhaobbsdqnku.py
# Original ATen: aten._to_copy, aten.convolution
# aten._to_copy => convert_element_type_252
# aten.convolution => convolution_96
triton_poi_fused__to_copy_convolution_172 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[65536], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_convolution_172(in_out_ptr0, in_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 61440
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x2 = xindex
x0 = xindex % 480
tmp0 = tl.load(in_out_ptr0 + (x2), None).to(tl.float32)
tmp1 = tl.load(in_ptr0 + (x0), None).to(tl.float32)
tmp2 = tmp0 + tmp1
tl.store(in_out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp2, None)
def get_args():
arg_0 = rand_strided((128, 480, 1, 1), (480, 1, 1, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((480,), (1,), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_convolution_172.run(*args, 61440, grid=grid(61440), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_convolution_172.benchmark_all_configs(*args, 61440, grid=grid(61440))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
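# triton_poi_fused__to_copy_convolution_172 only folds the 480-wide bias into
# the conv output in place; the sigmoid gating happens in the next kernel.
# Eager-mode sketch (helper name illustrative):
def _sketch_bias_add_172(conv_out, bias):
    conv_out += bias.view(1, -1, 1, 1)
    return conv_out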
# kernel path: /tmp/torchinductor_shunting/ce/cce2z44226346eg4knvdzt24sfsujmrpjne4vcofj37miaugwnek.py
# Original ATen: aten.mul, aten.sigmoid, aten.silu
# aten.mul => mul_305
# aten.sigmoid => sigmoid_39
# aten.silu => convert_element_type_246, convert_element_type_247, mul_303, sigmoid_37
triton_poi_fused_mul_sigmoid_silu_173 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[16777216], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3), equal_to_1=())]})
@triton.jit
def triton_poi_fused_mul_sigmoid_silu_173(in_ptr0, in_ptr1, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 12042240
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x2 = xindex
x1 = (xindex // 196)
tmp0 = tl.load(in_ptr0 + (x2), None).to(tl.float32)
tmp5 = tl.load(in_ptr1 + (x1), None).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
tmp2 = tl.sigmoid(tmp1)
tmp3 = tmp1 * tmp2
tmp4 = tmp3.to(tl.float32)
tmp6 = tl.sigmoid(tmp5)
tmp7 = tmp4 * tmp6
tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp7, None)
def get_args():
arg_0 = rand_strided((128, 480, 14, 14), (94080, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 480, 1, 1), (480, 1, 1, 1), device='cuda:0', dtype=torch.float16)
arg_2 = rand_strided((128, 480, 14, 14), (94080, 196, 14, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1, arg_2,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_mul_sigmoid_silu_173.run(*args, 12042240, grid=grid(12042240), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_mul_sigmoid_silu_173.benchmark_all_configs(*args, 12042240, grid=grid(12042240))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
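# The kernel above applies the excitation gate: SiLU on the main
# (128, 480, 14, 14) branch, sigmoid on the per-channel (128, 480, 1, 1)
# logits, and a broadcast multiply over the 14x14 grid. A minimal eager-mode
# sketch, assuming the get_args shapes:
def _sketch_se_gate_173(x, se_logits):
    import torch
    return (torch.nn.functional.silu(x.float()) * torch.sigmoid(se_logits.float())).half()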
# kernel path: /tmp/torchinductor_shunting/jt/cjtarvyynr3tkfvm37qgtcwaksnjyb62otlxal5l5thrlfpz4yoz.py
# Original ATen: aten.split_with_sizes
# aten.split_with_sizes => getitem_260
triton_poi_fused_split_with_sizes_174 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[8388608], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_split_with_sizes_174(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 6021120
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex % 47040
x1 = (xindex // 47040)
x2 = xindex
tmp0 = tl.load(in_ptr0 + (x0 + (94080*x1)), None).to(tl.float32)
tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp0, None)
def get_args():
arg_0 = rand_strided((128, 480, 14, 14), (94080, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 240, 14, 14), (47040, 196, 14, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_split_with_sizes_174.run(*args, 6021120, grid=grid(6021120), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_split_with_sizes_174.benchmark_all_configs(*args, 6021120, grid=grid(6021120))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
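# triton_poi_fused_split_with_sizes_174 materializes the first half of a
# split_with_sizes([240, 240], dim=1): channels 0..239 of the 480-channel
# input are copied into a contiguous buffer (47040 = 240*196 elements per
# sample); the companion kernel _175 below copies channels 240..479 from
# offset 47040. Eager-mode sketch (helper name illustrative):
def _sketch_split_branch_174(x):
    return x[:, :240].contiguous()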
# kernel path: /tmp/torchinductor_shunting/zb/czbuvo3mdbqgu4vhbh2rqqgxj7l4q2oef5prb5xafk4aclfa4wgv.py
# Original ATen: aten.split_with_sizes
# aten.split_with_sizes => getitem_261
triton_poi_fused_split_with_sizes_175 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[8388608], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_split_with_sizes_175(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 6021120
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex % 47040
x1 = (xindex // 47040)
x2 = xindex
tmp0 = tl.load(in_ptr0 + (47040 + x0 + (94080*x1)), None).to(tl.float32)
tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp0, None)
def get_args():
arg_0 = rand_strided((128, 480, 14, 14), (94080, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 240, 14, 14), (47040, 196, 14, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_split_with_sizes_175.run(*args, 6021120, grid=grid(6021120), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_split_with_sizes_175.benchmark_all_configs(*args, 6021120, grid=grid(6021120))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/fo/cfobovxlpvp6vyrqqsfdz6jlgrpfwpemctlxrrgsykw3zih3en3k.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_254
triton_poi_fused__to_copy_176 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[32768], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_176(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 19200
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((80, 240, 1, 1), (240, 1, 1, 1), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((80, 240, 1, 1), (240, 1, 1, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_176.run(*args, 19200, grid=grid(19200), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_176.benchmark_all_configs(*args, 19200, grid=grid(19200))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/66/c66wd4retmi6azmfzfxmll5pmv3btijpcst56cykpuynmmhzhq23.py
# Original ATen: aten.cat
# aten.cat => cat_27
triton_poi_fused_cat_177 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[2097152], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_cat_177(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 2007040
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x2 = xindex
x0 = xindex % 15680
x1 = (xindex // 15680)
tmp0 = tl.load(in_ptr0 + (x2), None).to(tl.float32)
tl.store(out_ptr0 + (x0 + (31360*x1) + tl.zeros([XBLOCK], tl.int32)), tmp0, None)
def get_args():
arg_0 = rand_strided((128, 80, 14, 14), (15680, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 80, 14, 14), (31360, 196, 14, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_cat_177.run(*args, 2007040, grid=grid(2007040), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_cat_177.benchmark_all_configs(*args, 2007040, grid=grid(2007040))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/4x/c4xzenlwjgchxhkagftgwbnakluheaf3fbwuqeozpqdbo7czlfp4.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.add
# aten._native_batch_norm_legit_functional => add_199, add_202, convert_element_type_256, convert_element_type_257, mul_306, mul_312, rsqrt_38, sub_38, var_mean_38
# aten.add => add_203
triton_poi_fused__native_batch_norm_legit_functional_add_178 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp16', 6: '*fp16', 7: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7), equal_to_1=())]})
@triton.jit
def triton_poi_fused__native_batch_norm_legit_functional_add_178(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 4014080
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x3 = xindex
x1 = (xindex // 196) % 160
tmp0 = tl.load(in_ptr0 + (x3), None).to(tl.float32)
tmp2 = tl.load(in_ptr1 + (x1), None)
tmp4 = tl.load(in_ptr2 + (x1), None)
tmp11 = tl.load(in_ptr3 + (x1), None)
tmp13 = tl.load(in_ptr4 + (x1), None)
tmp16 = tl.load(in_ptr5 + (x3), None).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
tmp3 = tmp1 - tmp2
tmp5 = 25088.0
tmp6 = tmp4 / tmp5
tmp7 = 1e-05
tmp8 = tmp6 + tmp7
tmp9 = tl.math.rsqrt(tmp8)
tmp10 = tmp3 * tmp9
tmp12 = tmp10 * tmp11
tmp14 = tmp12 + tmp13
tmp15 = tmp14.to(tl.float32)
tmp17 = tmp15 + tmp16
tl.store(out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp17, None)
def get_args():
arg_0 = rand_strided((128, 160, 14, 14), (31360, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((1, 160, 1, 1), (160, 1, 160, 160), device='cuda:0', dtype=torch.float32)
arg_2 = rand_strided((1, 160, 1, 1), (160, 1, 160, 160), device='cuda:0', dtype=torch.float32)
arg_3 = rand_strided((160,), (1,), device='cuda:0', dtype=torch.float32)
arg_4 = rand_strided((160,), (1,), device='cuda:0', dtype=torch.float32)
arg_5 = rand_strided((128, 160, 14, 14), (31360, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_6 = rand_strided((128, 160, 14, 14), (31360, 196, 14, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__native_batch_norm_legit_functional_add_178.run(*args, 4014080, grid=grid(4014080), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__native_batch_norm_legit_functional_add_178.benchmark_all_configs(*args, 4014080, grid=grid(4014080))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
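# The kernel above is the inverted-residual tail: batch-norm the 160-channel
# projection-conv output with precomputed statistics, then add the skip
# connection (in_ptr5). A minimal eager-mode sketch, assuming the get_args
# shapes:
def _sketch_bn_residual_178(x, mean, var_sum, weight, bias, shortcut):
    import torch
    y = (x.float() - mean) * torch.rsqrt(var_sum / 25088.0 + 1e-5)
    y = (y * weight.view(1, -1, 1, 1) + bias.view(1, -1, 1, 1)).half()
    return y + shortcut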
# kernel path: /tmp/torchinductor_shunting/xt/cxt5mn4i2rohwchtz75fbvbykzhjdnjrf3qopbwsyyqyu5jqlzr6.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_306
triton_poi_fused__to_copy_179 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[262144], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_179(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 153600
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), None)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, None)
def get_args():
arg_0 = rand_strided((960, 160, 1, 1), (160, 1, 1, 1), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((960, 160, 1, 1), (160, 1, 1, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_179.run(*args, 153600, grid=grid(153600), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_179.benchmark_all_configs(*args, 153600, grid=grid(153600))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/g5/cg527ugqaap3ndd26uaeekojscsqxex2xmpgnu55nr6zwyzt6pfr.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => add_237, add_238, add_239, convert_element_type_307, mul_364, mul_365, mul_366, mul_367, mul_368, rsqrt_45, squeeze_136, var_mean_45
triton_red_fused__native_batch_norm_legit_functional_180 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@reduction(
size_hints=[1024, 32768],
reduction_hint=ReductionHint.INNER,
filename=__file__,
meta={'signature': {0: '*fp32', 1: '*fp16', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: 'i32', 9: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_180(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, out_ptr0, out_ptr1, out_ptr2, out_ptr3, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
xnumel = 960
rnumel = 25088
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
xmask = xindex < xnumel
rbase = tl.arange(0, RBLOCK)[None, :]
x0 = xindex
_tmp2 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
for roffset in range(0, rnumel, RBLOCK):
rindex = roffset + rbase
rmask = rindex < rnumel
r1 = rindex % 196
r2 = (rindex // 196)
tmp0 = tl.load(in_ptr0 + (r1 + (196*x0) + (188160*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
_tmp2 = tl.where(rmask & xmask, _tmp2 + tmp1, _tmp2)
tmp2 = tl.sum(_tmp2, 1)[:, None]
tmp7 = tl.load(in_ptr1 + (x0), xmask)
tmp3 = 25088.0
tmp4 = tmp2 / tmp3
tmp5 = 0.1
tmp6 = tmp4 * tmp5
tmp8 = 0.9
tmp9 = tmp7 * tmp8
tmp10 = tmp6 + tmp9
tl.store(in_out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp4, xmask)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp10, xmask)
_tmp15 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
for roffset in range(0, rnumel, RBLOCK):
rindex = roffset + rbase
rmask = rindex < rnumel
r1 = rindex % 196
r2 = (rindex // 196)
tmp11 = tl.load(in_ptr0 + (r1 + (196*x0) + (188160*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp12 = tmp11.to(tl.float32)
tmp13 = tmp12 - tmp4
tmp14 = tmp13 * tmp13
_tmp15 = tl.where(rmask & xmask, _tmp15 + tmp14, _tmp15)
tmp15 = tl.sum(_tmp15, 1)[:, None]
tl.store(out_ptr1 + x0, tmp15, xmask)
tmp25 = tl.load(in_ptr2 + (x0), xmask)
tmp16 = 25088.0
tmp17 = tmp15 / tmp16
tmp18 = 1e-05
tmp19 = tmp17 + tmp18
tmp20 = tl.math.rsqrt(tmp19)
tmp21 = 1.0000398612827361
tmp22 = tmp17 * tmp21
tmp23 = 0.1
tmp24 = tmp22 * tmp23
tmp26 = 0.9
tmp27 = tmp25 * tmp26
tmp28 = tmp24 + tmp27
tl.store(out_ptr2 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp20, xmask)
tl.store(out_ptr3 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp28, xmask)
def get_args():
arg_0 = rand_strided((1, 960, 1, 1), (960, 1, 960, 960), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((128, 960, 14, 14), (188160, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_2 = rand_strided((960,), (1,), device='cuda:0', dtype=torch.float32)
arg_3 = rand_strided((960,), (1,), device='cuda:0', dtype=torch.float32)
arg_4 = rand_strided((960,), (1,), device='cuda:0', dtype=torch.float32)
arg_5 = rand_strided((1, 960, 1, 1), (960, 1, 960, 960), device='cuda:0', dtype=torch.float32)
arg_6 = rand_strided((960,), (1,), device='cuda:0', dtype=torch.float32)
arg_7 = rand_strided((960,), (1,), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_red_fused__native_batch_norm_legit_functional_180.run(*args, 960, 25088, grid=grid(960), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_red_fused__native_batch_norm_legit_functional_180.benchmark_all_configs(*args, 960, 25088, grid=grid(960))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
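# The reduction above makes two passes over the 25088 = 128*14*14 elements of
# each of the 960 channels: pass one accumulates the mean (and its
# momentum-0.1 running update), pass two the variance, rsqrt(var + 1e-5), and
# the running-variance update. The constant 1.0000398612827361 is the
# unbiased correction 25088 / 25087 applied before folding into running_var.
# A minimal eager-mode sketch of the statistics (helper name illustrative):
def _sketch_bn_stats_180(x, running_mean, running_var):
    import torch
    n = x.shape[0] * x.shape[2] * x.shape[3]            # 128 * 14 * 14 = 25088
    mean = x.float().mean(dim=(0, 2, 3))
    var = x.float().var(dim=(0, 2, 3), unbiased=False)  # biased, as in the kernel
    new_rm = 0.1 * mean + 0.9 * running_mean
    new_rv = 0.1 * var * (n / (n - 1)) + 0.9 * running_var
    return mean, torch.rsqrt(var + 1e-5), new_rm, new_rv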
# kernel path: /tmp/torchinductor_shunting/dy/cdyo4fykv4n564s3r3eahdxt2kdz67oeovon5bt5bw2lnq6iiowx.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.add, aten.clone, aten.fill, aten.mul, aten.sigmoid, aten.sub
# aten._native_batch_norm_legit_functional => add_237, add_240, convert_element_type_307, convert_element_type_308, mul_363, mul_369, rsqrt_45, sub_45, var_mean_45
# aten.add => add_322
# aten.clone => clone_36
# aten.fill => full_like_11
# aten.mul => mul_627, mul_628
# aten.sigmoid => sigmoid_75
# aten.sub => sub_121
triton_poi_fused__native_batch_norm_legit_functional_add_clone_fill_mul_sigmoid_sub_181 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[33554432], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp16', 6: '*fp16', 7: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7), equal_to_1=())]})
@triton.jit
def triton_poi_fused__native_batch_norm_legit_functional_add_clone_fill_mul_sigmoid_sub_181(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, out_ptr0, out_ptr1, xnumel, XBLOCK : tl.constexpr):
xnumel = 24084480
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x3 = xindex
x1 = (xindex // 196) % 960
tmp0 = tl.load(in_ptr0 + (x3), None).to(tl.float32)
tmp2 = tl.load(in_ptr1 + (x1), None)
tmp4 = tl.load(in_ptr2 + (x1), None)
tmp11 = tl.load(in_ptr3 + (x1), None)
tmp13 = tl.load(in_ptr4 + (x1), None)
tmp1 = tmp0.to(tl.float32)
tmp3 = tmp1 - tmp2
tmp5 = 25088.0
tmp6 = tmp4 / tmp5
tmp7 = 1e-05
tmp8 = tmp6 + tmp7
tmp9 = tl.math.rsqrt(tmp8)
tmp10 = tmp3 * tmp9
tmp12 = tmp10 * tmp11
tmp14 = tmp12 + tmp13
tmp15 = tmp14.to(tl.float32)
tmp16 = tl.sigmoid(tmp15)
tmp17 = 1.0
tmp18 = tmp17 - tmp16
tmp19 = tmp15 * tmp18
tmp20 = tmp19 + tmp17
tmp21 = tmp16 * tmp20
tl.store(out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp15, None)
tl.store(out_ptr1 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp21, None)
def get_args():
arg_0 = rand_strided((128, 960, 14, 14), (188160, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((1, 960, 1, 1), (960, 1, 960, 960), device='cuda:0', dtype=torch.float32)
arg_2 = rand_strided((1, 960, 1, 1), (960, 1, 960, 960), device='cuda:0', dtype=torch.float32)
arg_3 = rand_strided((960,), (1,), device='cuda:0', dtype=torch.float32)
arg_4 = rand_strided((960,), (1,), device='cuda:0', dtype=torch.float32)
arg_5 = rand_strided((128, 960, 14, 14), (188160, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_6 = rand_strided((128, 960, 14, 14), (188160, 196, 14, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__native_batch_norm_legit_functional_add_clone_fill_mul_sigmoid_sub_181.run(*args, 24084480, grid=grid(24084480), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__native_batch_norm_legit_functional_add_clone_fill_mul_sigmoid_sub_181.benchmark_all_configs(*args, 24084480, grid=grid(24084480))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
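# Besides the batch-norm output, the kernel above precomputes
# sigmoid(y) * (1 + y * (1 - sigmoid(y))), the derivative of SiLU at y, which
# the fill/sub/clone ops in the fused node suggest is saved for the backward
# pass. A minimal eager-mode sketch, assuming the get_args shapes:
def _sketch_bn_silu_grad_181(x, mean, var_sum, weight, bias):
    import torch
    y = (x.float() - mean) * torch.rsqrt(var_sum / 25088.0 + 1e-5)
    y = (y * weight.view(1, -1, 1, 1) + bias.view(1, -1, 1, 1)).half()
    s = torch.sigmoid(y)
    return y, s * (1 + y * (1 - s))   # value and d(silu)/dy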
# kernel path: /tmp/torchinductor_shunting/3n/c3n35ckjp4qebx5kiwvfxztqxmlk7uwm2vsaphijnyencaajb4gh.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_311
triton_poi_fused__to_copy_182 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4096], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_182(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 2160
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((240, 1, 3, 3), (9, 9, 3, 1), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((240, 1, 3, 3), (9, 9, 3, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_182.run(*args, 2160, grid=grid(2160), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_182.benchmark_all_configs(*args, 2160, grid=grid(2160))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/ow/cowqeeeuzgoqgwjjram7xczyaz465plzun33c7elszbys2k2zrzn.py
# Original ATen: aten.split_with_sizes
# aten.split_with_sizes => getitem_330
triton_poi_fused_split_with_sizes_183 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[8388608], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_split_with_sizes_183(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 6021120
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex % 47040
x1 = (xindex // 47040)
x2 = xindex
tmp0 = tl.load(in_ptr0 + (x0 + (188160*x1)), None).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
tmp2 = tl.sigmoid(tmp1)
tmp3 = tmp1 * tmp2
tmp4 = tmp3.to(tl.float32)
tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp4, None)
def get_args():
arg_0 = rand_strided((128, 960, 14, 14), (188160, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 240, 14, 14), (47040, 196, 14, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_split_with_sizes_183.run(*args, 6021120, grid=grid(6021120), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_split_with_sizes_183.benchmark_all_configs(*args, 6021120, grid=grid(6021120))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
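# triton_poi_fused_split_with_sizes_183 fuses a slice of the 960-channel
# expansion with its SiLU activation: channels 0..239 feed the first of the
# grouped depthwise convs (the surrounding _to_copy kernels cast 3x3, 5x5,
# 7x7, and 9x9 depthwise weights). Kernels _185, _187, and _189 below handle
# the slices starting at channels 240, 480, and 720 (offsets 47040, 94080,
# 141120). Eager-mode sketch (helper name illustrative):
def _sketch_split_silu_183(x):
    import torch
    return torch.nn.functional.silu(x[:, :240].float()).half()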
# kernel path: /tmp/torchinductor_shunting/qx/cqxoadvgndjsqu4bltg3sq7l3fij2isfiihadwqnkb3ibmloxfyz.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_312
triton_poi_fused__to_copy_184 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[8192], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_184(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 6000
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((240, 1, 5, 5), (25, 25, 5, 1), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((240, 1, 5, 5), (25, 25, 5, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_184.run(*args, 6000, grid=grid(6000), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_184.benchmark_all_configs(*args, 6000, grid=grid(6000))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/vk/cvkedu5ynhqgwwdblwjx7rst7rz42xs7fl3vaof4iorofkc5cxam.py
# Original ATen: aten.split_with_sizes
# aten.split_with_sizes => getitem_335
triton_poi_fused_split_with_sizes_185 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[8388608], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_split_with_sizes_185(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 6021120
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex % 47040
x1 = (xindex // 47040)
x2 = xindex
tmp0 = tl.load(in_ptr0 + (47040 + x0 + (188160*x1)), None).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
tmp2 = tl.sigmoid(tmp1)
tmp3 = tmp1 * tmp2
tmp4 = tmp3.to(tl.float32)
tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp4, None)
def get_args():
arg_0 = rand_strided((128, 960, 14, 14), (188160, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 240, 14, 14), (47040, 196, 14, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_split_with_sizes_185.run(*args, 6021120, grid=grid(6021120), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_split_with_sizes_185.benchmark_all_configs(*args, 6021120, grid=grid(6021120))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/nv/cnv3es7i7p4naxsxsc56c2w7slsaxha5zh6gcrrgizefpo7zps5s.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_313
triton_poi_fused__to_copy_186 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[16384], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_186(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 11760
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((240, 1, 7, 7), (49, 49, 7, 1), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((240, 1, 7, 7), (49, 49, 7, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_186.run(*args, 11760, grid=grid(11760), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_186.benchmark_all_configs(*args, 11760, grid=grid(11760))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/3q/c3qkjjd4hxw6ylyfib2kl5k6u5atp2kcpplda3gzr6acmswhzitj.py
# Original ATen: aten.split_with_sizes
# aten.split_with_sizes => getitem_340
triton_poi_fused_split_with_sizes_187 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[8388608], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_split_with_sizes_187(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 6021120
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex % 47040
x1 = (xindex // 47040)
x2 = xindex
tmp0 = tl.load(in_ptr0 + (94080 + x0 + (188160*x1)), None).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
tmp2 = tl.sigmoid(tmp1)
tmp3 = tmp1 * tmp2
tmp4 = tmp3.to(tl.float32)
tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp4, None)
def get_args():
arg_0 = rand_strided((128, 960, 14, 14), (188160, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 240, 14, 14), (47040, 196, 14, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_split_with_sizes_187.run(*args, 6021120, grid=grid(6021120), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_split_with_sizes_187.benchmark_all_configs(*args, 6021120, grid=grid(6021120))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/l7/cl7v5hh5jck72nsc2u3t2ieraqxarpadxdpy5nt2hkbbqrgcinn7.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_314
triton_poi_fused__to_copy_188 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[32768], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_188(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 19440
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((240, 1, 9, 9), (81, 81, 9, 1), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((240, 1, 9, 9), (81, 81, 9, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_188.run(*args, 19440, grid=grid(19440), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_188.benchmark_all_configs(*args, 19440, grid=grid(19440))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
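# Every triton_poi_fused__to_copy_* kernel in this file follows the pattern
# above: an elementwise fp32 -> fp16 cast of a parameter tensor for
# mixed-precision compute. A one-line eager sketch (helper name assumed):
def _eager_sketch_to_copy_188(weight_fp32):
    # weight_fp32: (240, 1, 9, 9) depthwise conv weight, as in get_args
    return weight_fp32.to(torch.float16)  # aten._to_copy => convert_element_type_314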
# kernel path: /tmp/torchinductor_shunting/xm/cxm7drxzaido67j4aqf4vqqzkyw7ffgx32ygj3ddi3kmzigoauhg.py
# Original ATen: aten.split_with_sizes
# aten.split_with_sizes => getitem_345
triton_poi_fused_split_with_sizes_189 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[8388608], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_split_with_sizes_189(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 6021120
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex % 47040
x1 = (xindex // 47040)
x2 = xindex
tmp0 = tl.load(in_ptr0 + (141120 + x0 + (188160*x1)), None).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
tmp2 = tl.sigmoid(tmp1)
tmp3 = tmp1 * tmp2
tmp4 = tmp3.to(tl.float32)
tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp4, None)
def get_args():
arg_0 = rand_strided((128, 960, 14, 14), (188160, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 240, 14, 14), (47040, 196, 14, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_split_with_sizes_189.run(*args, 6021120, grid=grid(6021120), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_split_with_sizes_189.benchmark_all_configs(*args, 6021120, grid=grid(6021120))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/ny/cnyr36xeex2s2grdckurzoliwfqkdakrsvrmgx5fd6rzjncjxedb.py
# Original ATen: aten.cat
# aten.cat => cat_34
triton_poi_fused_cat_190 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[2097152], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_cat_190(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 1505280
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x2 = xindex
x0 = xindex % 11760
x1 = (xindex // 11760)
tmp0 = tl.load(in_ptr0 + (x2), None).to(tl.float32)
tl.store(out_ptr0 + (x0 + (47040*x1) + tl.zeros([XBLOCK], tl.int32)), tmp0, None)
def get_args():
arg_0 = rand_strided((128, 240, 7, 7), (11760, 49, 7, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 240, 7, 7), (47040, 49, 7, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_cat_190.run(*args, 1505280, grid=grid(1505280), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_cat_190.benchmark_all_configs(*args, 1505280, grid=grid(1505280))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
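# Sketch of the cat kernel above (helper name assumed): it copies one
# contiguous (128, 240, 7, 7) chunk into a strided view of a preallocated
# (128, 960, 7, 7) concat buffer; 47040 = 960 * 49 is the buffer's batch
# stride and 11760 = 240 * 49 the chunk's. In this standalone harness the
# view starts at channel 0; sibling kernels fill the remaining channels of
# cat_34.
def _eager_sketch_cat_190(chunk, out):
    # chunk: (128, 240, 7, 7) fp16, contiguous; out: (128, 960, 7, 7) fp16
    out[:, 0:240].copy_(chunk)
    return out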
# kernel path: /tmp/torchinductor_shunting/z6/cz6ieqk3sfzltam3yeijfwdqrtao7w4zqdsqj32cpu6zoxg3irdq.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => add_242, add_243, add_244, convert_element_type_315, mul_372, mul_373, mul_374, mul_375, mul_376, rsqrt_46, squeeze_139, var_mean_46
triton_red_fused__native_batch_norm_legit_functional_191 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@reduction(
size_hints=[1024, 8192],
reduction_hint=ReductionHint.INNER,
filename=__file__,
meta={'signature': {0: '*fp32', 1: '*fp16', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: 'i32', 9: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_191(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, out_ptr0, out_ptr1, out_ptr2, out_ptr3, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
xnumel = 960
rnumel = 6272
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
xmask = xindex < xnumel
rbase = tl.arange(0, RBLOCK)[None, :]
x0 = xindex
_tmp2 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
for roffset in range(0, rnumel, RBLOCK):
rindex = roffset + rbase
rmask = rindex < rnumel
r1 = rindex % 49
r2 = (rindex // 49)
tmp0 = tl.load(in_ptr0 + (r1 + (49*x0) + (47040*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
_tmp2 = tl.where(rmask & xmask, _tmp2 + tmp1, _tmp2)
tmp2 = tl.sum(_tmp2, 1)[:, None]
    tmp7 = tl.load(in_ptr1 + (x0), xmask)  # running mean
    tmp3 = 6272.0  # elements reduced per channel: N*H*W = 128*7*7
    tmp4 = tmp2 / tmp3  # batch mean
    tmp5 = 0.1  # batch-norm momentum
    tmp6 = tmp4 * tmp5
    tmp8 = 0.9  # 1 - momentum
    tmp9 = tmp7 * tmp8
    tmp10 = tmp6 + tmp9  # updated running mean
    tl.store(in_out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp4, xmask)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp10, xmask)
_tmp15 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
for roffset in range(0, rnumel, RBLOCK):
rindex = roffset + rbase
rmask = rindex < rnumel
r1 = rindex % 49
r2 = (rindex // 49)
tmp11 = tl.load(in_ptr0 + (r1 + (49*x0) + (47040*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp12 = tmp11.to(tl.float32)
tmp13 = tmp12 - tmp4
tmp14 = tmp13 * tmp13
_tmp15 = tl.where(rmask & xmask, _tmp15 + tmp14, _tmp15)
tmp15 = tl.sum(_tmp15, 1)[:, None]
tl.store(out_ptr1 + x0, tmp15, xmask)
    tmp25 = tl.load(in_ptr2 + (x0), xmask)  # running variance
    tmp16 = 6272.0
    tmp17 = tmp15 / tmp16  # biased batch variance
    tmp18 = 1e-05  # eps
    tmp19 = tmp17 + tmp18
    tmp20 = tl.math.rsqrt(tmp19)  # rstd = 1/sqrt(var + eps)
    tmp21 = 1.0001594642002871  # Bessel correction: 6272 / 6271
    tmp22 = tmp17 * tmp21  # unbiased variance
    tmp23 = 0.1  # batch-norm momentum
    tmp24 = tmp22 * tmp23
    tmp26 = 0.9
    tmp27 = tmp25 * tmp26
    tmp28 = tmp24 + tmp27  # updated running variance
    tl.store(out_ptr2 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp20, xmask)
    tl.store(out_ptr3 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp28, xmask)
def get_args():
arg_0 = rand_strided((1, 960, 1, 1), (960, 1, 960, 960), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((128, 960, 7, 7), (47040, 49, 7, 1), device='cuda:0', dtype=torch.float16)
arg_2 = rand_strided((960,), (1,), device='cuda:0', dtype=torch.float32)
arg_3 = rand_strided((960,), (1,), device='cuda:0', dtype=torch.float32)
arg_4 = rand_strided((960,), (1,), device='cuda:0', dtype=torch.float32)
arg_5 = rand_strided((1, 960, 1, 1), (960, 1, 960, 960), device='cuda:0', dtype=torch.float32)
arg_6 = rand_strided((960,), (1,), device='cuda:0', dtype=torch.float32)
arg_7 = rand_strided((960,), (1,), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_red_fused__native_batch_norm_legit_functional_191.run(*args, 960, 6272, grid=grid(960), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_red_fused__native_batch_norm_legit_functional_191.benchmark_all_configs(*args, 960, 6272, grid=grid(960))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
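# The reduction kernel above is the statistics half of a training-mode batch
# norm: two passes over N*H*W = 128 * 7 * 7 = 6272 elements per channel give
# the batch mean and variance, and both running statistics are updated with
# momentum 0.1. An eager sketch under those assumptions (helper name assumed):
def _eager_sketch_bn_stats_191(x, running_mean, running_var, momentum=0.1, eps=1e-05):
    # x: (128, 960, 7, 7) fp16; statistics accumulate in fp32
    n = x.shape[0] * x.shape[2] * x.shape[3]  # 6272
    xf = x.float()
    mean = xf.mean(dim=(0, 2, 3))
    var = xf.var(dim=(0, 2, 3), unbiased=False)
    rstd = torch.rsqrt(var + eps)
    new_running_mean = momentum * mean + (1 - momentum) * running_mean
    unbiased_var = var * (n / (n - 1))  # 6272 / 6271 = 1.0001594642002871
    new_running_var = momentum * unbiased_var + (1 - momentum) * running_var
    return mean, rstd, new_running_mean, new_running_var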
# kernel path: /tmp/torchinductor_shunting/6y/c6ybh5xdfuwq6wdolxlmurgpxtgtn6q5shvsfwt5gf4rdpz7tas7.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.mean, aten.silu
# aten._native_batch_norm_legit_functional => add_242, add_245, convert_element_type_315, convert_element_type_316, mul_371, mul_377, rsqrt_46, sub_46, var_mean_46
# aten.mean => mean_12
# aten.silu => convert_element_type_317, convert_element_type_318, mul_378, sigmoid_49
triton_per_fused__native_batch_norm_legit_functional_mean_silu_192 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import persistent_reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@persistent_reduction(
size_hints=[131072, 64],
reduction_hint=ReductionHint.INNER,
filename=__file__,
meta={'signature': {0: '*fp16', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp16', 6: '*fp16', 7: 'i32', 8: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7), equal_to_1=())]}
)
@triton.jit
def triton_per_fused__native_batch_norm_legit_functional_mean_silu_192(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, out_ptr0, out_ptr2, xnumel, rnumel, XBLOCK : tl.constexpr):
xnumel = 122880
rnumel = 49
RBLOCK: tl.constexpr = 64
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
xmask = xindex < xnumel
rindex = tl.arange(0, RBLOCK)[None, :]
rmask = rindex < rnumel
r2 = rindex
x3 = xindex
x0 = xindex % 960
tmp0 = tl.load(in_ptr0 + (r2 + (49*x3)), rmask, other=0).to(tl.float32)
tmp2 = tl.load(in_ptr1 + (x0), None)
tmp4 = tl.load(in_ptr2 + (x0), None)
tmp11 = tl.load(in_ptr3 + (x0), None)
tmp13 = tl.load(in_ptr4 + (x0), None)
tmp1 = tmp0.to(tl.float32)
tmp3 = tmp1 - tmp2
tmp5 = 6272.0
tmp6 = tmp4 / tmp5
tmp7 = 1e-05
tmp8 = tmp6 + tmp7
tmp9 = tl.math.rsqrt(tmp8)
tmp10 = tmp3 * tmp9
tmp12 = tmp10 * tmp11
tmp14 = tmp12 + tmp13
tmp15 = tmp14.to(tl.float32)
tmp16 = tmp15.to(tl.float32)
tmp17 = tl.sigmoid(tmp16)
tmp18 = tmp16 * tmp17
tmp19 = tmp18.to(tl.float32)
tmp20 = tmp19.to(tl.float32)
tmp22 = tl.where(rmask, tmp20, 0)
tmp23 = tl.sum(tmp22, 1)[:, None]
tmp24 = 49.0
tmp25 = tmp23 / tmp24
tmp26 = tmp25.to(tl.float32)
tl.store(out_ptr0 + (r2 + (49*x3) + tl.zeros([XBLOCK, RBLOCK], tl.int32)), tmp15, rmask)
tl.store(out_ptr2 + (x3 + tl.zeros([XBLOCK, 1], tl.int32)), tmp26, None)
def get_args():
arg_0 = rand_strided((128, 960, 7, 7), (47040, 49, 7, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((1, 960, 1, 1), (960, 1, 960, 960), device='cuda:0', dtype=torch.float32)
arg_2 = rand_strided((1, 960, 1, 1), (960, 1, 960, 960), device='cuda:0', dtype=torch.float32)
arg_3 = rand_strided((960,), (1,), device='cuda:0', dtype=torch.float32)
arg_4 = rand_strided((960,), (1,), device='cuda:0', dtype=torch.float32)
arg_5 = rand_strided((128, 960, 7, 7), (47040, 49, 7, 1), device='cuda:0', dtype=torch.float16)
arg_6 = rand_strided((128, 960, 1, 1), (960, 1, 1, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_per_fused__native_batch_norm_legit_functional_mean_silu_192.run(*args, 122880, 49, grid=grid(122880), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_per_fused__native_batch_norm_legit_functional_mean_silu_192.benchmark_all_configs(*args, 122880, 49, grid=grid(122880))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
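# Sketch of the persistent reduction above (helper name assumed): it applies
# the batch-norm affine transform, takes SiLU, and reduces the 7x7 spatial
# dims to the per-(batch, channel) mean that feeds the squeeze-excite block.
# mean and var_sum here are per-channel fp32 vectors (the (1, C, 1, 1)
# buffers above, flattened); var_sum is the raw sum of squared deviations,
# divided by N*H*W = 6272 inside the kernel.
def _eager_sketch_bn_silu_mean_192(x, mean, var_sum, weight, bias, eps=1e-05):
    # x: (128, 960, 7, 7) fp16
    rstd = torch.rsqrt(var_sum / 6272.0 + eps)
    y = ((x.float() - mean[None, :, None, None]) * rstd[None, :, None, None]
         * weight[None, :, None, None] + bias[None, :, None, None]).half()
    act = y.float() * torch.sigmoid(y.float())  # silu
    pooled = act.mean(dim=(2, 3), keepdim=True).half()  # aten.mean => mean_12
    return y, pooled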
# kernel path: /tmp/torchinductor_shunting/tl/ctlao5qdd2i3xcmliczx6rwjunhhywrgkmmolniswyndldbl73bm.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_320
triton_poi_fused__to_copy_193 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[131072], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_193(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 76800
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((80, 960, 1, 1), (960, 1, 1, 1), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((80, 960, 1, 1), (960, 1, 1, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_193.run(*args, 76800, grid=grid(76800), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_193.benchmark_all_configs(*args, 76800, grid=grid(76800))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/b4/cb4kobddnmpxrqkwjrmqktjtkp5r7xvtzsia2m5pxlvyngvl3gez.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_324
triton_poi_fused__to_copy_194 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[131072], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_194(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 76800
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((960, 80, 1, 1), (80, 1, 1, 1), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((960, 80, 1, 1), (80, 1, 1, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_194.run(*args, 76800, grid=grid(76800), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_194.benchmark_all_configs(*args, 76800, grid=grid(76800))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/5n/c5nkvlxqc6zdiwvvnuciiwj3rs2idxp5remuu4oe4qksftnrsipg.py
# Original ATen: aten._to_copy, aten.convolution
# aten._to_copy => convert_element_type_323
# aten.convolution => convolution_125
triton_poi_fused__to_copy_convolution_195 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[1024], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_convolution_195(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 960
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((960,), (1,), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((960,), (1,), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_convolution_195.run(*args, 960, grid=grid(960), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_convolution_195.benchmark_all_configs(*args, 960, grid=grid(960))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/5g/c5gxudeqor4giltbrwmymcii3cqccfm32ibjrioytb2cth76irpn.py
# Original ATen: aten._to_copy, aten.convolution
# aten._to_copy => convert_element_type_323
# aten.convolution => convolution_125
triton_poi_fused__to_copy_convolution_196 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[131072], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_convolution_196(in_out_ptr0, in_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 122880
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x2 = xindex
x0 = xindex % 960
tmp0 = tl.load(in_out_ptr0 + (x2), None).to(tl.float32)
tmp1 = tl.load(in_ptr0 + (x0), None).to(tl.float32)
tmp2 = tmp0 + tmp1
tl.store(in_out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp2, None)
def get_args():
arg_0 = rand_strided((128, 960, 1, 1), (960, 1, 1, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((960,), (1,), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_convolution_196.run(*args, 122880, grid=grid(122880), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_convolution_196.benchmark_all_configs(*args, 122880, grid=grid(122880))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
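# Kernels 195 and 196 together handle the bias of the 1x1 squeeze-excite
# convolution (convolution_125): 195 casts the fp32 bias to fp16, and 196
# adds it in place to the conv output. A short eager sketch (helper name
# assumed; the pairing is inferred from the shared ATen annotations):
def _eager_sketch_conv_bias_196(conv_out, bias_fp32):
    # conv_out: (128, 960, 1, 1) fp16; bias_fp32: (960,) fp32
    conv_out += bias_fp32.to(torch.float16)[None, :, None, None]
    return conv_out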
# kernel path: /tmp/torchinductor_shunting/u5/cu5blwuhanuznefsomv5iwcn6emdzdpn2ln3p27yycjvkiw6g2du.py
# Original ATen: aten.mul, aten.sigmoid, aten.silu
# aten.mul => mul_380
# aten.sigmoid => sigmoid_51
# aten.silu => convert_element_type_317, convert_element_type_318, mul_378, sigmoid_49
triton_poi_fused_mul_sigmoid_silu_197 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[8388608], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3), equal_to_1=())]})
@triton.jit
def triton_poi_fused_mul_sigmoid_silu_197(in_ptr0, in_ptr1, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 6021120
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x2 = xindex
x1 = (xindex // 49)
tmp0 = tl.load(in_ptr0 + (x2), None).to(tl.float32)
tmp5 = tl.load(in_ptr1 + (x1), None).to(tl.float32)
    tmp1 = tmp0.to(tl.float32)
    tmp2 = tl.sigmoid(tmp1)
    tmp3 = tmp1 * tmp2  # silu(x) = x * sigmoid(x), computed in fp32
    tmp4 = tmp3.to(tl.float32)
    tmp6 = tl.sigmoid(tmp5)  # squeeze-excite gate, broadcast over H and W
    tmp7 = tmp4 * tmp6
tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp7, None)
def get_args():
arg_0 = rand_strided((128, 960, 7, 7), (47040, 49, 7, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 960, 1, 1), (960, 1, 1, 1), device='cuda:0', dtype=torch.float16)
arg_2 = rand_strided((128, 960, 7, 7), (47040, 49, 7, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1, arg_2,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_mul_sigmoid_silu_197.run(*args, 6021120, grid=grid(6021120), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_mul_sigmoid_silu_197.benchmark_all_configs(*args, 6021120, grid=grid(6021120))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
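# Sketch of the squeeze-excite gating kernel above (helper name assumed):
# silu(x) is recomputed from the pre-activation rather than reloaded, then
# scaled by the sigmoid of the SE logits, broadcast over the spatial dims.
def _eager_sketch_se_gate_197(x, se):
    # x: (128, 960, 7, 7) fp16; se: (128, 960, 1, 1) fp16
    act = (x.float() * torch.sigmoid(x.float())).half()  # silu
    return act * torch.sigmoid(se)  # aten.mul => mul_380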
# kernel path: /tmp/torchinductor_shunting/ow/cow7m27226bnhxi6lshzaavlwewllidrdyuy4wj4ablq5nvozskp.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_325
triton_poi_fused__to_copy_198 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[262144], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_198(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 253440
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((264, 960, 1, 1), (960, 1, 1, 1), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((264, 960, 1, 1), (960, 1, 1, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_198.run(*args, 253440, grid=grid(253440), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_198.benchmark_all_configs(*args, 253440, grid=grid(253440))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/zi/czig3biynqg6sd4t6h2tmyqez5ybs7vrp4v72f35wteotcxxgft7.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => add_247, add_248, add_249, convert_element_type_326, mul_382, mul_383, mul_384, mul_385, mul_386, rsqrt_47, squeeze_142, var_mean_47
triton_red_fused__native_batch_norm_legit_functional_199 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@reduction(
size_hints=[512, 8192],
reduction_hint=ReductionHint.INNER,
filename=__file__,
meta={'signature': {0: '*fp32', 1: '*fp16', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: 'i32', 9: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 9), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_199(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, out_ptr0, out_ptr1, out_ptr2, out_ptr3, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
xnumel = 264
rnumel = 6272
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
xmask = xindex < xnumel
rbase = tl.arange(0, RBLOCK)[None, :]
x0 = xindex
_tmp2 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
for roffset in range(0, rnumel, RBLOCK):
rindex = roffset + rbase
rmask = rindex < rnumel
r1 = rindex % 49
r2 = (rindex // 49)
tmp0 = tl.load(in_ptr0 + (r1 + (49*x0) + (12936*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
_tmp2 = tl.where(rmask & xmask, _tmp2 + tmp1, _tmp2)
tmp2 = tl.sum(_tmp2, 1)[:, None]
tmp7 = tl.load(in_ptr1 + (x0), xmask)
tmp3 = 6272.0
tmp4 = tmp2 / tmp3
tmp5 = 0.1
tmp6 = tmp4 * tmp5
tmp8 = 0.9
tmp9 = tmp7 * tmp8
tmp10 = tmp6 + tmp9
tl.store(in_out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp4, xmask)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp10, xmask)
_tmp15 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
for roffset in range(0, rnumel, RBLOCK):
rindex = roffset + rbase
rmask = rindex < rnumel
r1 = rindex % 49
r2 = (rindex // 49)
tmp11 = tl.load(in_ptr0 + (r1 + (49*x0) + (12936*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp12 = tmp11.to(tl.float32)
tmp13 = tmp12 - tmp4
tmp14 = tmp13 * tmp13
_tmp15 = tl.where(rmask & xmask, _tmp15 + tmp14, _tmp15)
tmp15 = tl.sum(_tmp15, 1)[:, None]
tl.store(out_ptr1 + x0, tmp15, xmask)
tmp25 = tl.load(in_ptr2 + (x0), xmask)
tmp16 = 6272.0
tmp17 = tmp15 / tmp16
tmp18 = 1e-05
tmp19 = tmp17 + tmp18
tmp20 = tl.math.rsqrt(tmp19)
tmp21 = 1.0001594642002871
tmp22 = tmp17 * tmp21
tmp23 = 0.1
tmp24 = tmp22 * tmp23
tmp26 = 0.9
tmp27 = tmp25 * tmp26
tmp28 = tmp24 + tmp27
tl.store(out_ptr2 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp20, xmask)
tl.store(out_ptr3 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp28, xmask)
def get_args():
arg_0 = rand_strided((1, 264, 1, 1), (264, 1, 264, 264), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((128, 264, 7, 7), (12936, 49, 7, 1), device='cuda:0', dtype=torch.float16)
arg_2 = rand_strided((264,), (1,), device='cuda:0', dtype=torch.float32)
arg_3 = rand_strided((264,), (1,), device='cuda:0', dtype=torch.float32)
arg_4 = rand_strided((264,), (1,), device='cuda:0', dtype=torch.float32)
arg_5 = rand_strided((1, 264, 1, 1), (264, 1, 264, 264), device='cuda:0', dtype=torch.float32)
arg_6 = rand_strided((264,), (1,), device='cuda:0', dtype=torch.float32)
arg_7 = rand_strided((264,), (1,), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_red_fused__native_batch_norm_legit_functional_199.run(*args, 264, 6272, grid=grid(264), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_red_fused__native_batch_norm_legit_functional_199.benchmark_all_configs(*args, 264, 6272, grid=grid(264))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/rz/crzhphlanp4xljhlyhd5oa64x25m2selaazalafwkl2z37flecta.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => add_247, add_250, convert_element_type_326, convert_element_type_327, mul_381, mul_387, rsqrt_47, sub_47, var_mean_47
triton_poi_fused__native_batch_norm_legit_functional_200 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[2097152], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp16', 6: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6), equal_to_1=())]})
@triton.jit
def triton_poi_fused__native_batch_norm_legit_functional_200(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 1655808
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x3 = xindex
x1 = (xindex // 49) % 264
tmp0 = tl.load(in_ptr0 + (x3), xmask).to(tl.float32)
tmp2 = tl.load(in_ptr1 + (x1), xmask)
tmp4 = tl.load(in_ptr2 + (x1), xmask)
tmp11 = tl.load(in_ptr3 + (x1), xmask)
tmp13 = tl.load(in_ptr4 + (x1), xmask)
tmp1 = tmp0.to(tl.float32)
tmp3 = tmp1 - tmp2
tmp5 = 6272.0
tmp6 = tmp4 / tmp5
tmp7 = 1e-05
tmp8 = tmp6 + tmp7
tmp9 = tl.math.rsqrt(tmp8)
tmp10 = tmp3 * tmp9
tmp12 = tmp10 * tmp11
tmp14 = tmp12 + tmp13
tmp15 = tmp14.to(tl.float32)
tl.store(out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp15, xmask)
def get_args():
arg_0 = rand_strided((128, 264, 7, 7), (12936, 49, 7, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((1, 264, 1, 1), (264, 1, 264, 264), device='cuda:0', dtype=torch.float32)
arg_2 = rand_strided((1, 264, 1, 1), (264, 1, 264, 264), device='cuda:0', dtype=torch.float32)
arg_3 = rand_strided((264,), (1,), device='cuda:0', dtype=torch.float32)
arg_4 = rand_strided((264,), (1,), device='cuda:0', dtype=torch.float32)
arg_5 = rand_strided((128, 264, 7, 7), (12936, 49, 7, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__native_batch_norm_legit_functional_200.run(*args, 1655808, grid=grid(1655808), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__native_batch_norm_legit_functional_200.benchmark_all_configs(*args, 1655808, grid=grid(1655808))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
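# Pointwise half of the batch norm: kernel 199 above produced the per-channel
# statistics; this kernel applies (x - mean) * rstd * weight + bias
# elementwise and casts back to fp16. A minimal eager sketch (helper name
# assumed; mean and var_sum as the (1, 264, 1, 1) fp32 buffers from get_args):
def _eager_sketch_bn_apply_200(x, mean, var_sum, weight, bias, eps=1e-05):
    # x: (128, 264, 7, 7) fp16; weight, bias: (264,) fp32
    rstd = torch.rsqrt(var_sum / 6272.0 + eps)
    return ((x.float() - mean) * rstd * weight[None, :, None, None]
            + bias[None, :, None, None]).half()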
# kernel path: /tmp/torchinductor_shunting/7o/c7ocvzt6vmjqamqpyhoskmyrik2ig24uhlssffiqbirrszrdop2r.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_328
triton_poi_fused__to_copy_201 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[524288], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_201(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 418176
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((1584, 264, 1, 1), (264, 1, 1, 1), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((1584, 264, 1, 1), (264, 1, 1, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_201.run(*args, 418176, grid=grid(418176), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_201.benchmark_all_configs(*args, 418176, grid=grid(418176))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/uo/cuocaty2j6iusx5tak7blhjvcj7z7suv537pcyhwgk3qfuh7te7r.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => add_252, add_253, add_254, convert_element_type_329, mul_389, mul_390, mul_391, mul_392, mul_393, rsqrt_48, squeeze_145, var_mean_48
triton_red_fused__native_batch_norm_legit_functional_202 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@reduction(
size_hints=[2048, 8192],
reduction_hint=ReductionHint.INNER,
filename=__file__,
meta={'signature': {0: '*fp32', 1: '*fp16', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: 'i32', 9: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_202(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, out_ptr0, out_ptr1, out_ptr2, out_ptr3, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
xnumel = 1584
rnumel = 6272
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
xmask = xindex < xnumel
rbase = tl.arange(0, RBLOCK)[None, :]
x0 = xindex
_tmp2 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
for roffset in range(0, rnumel, RBLOCK):
rindex = roffset + rbase
rmask = rindex < rnumel
r1 = rindex % 49
r2 = (rindex // 49)
tmp0 = tl.load(in_ptr0 + (r1 + (49*x0) + (77616*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
_tmp2 = tl.where(rmask & xmask, _tmp2 + tmp1, _tmp2)
tmp2 = tl.sum(_tmp2, 1)[:, None]
tmp7 = tl.load(in_ptr1 + (x0), xmask)
tmp3 = 6272.0
tmp4 = tmp2 / tmp3
tmp5 = 0.1
tmp6 = tmp4 * tmp5
tmp8 = 0.9
tmp9 = tmp7 * tmp8
tmp10 = tmp6 + tmp9
tl.store(in_out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp4, xmask)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp10, xmask)
_tmp15 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
for roffset in range(0, rnumel, RBLOCK):
rindex = roffset + rbase
rmask = rindex < rnumel
r1 = rindex % 49
r2 = (rindex // 49)
tmp11 = tl.load(in_ptr0 + (r1 + (49*x0) + (77616*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp12 = tmp11.to(tl.float32)
tmp13 = tmp12 - tmp4
tmp14 = tmp13 * tmp13
_tmp15 = tl.where(rmask & xmask, _tmp15 + tmp14, _tmp15)
tmp15 = tl.sum(_tmp15, 1)[:, None]
tl.store(out_ptr1 + x0, tmp15, xmask)
tmp25 = tl.load(in_ptr2 + (x0), xmask)
tmp16 = 6272.0
tmp17 = tmp15 / tmp16
tmp18 = 1e-05
tmp19 = tmp17 + tmp18
tmp20 = tl.math.rsqrt(tmp19)
tmp21 = 1.0001594642002871
tmp22 = tmp17 * tmp21
tmp23 = 0.1
tmp24 = tmp22 * tmp23
tmp26 = 0.9
tmp27 = tmp25 * tmp26
tmp28 = tmp24 + tmp27
tl.store(out_ptr2 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp20, xmask)
tl.store(out_ptr3 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp28, xmask)
def get_args():
arg_0 = rand_strided((1, 1584, 1, 1), (1584, 1, 1584, 1584), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((128, 1584, 7, 7), (77616, 49, 7, 1), device='cuda:0', dtype=torch.float16)
arg_2 = rand_strided((1584,), (1,), device='cuda:0', dtype=torch.float32)
arg_3 = rand_strided((1584,), (1,), device='cuda:0', dtype=torch.float32)
arg_4 = rand_strided((1584,), (1,), device='cuda:0', dtype=torch.float32)
arg_5 = rand_strided((1, 1584, 1, 1), (1584, 1, 1584, 1584), device='cuda:0', dtype=torch.float32)
arg_6 = rand_strided((1584,), (1,), device='cuda:0', dtype=torch.float32)
arg_7 = rand_strided((1584,), (1,), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_red_fused__native_batch_norm_legit_functional_202.run(*args, 1584, 6272, grid=grid(1584), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_red_fused__native_batch_norm_legit_functional_202.benchmark_all_configs(*args, 1584, 6272, grid=grid(1584))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/d2/cd2goj7lmhtwh5scpm6a2hg5fjalovx2pxu4k6raurql7xnwmae5.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.add, aten.clone, aten.fill, aten.mul, aten.sigmoid, aten.sub
# aten._native_batch_norm_legit_functional => add_252, add_255, convert_element_type_329, convert_element_type_330, mul_388, mul_394, rsqrt_48, sub_48, var_mean_48
# aten.add => add_317
# aten.clone => clone_39
# aten.fill => full_like_8
# aten.mul => mul_587, mul_588
# aten.sigmoid => sigmoid_72
# aten.sub => sub_105
triton_poi_fused__native_batch_norm_legit_functional_add_clone_fill_mul_sigmoid_sub_203 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[16777216], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp16', 6: '*fp16', 7: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7), equal_to_1=())]})
@triton.jit
def triton_poi_fused__native_batch_norm_legit_functional_add_clone_fill_mul_sigmoid_sub_203(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, out_ptr0, out_ptr1, xnumel, XBLOCK : tl.constexpr):
xnumel = 9934848
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x3 = xindex
x1 = (xindex // 49) % 1584
tmp0 = tl.load(in_ptr0 + (x3), None).to(tl.float32)
tmp2 = tl.load(in_ptr1 + (x1), None)
tmp4 = tl.load(in_ptr2 + (x1), None)
tmp11 = tl.load(in_ptr3 + (x1), None)
tmp13 = tl.load(in_ptr4 + (x1), None)
tmp1 = tmp0.to(tl.float32)
tmp3 = tmp1 - tmp2
tmp5 = 6272.0
tmp6 = tmp4 / tmp5
tmp7 = 1e-05
tmp8 = tmp6 + tmp7
tmp9 = tl.math.rsqrt(tmp8)
tmp10 = tmp3 * tmp9
tmp12 = tmp10 * tmp11
tmp14 = tmp12 + tmp13
tmp15 = tmp14.to(tl.float32)
    tmp16 = tl.sigmoid(tmp15)
    tmp17 = 1.0
    tmp18 = tmp17 - tmp16
    tmp19 = tmp15 * tmp18
    tmp20 = tmp19 + tmp17
    tmp21 = tmp16 * tmp20  # silu'(y) = sigmoid(y) * (1 + y * (1 - sigmoid(y)))
tl.store(out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp15, None)
tl.store(out_ptr1 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp21, None)
def get_args():
arg_0 = rand_strided((128, 1584, 7, 7), (77616, 49, 7, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((1, 1584, 1, 1), (1584, 1, 1584, 1584), device='cuda:0', dtype=torch.float32)
arg_2 = rand_strided((1, 1584, 1, 1), (1584, 1, 1584, 1584), device='cuda:0', dtype=torch.float32)
arg_3 = rand_strided((1584,), (1,), device='cuda:0', dtype=torch.float32)
arg_4 = rand_strided((1584,), (1,), device='cuda:0', dtype=torch.float32)
arg_5 = rand_strided((128, 1584, 7, 7), (77616, 49, 7, 1), device='cuda:0', dtype=torch.float16)
arg_6 = rand_strided((128, 1584, 7, 7), (77616, 49, 7, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__native_batch_norm_legit_functional_add_clone_fill_mul_sigmoid_sub_203.run(*args, 9934848, grid=grid(9934848), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__native_batch_norm_legit_functional_add_clone_fill_mul_sigmoid_sub_203.benchmark_all_configs(*args, 9934848, grid=grid(9934848))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
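# Sketch of the kernel above (helper name assumed): besides the normalized
# fp16 output, it precomputes the SiLU derivative
# sigmoid(y) * (1 + y * (1 - sigmoid(y))) that the backward pass will reuse
# (hence the clone/fill/mul/sigmoid/sub ops in the original ATen list).
def _eager_sketch_bn_silu_grad_203(x, mean, var_sum, weight, bias, eps=1e-05):
    # x: (128, 1584, 7, 7) fp16; mean, var_sum: (1, 1584, 1, 1) fp32
    rstd = torch.rsqrt(var_sum / 6272.0 + eps)
    y = ((x.float() - mean) * rstd * weight[None, :, None, None]
         + bias[None, :, None, None]).half()
    s = torch.sigmoid(y)
    return y, s * (1 + y * (1 - s))  # y and d(silu)/dy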
# kernel path: /tmp/torchinductor_shunting/4x/c4xvll4q3rptmkfezospulueatrj4cvvts5fa6rf327imnsbrund.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_333
triton_poi_fused__to_copy_204 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4096], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_204(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 3564
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((396, 1, 3, 3), (9, 9, 3, 1), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((396, 1, 3, 3), (9, 9, 3, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_204.run(*args, 3564, grid=grid(3564), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_204.benchmark_all_configs(*args, 3564, grid=grid(3564))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/oo/coohdf37vqteghs4tdccuqlheosk6dnovjd7xhliv6jnuxrt5q6k.py
# Original ATen: aten.split_with_sizes
# aten.split_with_sizes => getitem_356
triton_poi_fused_split_with_sizes_205 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_split_with_sizes_205(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 2483712
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex % 19404
x1 = (xindex // 19404)
x2 = xindex
tmp0 = tl.load(in_ptr0 + (x0 + (77616*x1)), xmask).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
tmp2 = tl.sigmoid(tmp1)
tmp3 = tmp1 * tmp2
tmp4 = tmp3.to(tl.float32)
tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp4, xmask)
def get_args():
arg_0 = rand_strided((128, 1584, 7, 7), (77616, 49, 7, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 396, 7, 7), (19404, 49, 7, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_split_with_sizes_205.run(*args, 2483712, grid=grid(2483712), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_split_with_sizes_205.benchmark_all_configs(*args, 2483712, grid=grid(2483712))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/q2/cq2qbiau5hauewftu26n75rmwbfaqfklvxrkizh35mvxm2pcsrku.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_334
triton_poi_fused__to_copy_206 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[16384], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_206(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 9900
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((396, 1, 5, 5), (25, 25, 5, 1), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((396, 1, 5, 5), (25, 25, 5, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_206.run(*args, 9900, grid=grid(9900), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_206.benchmark_all_configs(*args, 9900, grid=grid(9900))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/zt/cztezqwtd34ik4tsx6erhha7cnfeljbbbwidrvnml2yi26gx6sbr.py
# Original ATen: aten.split_with_sizes
# aten.split_with_sizes => getitem_361
triton_poi_fused_split_with_sizes_207 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_split_with_sizes_207(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 2483712
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex % 19404
x1 = (xindex // 19404)
x2 = xindex
tmp0 = tl.load(in_ptr0 + (19404 + x0 + (77616*x1)), xmask).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
tmp2 = tl.sigmoid(tmp1)
tmp3 = tmp1 * tmp2
tmp4 = tmp3.to(tl.float32)
tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp4, xmask)
def get_args():
arg_0 = rand_strided((128, 1584, 7, 7), (77616, 49, 7, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 396, 7, 7), (19404, 49, 7, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_split_with_sizes_207.run(*args, 2483712, grid=grid(2483712), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_split_with_sizes_207.benchmark_all_configs(*args, 2483712, grid=grid(2483712))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/a5/ca5stuyejdo4icolodntofflni5tgdpo5ewbossduq7phpjybihc.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_335
triton_poi_fused__to_copy_208 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[32768], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_208(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 19404
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((396, 1, 7, 7), (49, 49, 7, 1), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((396, 1, 7, 7), (49, 49, 7, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_208.run(*args, 19404, grid=grid(19404), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_208.benchmark_all_configs(*args, 19404, grid=grid(19404))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
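# The _to_copy kernels in this stretch (208, 210, 215, 216, 218, 219, 224,
# 228) are straight dtype-conversion copies: Inductor materializes fp16
# versions of the fp32 parameters (here a 396x1x7x7 depthwise conv weight)
# for mixed-precision execution. The eager equivalent is just
#     weight_fp16 = weight_fp32.to(torch.float16)
# and the kernels differ only in the number of elements copied.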
# kernel path: /tmp/torchinductor_shunting/rz/crzmv3373ssmmao7rhii34dz6qyf2kesyjmask7sgbcayzblrpb7.py
# Original ATen: aten.split_with_sizes
# aten.split_with_sizes => getitem_366
triton_poi_fused_split_with_sizes_209 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_split_with_sizes_209(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 2483712
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex % 19404
x1 = (xindex // 19404)
x2 = xindex
tmp0 = tl.load(in_ptr0 + (38808 + x0 + (77616*x1)), xmask).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
tmp2 = tl.sigmoid(tmp1)
tmp3 = tmp1 * tmp2
tmp4 = tmp3.to(tl.float32)
tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp4, xmask)
def get_args():
arg_0 = rand_strided((128, 1584, 7, 7), (77616, 49, 7, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 396, 7, 7), (19404, 49, 7, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_split_with_sizes_209.run(*args, 2483712, grid=grid(2483712), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_split_with_sizes_209.benchmark_all_configs(*args, 2483712, grid=grid(2483712))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/ua/cuahjxhhqgvs2pfawkys4ycn4cv26634v6swah3d6itp3ay5rix6.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_336
triton_poi_fused__to_copy_210 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[32768], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_210(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 32076
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((396, 1, 9, 9), (81, 81, 9, 1), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((396, 1, 9, 9), (81, 81, 9, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_210.run(*args, 32076, grid=grid(32076), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_210.benchmark_all_configs(*args, 32076, grid=grid(32076))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/v7/cv7bjv2cw2xs5mbq2x7bddhtciqwe66f4efi6r3hekyvffle4el4.py
# Original ATen: aten.split_with_sizes
# aten.split_with_sizes => getitem_371
triton_poi_fused_split_with_sizes_211 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_split_with_sizes_211(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 2483712
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex % 19404
x1 = (xindex // 19404)
x2 = xindex
tmp0 = tl.load(in_ptr0 + (58212 + x0 + (77616*x1)), xmask).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
tmp2 = tl.sigmoid(tmp1)
tmp3 = tmp1 * tmp2
tmp4 = tmp3.to(tl.float32)
tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp4, xmask)
def get_args():
arg_0 = rand_strided((128, 1584, 7, 7), (77616, 49, 7, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 396, 7, 7), (19404, 49, 7, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_split_with_sizes_211.run(*args, 2483712, grid=grid(2483712), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_split_with_sizes_211.benchmark_all_configs(*args, 2483712, grid=grid(2483712))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/n2/cn2whdfczqvp7tj52zmht7aewohn4yzq7pmnqwutp7hyu2nidelg.py
# Original ATen: aten.cat
# aten.cat => cat_35
triton_poi_fused_cat_212 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_cat_212(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 2483712
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x2 = xindex
x0 = xindex % 19404
x1 = (xindex // 19404)
tmp0 = tl.load(in_ptr0 + (x2), xmask).to(tl.float32)
tl.store(out_ptr0 + (x0 + (77616*x1) + tl.zeros([XBLOCK], tl.int32)), tmp0, xmask)
def get_args():
arg_0 = rand_strided((128, 396, 7, 7), (19404, 49, 7, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 396, 7, 7), (77616, 49, 7, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_cat_212.run(*args, 2483712, grid=grid(2483712), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_cat_212.benchmark_all_configs(*args, 2483712, grid=grid(2483712))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
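# aten.cat is lowered as a strided copy per input: the (128, 396, 7, 7)
# chunk is written into a 396-channel view of the (128, 1584, 7, 7)
# destination (note arg_1's batch stride of 77616); the caller passes views
# at different channel offsets, so this body is shared across chunks.
# Kernel 213 below is byte-identical except for its divisible_by_16 hint:
# a view offset by 19404 fp16 elements (38808 bytes) is not 16-byte aligned,
# which is presumably why a second variant without the aligned-output
# assumption was compiled. Eager sketch of what cat_35 assembles
# (helper name ours):
import torch

def _cat_reference(chunks):
    # chunks: four (N, 396, 7, 7) tensors -> (N, 1584, 7, 7)
    return torch.cat(chunks, dim=1)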
# kernel path: /tmp/torchinductor_shunting/rs/crsyf734jiflnmzza27lk4h5tk3zflzcv6g64yl5p4sitpmqg5lb.py
# Original ATen: aten.cat
# aten.cat => cat_35
triton_poi_fused_cat_213 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_cat_213(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 2483712
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x2 = xindex
x0 = xindex % 19404
x1 = (xindex // 19404)
tmp0 = tl.load(in_ptr0 + (x2), xmask).to(tl.float32)
tl.store(out_ptr0 + (x0 + (77616*x1) + tl.zeros([XBLOCK], tl.int32)), tmp0, xmask)
def get_args():
arg_0 = rand_strided((128, 396, 7, 7), (19404, 49, 7, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 396, 7, 7), (77616, 49, 7, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_cat_213.run(*args, 2483712, grid=grid(2483712), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_cat_213.benchmark_all_configs(*args, 2483712, grid=grid(2483712))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/5w/c5w3sqhxbgvomykhq2gsbk2tldu7xewurtz7sp2eii33b3skhwth.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.mean, aten.silu
# aten._native_batch_norm_legit_functional => add_257, add_260, convert_element_type_337, convert_element_type_338, mul_396, mul_402, rsqrt_49, sub_49, var_mean_49
# aten.mean => mean_13
# aten.silu => convert_element_type_339, convert_element_type_340, mul_403, sigmoid_53
triton_per_fused__native_batch_norm_legit_functional_mean_silu_214 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import persistent_reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@persistent_reduction(
size_hints=[262144, 64],
reduction_hint=ReductionHint.INNER,
filename=__file__,
meta={'signature': {0: '*fp16', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp16', 6: '*fp16', 7: 'i32', 8: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7), equal_to_1=())]}
)
@triton.jit
def triton_per_fused__native_batch_norm_legit_functional_mean_silu_214(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, out_ptr0, out_ptr2, xnumel, rnumel, XBLOCK : tl.constexpr):
xnumel = 202752
rnumel = 49
RBLOCK: tl.constexpr = 64
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
xmask = xindex < xnumel
rindex = tl.arange(0, RBLOCK)[None, :]
rmask = rindex < rnumel
r2 = rindex
x3 = xindex
x0 = xindex % 1584
tmp0 = tl.load(in_ptr0 + (r2 + (49*x3)), rmask, other=0).to(tl.float32)
tmp2 = tl.load(in_ptr1 + (x0), None)
tmp4 = tl.load(in_ptr2 + (x0), None)
tmp11 = tl.load(in_ptr3 + (x0), None)
tmp13 = tl.load(in_ptr4 + (x0), None)
tmp1 = tmp0.to(tl.float32)
tmp3 = tmp1 - tmp2
tmp5 = 6272.0
tmp6 = tmp4 / tmp5
tmp7 = 1e-05
tmp8 = tmp6 + tmp7
tmp9 = tl.math.rsqrt(tmp8)
tmp10 = tmp3 * tmp9
tmp12 = tmp10 * tmp11
tmp14 = tmp12 + tmp13
tmp15 = tmp14.to(tl.float32)
tmp16 = tmp15.to(tl.float32)
tmp17 = tl.sigmoid(tmp16)
tmp18 = tmp16 * tmp17
tmp19 = tmp18.to(tl.float32)
tmp20 = tmp19.to(tl.float32)
tmp22 = tl.where(rmask, tmp20, 0)
tmp23 = tl.sum(tmp22, 1)[:, None]
tmp24 = 49.0
tmp25 = tmp23 / tmp24
tmp26 = tmp25.to(tl.float32)
tl.store(out_ptr0 + (r2 + (49*x3) + tl.zeros([XBLOCK, RBLOCK], tl.int32)), tmp15, rmask)
tl.store(out_ptr2 + (x3 + tl.zeros([XBLOCK, 1], tl.int32)), tmp26, None)
def get_args():
arg_0 = rand_strided((128, 1584, 7, 7), (77616, 49, 7, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((1, 1584, 1, 1), (1584, 1, 1584, 1584), device='cuda:0', dtype=torch.float32)
arg_2 = rand_strided((1, 1584, 1, 1), (1584, 1, 1584, 1584), device='cuda:0', dtype=torch.float32)
arg_3 = rand_strided((1584,), (1,), device='cuda:0', dtype=torch.float32)
arg_4 = rand_strided((1584,), (1,), device='cuda:0', dtype=torch.float32)
arg_5 = rand_strided((128, 1584, 7, 7), (77616, 49, 7, 1), device='cuda:0', dtype=torch.float16)
arg_6 = rand_strided((128, 1584, 1, 1), (1584, 1, 1, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_per_fused__native_batch_norm_legit_functional_mean_silu_214.run(*args, 202752, 49, grid=grid(202752), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_per_fused__native_batch_norm_legit_functional_mean_silu_214.benchmark_all_configs(*args, 202752, 49, grid=grid(202752))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
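# This persistent reduction fuses three ops over the (128, 1584, 7, 7)
# activation: apply batch norm from precomputed per-channel stats (in_ptr1 is
# the mean, in_ptr2 the sum of squared deviations, divided by 6272 = 128*7*7
# in the kernel), apply SiLU, and average the 49 spatial positions per
# (batch, channel) for the squeeze-and-excite block that follows. The fp16 BN
# output (pre-SiLU) is also kept for reuse. A hedged eager sketch (names and
# argument conventions ours):
import torch

def _bn_silu_mean_reference(x, mean, var_sum, weight, bias, eps=1e-5):
    # x: (N, C, H, W) fp16; mean: (1, C, 1, 1); var_sum: per-channel sum of
    # squared deviations; weight/bias: (C,) affine parameters.
    n = x.shape[0] * x.shape[2] * x.shape[3]            # 6272 here
    inv_std = torch.rsqrt(var_sum.view(1, -1, 1, 1) / n + eps)
    y = (x.float() - mean.view(1, -1, 1, 1)) * inv_std
    y = y * weight.view(1, -1, 1, 1) + bias.view(1, -1, 1, 1)
    y16 = y.to(x.dtype)                                 # stored to out_ptr0
    s32 = y16.float()
    pooled = (s32 * torch.sigmoid(s32)).mean(dim=(2, 3))  # SiLU, then mean
    return y16, pooled.to(x.dtype)                      # out_ptr0, out_ptr2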
# kernel path: /tmp/torchinductor_shunting/73/c73f3yeyp4owv2uj77f6pdjw5heox2q6yo4kfkrqenvhw4yqhomk.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_342
triton_poi_fused__to_copy_215 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[262144], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_215(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 209088
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((132, 1584, 1, 1), (1584, 1, 1, 1), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((132, 1584, 1, 1), (1584, 1, 1, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_215.run(*args, 209088, grid=grid(209088), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_215.benchmark_all_configs(*args, 209088, grid=grid(209088))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/ca/ccagnt2eck6mlzrforr72k455qyt2o7bmlcxrdko6fbwlxsyixff.py
# Original ATen: aten._to_copy, aten.convolution
# aten._to_copy => convert_element_type_341
# aten.convolution => convolution_132
triton_poi_fused__to_copy_convolution_216 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[256], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_convolution_216(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 132
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((132,), (1,), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((132,), (1,), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_convolution_216.run(*args, 132, grid=grid(132), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_convolution_216.benchmark_all_configs(*args, 132, grid=grid(132))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/es/ces42l67kcrpgvq2z54f4gf5my7z5wnxr2mun5sm744xvtl7qmr3.py
# Original ATen: aten._to_copy, aten.convolution, aten.silu
# aten._to_copy => convert_element_type_341
# aten.convolution => convolution_132
# aten.silu => convert_element_type_343, convert_element_type_344, mul_404, sigmoid_54
triton_poi_fused__to_copy_convolution_silu_217 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[32768], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_convolution_silu_217(in_out_ptr0, in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 16896
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x2 = xindex
x0 = xindex % 132
tmp0 = tl.load(in_out_ptr0 + (x2), xmask).to(tl.float32)
tmp1 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
tmp2 = tmp0 + tmp1
tmp3 = tmp2.to(tl.float32)
tmp4 = tl.sigmoid(tmp3)
tmp5 = tmp3 * tmp4
tmp6 = tmp5.to(tl.float32)
tl.store(in_out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp2, xmask)
tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp6, xmask)
def get_args():
arg_0 = rand_strided((128, 132, 1, 1), (132, 1, 1, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((132,), (1,), device='cuda:0', dtype=torch.float16)
arg_2 = rand_strided((128, 132, 1, 1), (132, 1, 1, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1, arg_2,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_convolution_silu_217.run(*args, 16896, grid=grid(16896), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_convolution_silu_217.benchmark_all_configs(*args, 16896, grid=grid(16896))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
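# Kernels 215-217 implement the "squeeze" half of a squeeze-and-excite
# block: cast the 132x1584 1x1 reduce-conv weight and its bias to fp16,
# run the convolution itself externally, then fuse the bias add with SiLU
# here. in_out_ptr0 is updated in place with conv+bias (kept for the
# backward pass) while out_ptr0 receives SiLU(conv+bias). A sketch under
# the assumption that the extern conv ran without bias (helper name ours):
import torch

def _se_squeeze_reference(pooled, weight, bias):
    # pooled: (N, 1584, 1, 1) fp16; weight: (132, 1584, 1, 1); bias: (132,)
    y = torch.nn.functional.conv2d(pooled, weight, bias)
    y32 = y.float()
    return y, (y32 * torch.sigmoid(y32)).to(y.dtype)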
# kernel path: /tmp/torchinductor_shunting/xv/cxvmi6ffjgu4jnqxjbleamncx3jw3a5x6bteybr2qp7fyy4to7ad.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_346
triton_poi_fused__to_copy_218 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[262144], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_218(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 209088
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((1584, 132, 1, 1), (132, 1, 1, 1), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((1584, 132, 1, 1), (132, 1, 1, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_218.run(*args, 209088, grid=grid(209088), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_218.benchmark_all_configs(*args, 209088, grid=grid(209088))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/xn/cxnwubuxft4fv4egkcmiwmgz7afk4w2atiiqg6oyapkchhvtktlp.py
# Original ATen: aten._to_copy, aten.convolution
# aten._to_copy => convert_element_type_345
# aten.convolution => convolution_133
triton_poi_fused__to_copy_convolution_219 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[2048], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_convolution_219(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 1584
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((1584,), (1,), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((1584,), (1,), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_convolution_219.run(*args, 1584, grid=grid(1584), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_convolution_219.benchmark_all_configs(*args, 1584, grid=grid(1584))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/zi/cziku3wkydhqpprjhx3spm364d6fncgicrpenyhycjc6uhdvng7k.py
# Original ATen: aten._to_copy, aten.convolution
# aten._to_copy => convert_element_type_345
# aten.convolution => convolution_133
triton_poi_fused__to_copy_convolution_220 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[262144], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_convolution_220(in_out_ptr0, in_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 202752
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x2 = xindex
x0 = xindex % 1584
tmp0 = tl.load(in_out_ptr0 + (x2), None).to(tl.float32)
tmp1 = tl.load(in_ptr0 + (x0), None).to(tl.float32)
tmp2 = tmp0 + tmp1
tl.store(in_out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp2, None)
def get_args():
arg_0 = rand_strided((128, 1584, 1, 1), (1584, 1, 1, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((1584,), (1,), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_convolution_220.run(*args, 202752, grid=grid(202752), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_convolution_220.benchmark_all_configs(*args, 202752, grid=grid(202752))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/qo/cqo4la4ss4kx6zb5vv53wk2l4cvgi273m7sogbmwaw3v62ktkznm.py
# Original ATen: aten.mul, aten.sigmoid, aten.silu
# aten.mul => mul_405
# aten.sigmoid => sigmoid_55
# aten.silu => convert_element_type_339, convert_element_type_340, mul_403, sigmoid_53
triton_poi_fused_mul_sigmoid_silu_221 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[16777216], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3), equal_to_1=())]})
@triton.jit
def triton_poi_fused_mul_sigmoid_silu_221(in_ptr0, in_ptr1, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 9934848
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x2 = xindex
x1 = (xindex // 49)
tmp0 = tl.load(in_ptr0 + (x2), None).to(tl.float32)
tmp5 = tl.load(in_ptr1 + (x1), None).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
tmp2 = tl.sigmoid(tmp1)
tmp3 = tmp1 * tmp2
tmp4 = tmp3.to(tl.float32)
tmp6 = tl.sigmoid(tmp5)
tmp7 = tmp4 * tmp6
tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp7, None)
def get_args():
arg_0 = rand_strided((128, 1584, 7, 7), (77616, 49, 7, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 1584, 1, 1), (1584, 1, 1, 1), device='cuda:0', dtype=torch.float16)
arg_2 = rand_strided((128, 1584, 7, 7), (77616, 49, 7, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1, arg_2,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_mul_sigmoid_silu_221.run(*args, 9934848, grid=grid(9934848), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_mul_sigmoid_silu_221.benchmark_all_configs(*args, 9934848, grid=grid(9934848))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
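# The "excite" half of the squeeze-and-excite block: kernels 218-220 cast the
# 1584x132 expand-conv weight and bias and add the bias in place; this kernel
# then applies sigmoid to the per-channel gate (in_ptr1, (N, 1584, 1, 1)) and
# multiplies it into SiLU(x) at every spatial position (x1 = index // 49
# selects the (batch, channel) pair). Note the SiLU of the BN output is
# recomputed here (the same sigmoid_53/mul_403 nodes as in kernel 214) rather
# than materialized, trading a little arithmetic for a large buffer write.
# Eager sketch (helper name ours):
import torch

def _se_scale_reference(x, gate):
    x32 = x.float()
    silu_x = (x32 * torch.sigmoid(x32)).to(x.dtype)   # recomputed SiLU
    return silu_x * torch.sigmoid(gate)               # broadcast over H, W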
# kernel path: /tmp/torchinductor_shunting/g3/cg37tw75k3qhu23v54dt4szux7ntig2ituyeprsnltkm3nfiarqp.py
# Original ATen: aten.split_with_sizes
# aten.split_with_sizes => getitem_374
triton_poi_fused_split_with_sizes_222 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[8388608], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_split_with_sizes_222(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 4967424
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex % 38808
x1 = (xindex // 38808)
x2 = xindex
tmp0 = tl.load(in_ptr0 + (x0 + (77616*x1)), xmask).to(tl.float32)
tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp0, xmask)
def get_args():
arg_0 = rand_strided((128, 1584, 7, 7), (77616, 49, 7, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 792, 7, 7), (38808, 49, 7, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_split_with_sizes_222.run(*args, 4967424, grid=grid(4967424), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_split_with_sizes_222.benchmark_all_configs(*args, 4967424, grid=grid(4967424))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/qk/cqk2scd5kzvelb4wc76gqtuwya5pxfoqhhpq26ga2dggkjyf6t72.py
# Original ATen: aten.split_with_sizes
# aten.split_with_sizes => getitem_375
triton_poi_fused_split_with_sizes_223 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[8388608], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_split_with_sizes_223(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 4967424
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex % 38808
x1 = (xindex // 38808)
x2 = xindex
tmp0 = tl.load(in_ptr0 + (38808 + x0 + (77616*x1)), xmask).to(tl.float32)
tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp0, xmask)
def get_args():
arg_0 = rand_strided((128, 1584, 7, 7), (77616, 49, 7, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 792, 7, 7), (38808, 49, 7, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_split_with_sizes_223.run(*args, 4967424, grid=grid(4967424), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_split_with_sizes_223.benchmark_all_configs(*args, 4967424, grid=grid(4967424))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
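# Kernels 222 and 223 are the plain two-way split of the 1584 channels into
# 792-channel halves (input offsets 0 and 38808 = 792*49), apparently feeding
# two parallel 792->132 1x1 projection convolutions: see the 132x792 weight
# cast in _to_copy_224 below and the 264-channel concatenation after it.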
# kernel path: /tmp/torchinductor_shunting/5z/c5zsdbhynt6xdafkaydaa3qemqk6v4xjjheiwafzrfldvlj3ihqh.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_347
triton_poi_fused__to_copy_224 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[131072], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_224(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 104544
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((132, 792, 1, 1), (792, 1, 1, 1), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((132, 792, 1, 1), (792, 1, 1, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_224.run(*args, 104544, grid=grid(104544), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_224.benchmark_all_configs(*args, 104544, grid=grid(104544))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/ca/ccakgu22ynocwwbf5iwb4um7n6tcg3nlbhmehr4k36wdy4nrskss.py
# Original ATen: aten.cat
# aten.cat => cat_36
triton_poi_fused_cat_225 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[1048576], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_cat_225(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 827904
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x2 = xindex
x0 = xindex % 6468
x1 = (xindex // 6468)
tmp0 = tl.load(in_ptr0 + (x2), xmask).to(tl.float32)
tl.store(out_ptr0 + (x0 + (12936*x1) + tl.zeros([XBLOCK], tl.int32)), tmp0, xmask)
def get_args():
arg_0 = rand_strided((128, 132, 7, 7), (6468, 49, 7, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 132, 7, 7), (12936, 49, 7, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_cat_225.run(*args, 827904, grid=grid(827904), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_cat_225.benchmark_all_configs(*args, 827904, grid=grid(827904))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/pp/cpprz7zopeigvjazcl7gems4pi2vh33qfi655qdlyuddxr7rvwr3.py
# Original ATen: aten.cat
# aten.cat => cat_36
triton_poi_fused_cat_226 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[1048576], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_cat_226(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 827904
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x2 = xindex
x0 = xindex % 6468
x1 = (xindex // 6468)
tmp0 = tl.load(in_ptr0 + (x2), xmask).to(tl.float32)
tl.store(out_ptr0 + (x0 + (12936*x1) + tl.zeros([XBLOCK], tl.int32)), tmp0, xmask)
def get_args():
arg_0 = rand_strided((128, 132, 7, 7), (6468, 49, 7, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 132, 7, 7), (12936, 49, 7, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_cat_226.run(*args, 827904, grid=grid(827904), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_cat_226.benchmark_all_configs(*args, 827904, grid=grid(827904))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/ws/cwsi7gb6vfnxgobitrcoooobibx3jugdjfpqbmhity4caskyzyyn.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.add
# aten._native_batch_norm_legit_functional => add_262, add_265, convert_element_type_349, convert_element_type_350, mul_406, mul_412, rsqrt_50, sub_50, var_mean_50
# aten.add => add_266
triton_poi_fused__native_batch_norm_legit_functional_add_227 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[2097152], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp16', 6: '*fp16', 7: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7), equal_to_1=())]})
@triton.jit
def triton_poi_fused__native_batch_norm_legit_functional_add_227(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 1655808
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x3 = xindex
x1 = (xindex // 49) % 264
tmp0 = tl.load(in_ptr0 + (x3), xmask).to(tl.float32)
tmp2 = tl.load(in_ptr1 + (x1), xmask)
tmp4 = tl.load(in_ptr2 + (x1), xmask)
tmp11 = tl.load(in_ptr3 + (x1), xmask)
tmp13 = tl.load(in_ptr4 + (x1), xmask)
tmp16 = tl.load(in_ptr5 + (x3), xmask).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
tmp3 = tmp1 - tmp2
tmp5 = 6272.0
tmp6 = tmp4 / tmp5
tmp7 = 1e-05
tmp8 = tmp6 + tmp7
tmp9 = tl.math.rsqrt(tmp8)
tmp10 = tmp3 * tmp9
tmp12 = tmp10 * tmp11
tmp14 = tmp12 + tmp13
tmp15 = tmp14.to(tl.float32)
tmp17 = tmp15 + tmp16
tl.store(out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp17, xmask)
def get_args():
arg_0 = rand_strided((128, 264, 7, 7), (12936, 49, 7, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((1, 264, 1, 1), (264, 1, 264, 264), device='cuda:0', dtype=torch.float32)
arg_2 = rand_strided((1, 264, 1, 1), (264, 1, 264, 264), device='cuda:0', dtype=torch.float32)
arg_3 = rand_strided((264,), (1,), device='cuda:0', dtype=torch.float32)
arg_4 = rand_strided((264,), (1,), device='cuda:0', dtype=torch.float32)
arg_5 = rand_strided((128, 264, 7, 7), (12936, 49, 7, 1), device='cuda:0', dtype=torch.float16)
arg_6 = rand_strided((128, 264, 7, 7), (12936, 49, 7, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__native_batch_norm_legit_functional_add_227.run(*args, 1655808, grid=grid(1655808), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__native_batch_norm_legit_functional_add_227.benchmark_all_configs(*args, 1655808, grid=grid(1655808))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
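# Batch norm on the concatenated 264-channel projection output, fused with
# the residual add (in_ptr5 is the skip connection from the previous block,
# added after the affine transform). Eager sketch (helper name ours):
import torch

def _bn_add_reference(x, mean, var_sum, weight, bias, residual, eps=1e-5):
    n = x.shape[0] * x.shape[2] * x.shape[3]            # 6272 here
    inv_std = torch.rsqrt(var_sum.view(1, -1, 1, 1) / n + eps)
    y = (x.float() - mean.view(1, -1, 1, 1)) * inv_std
    y = y * weight.view(1, -1, 1, 1) + bias.view(1, -1, 1, 1)
    return y.to(x.dtype) + residual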
# kernel path: /tmp/torchinductor_shunting/7p/c7pjks5p4ph6nsw372btix32wdljqepkkdc72dgxj6ycvs5rdkcd.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_397
triton_poi_fused__to_copy_228 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[524288], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_228(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 405504
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), None)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, None)
def get_args():
arg_0 = rand_strided((1536, 264, 1, 1), (264, 1, 1, 1), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((1536, 264, 1, 1), (264, 1, 1, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_228.run(*args, 405504, grid=grid(405504), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_228.benchmark_all_configs(*args, 405504, grid=grid(405504))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/om/com4odhqbzh6wdohnvxyqvr7ulmhf2d2ytjjg22jyjf67zj3ocbo.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => add_300, add_301, add_302, convert_element_type_398, mul_464, mul_465, mul_466, mul_467, mul_468, rsqrt_57, squeeze_172, var_mean_57
triton_red_fused__native_batch_norm_legit_functional_229 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@reduction(
size_hints=[2048, 8192],
reduction_hint=ReductionHint.INNER,
filename=__file__,
meta={'signature': {0: '*fp32', 1: '*fp16', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: 'i32', 9: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_229(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, out_ptr0, out_ptr1, out_ptr2, out_ptr3, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
xnumel = 1536
rnumel = 6272
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
xmask = xindex < xnumel
rbase = tl.arange(0, RBLOCK)[None, :]
x0 = xindex
_tmp2 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
for roffset in range(0, rnumel, RBLOCK):
rindex = roffset + rbase
rmask = rindex < rnumel
r1 = rindex % 49
r2 = (rindex // 49)
tmp0 = tl.load(in_ptr0 + (r1 + (49*x0) + (75264*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
_tmp2 = tl.where(rmask & xmask, _tmp2 + tmp1, _tmp2)
tmp2 = tl.sum(_tmp2, 1)[:, None]
tmp7 = tl.load(in_ptr1 + (x0), xmask)
tmp3 = 6272.0
tmp4 = tmp2 / tmp3
tmp5 = 0.1
tmp6 = tmp4 * tmp5
tmp8 = 0.9
tmp9 = tmp7 * tmp8
tmp10 = tmp6 + tmp9
tl.store(in_out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp4, xmask)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp10, xmask)
_tmp15 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
for roffset in range(0, rnumel, RBLOCK):
rindex = roffset + rbase
rmask = rindex < rnumel
r1 = rindex % 49
r2 = (rindex // 49)
tmp11 = tl.load(in_ptr0 + (r1 + (49*x0) + (75264*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp12 = tmp11.to(tl.float32)
tmp13 = tmp12 - tmp4
tmp14 = tmp13 * tmp13
_tmp15 = tl.where(rmask & xmask, _tmp15 + tmp14, _tmp15)
tmp15 = tl.sum(_tmp15, 1)[:, None]
tl.store(out_ptr1 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp15, xmask)
tmp25 = tl.load(in_ptr2 + (x0), xmask)
tmp16 = 6272.0
tmp17 = tmp15 / tmp16
tmp18 = 1e-05
tmp19 = tmp17 + tmp18
tmp20 = tl.math.rsqrt(tmp19)
tmp21 = 1.0001594642002871
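# 1.0001594642002871 = 6272/6271 = n/(n-1): Bessel correction, so the running_var update below stores the unbiased variance estimate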
tmp22 = tmp17 * tmp21
tmp23 = 0.1
tmp24 = tmp22 * tmp23
tmp26 = 0.9
tmp27 = tmp25 * tmp26
tmp28 = tmp24 + tmp27
tl.store(out_ptr2 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp20, xmask)
tl.store(out_ptr3 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp28, xmask)
def get_args():
arg_0 = rand_strided((1, 1536, 1, 1), (1536, 1, 1536, 1536), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((128, 1536, 7, 7), (75264, 49, 7, 1), device='cuda:0', dtype=torch.float16)
arg_2 = rand_strided((1536,), (1,), device='cuda:0', dtype=torch.float32)
arg_3 = rand_strided((1536,), (1,), device='cuda:0', dtype=torch.float32)
arg_4 = rand_strided((1536,), (1,), device='cuda:0', dtype=torch.float32)
arg_5 = rand_strided((1, 1536, 1, 1), (1536, 1, 1536, 1536), device='cuda:0', dtype=torch.float32)
arg_6 = rand_strided((1536,), (1,), device='cuda:0', dtype=torch.float32)
arg_7 = rand_strided((1536,), (1,), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_red_fused__native_batch_norm_legit_functional_229.run(*args, 1536, 6272, grid=grid(1536), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_red_fused__native_batch_norm_legit_functional_229.benchmark_all_configs(*args, 1536, 6272, grid=grid(1536))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
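# Two-pass batch norm statistics for the 1536-channel head conv: the first
# loop accumulates the per-channel mean, the second the sum of squared
# deviations; running stats are then updated with momentum 0.1. The kernel
# emits the mean (in_out_ptr0), the raw deviation sum (out_ptr1, consumed by
# the fused BN kernels), rsqrt(var + eps) (out_ptr2), and the two
# running-stat updates. A hedged eager sketch (names ours):
import torch

def _bn_stats_reference(x, running_mean, running_var, momentum=0.1, eps=1e-5):
    n = x.shape[0] * x.shape[2] * x.shape[3]                  # 6272 here
    mean = x.float().mean(dim=(0, 2, 3))                      # pass 1
    var_sum = ((x.float() - mean.view(1, -1, 1, 1)) ** 2).sum(dim=(0, 2, 3))
    var = var_sum / n                                         # biased, pass 2
    inv_std = torch.rsqrt(var + eps)
    new_running_mean = momentum * mean + (1 - momentum) * running_mean
    new_running_var = momentum * var * (n / (n - 1)) + (1 - momentum) * running_var
    return mean, var_sum, inv_std, new_running_mean, new_running_var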
# kernel path: /tmp/torchinductor_shunting/wa/cwaeb6f5uogsexjap772fuwlvl2vebz5e54hbg5l3px6d7azdjsm.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.mean, aten.relu, aten.threshold_backward, aten.view
# aten._native_batch_norm_legit_functional => add_300, add_303, convert_element_type_398, convert_element_type_399, mul_463, mul_469, rsqrt_57, sub_57, var_mean_57
# aten.mean => mean_16
# aten.relu => relu_6
# aten.threshold_backward => le
# aten.view => view
triton_per_fused__native_batch_norm_legit_functional_mean_relu_threshold_backward_view_230 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import persistent_reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@persistent_reduction(
size_hints=[262144, 64],
reduction_hint=ReductionHint.INNER,
filename=__file__,
meta={'signature': {0: '*fp16', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*i1', 6: '*fp16', 7: 'i32', 8: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7), equal_to_1=())]}
)
@triton.jit
def triton_per_fused__native_batch_norm_legit_functional_mean_relu_threshold_backward_view_230(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, out_ptr1, out_ptr3, xnumel, rnumel, XBLOCK : tl.constexpr):
xnumel = 196608
rnumel = 49
RBLOCK: tl.constexpr = 64
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
xmask = xindex < xnumel
rindex = tl.arange(0, RBLOCK)[None, :]
rmask = rindex < rnumel
r2 = rindex
x3 = xindex
x0 = xindex % 1536
tmp0 = tl.load(in_ptr0 + (r2 + (49*x3)), rmask, other=0).to(tl.float32)
tmp2 = tl.load(in_ptr1 + (x0), None)
tmp4 = tl.load(in_ptr2 + (x0), None)
tmp11 = tl.load(in_ptr3 + (x0), None)
tmp13 = tl.load(in_ptr4 + (x0), None)
tmp1 = tmp0.to(tl.float32)
tmp3 = tmp1 - tmp2
tmp5 = 6272.0
tmp6 = tmp4 / tmp5
tmp7 = 1e-05
tmp8 = tmp6 + tmp7
tmp9 = tl.math.rsqrt(tmp8)
tmp10 = tmp3 * tmp9
tmp12 = tmp10 * tmp11
tmp14 = tmp12 + tmp13
tmp15 = tmp14.to(tl.float32)
tmp16 = tl.where(0 != 0, 0, tl.where(0 > tmp15, 0, tmp15))
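    # the nested tl.where above computes max(tmp15, 0): the outer condition
    # 0 != 0 is always False, so this is inductor's generated relu idiom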
tmp17 = 0.0
tmp18 = tmp16 <= tmp17
tmp19 = tmp16.to(tl.float32)
tmp21 = tl.where(rmask, tmp19, 0)
tmp22 = tl.sum(tmp21, 1)[:, None]
tmp23 = 49.0
tmp24 = tmp22 / tmp23
tmp25 = tmp24.to(tl.float32)
tl.store(out_ptr1 + (r2 + (49*x3) + tl.zeros([XBLOCK, RBLOCK], tl.int32)), tmp18, rmask)
tl.store(out_ptr3 + (x3 + tl.zeros([XBLOCK, 1], tl.int32)), tmp25, None)
def get_args():
arg_0 = rand_strided((128, 1536, 7, 7), (75264, 49, 7, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((1, 1536, 1, 1), (1536, 1, 1536, 1536), device='cuda:0', dtype=torch.float32)
arg_2 = rand_strided((1, 1536, 1, 1), (1536, 1, 1536, 1536), device='cuda:0', dtype=torch.float32)
arg_3 = rand_strided((1536,), (1,), device='cuda:0', dtype=torch.float32)
arg_4 = rand_strided((1536,), (1,), device='cuda:0', dtype=torch.float32)
arg_5 = rand_strided((128, 1536, 7, 7), (75264, 49, 7, 1), device='cuda:0', dtype=torch.bool)
arg_6 = rand_strided((128, 1536), (1536, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_per_fused__native_batch_norm_legit_functional_mean_relu_threshold_backward_view_230.run(*args, 196608, 49, grid=grid(196608), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_per_fused__native_batch_norm_legit_functional_mean_relu_threshold_backward_view_230.benchmark_all_configs(*args, 196608, 49, grid=grid(196608))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
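# Reference sketch (not generated output): an eager-mode equivalent of the
# fused kernel above, assuming x is the (128, 1536, 7, 7) fp16 activation and
# mean/var_sum/weight/bias match the pointers loaded in the kernel:
def _bn_relu_mean_sketch(x, mean, var_sum, weight, bias):
    # mean, var_sum: (1, 1536, 1, 1) fp32; weight, bias: (1536,) fp32
    inv_std = (var_sum / 6272.0 + 1e-05).rsqrt()
    y = (x.float() - mean) * inv_std * weight.view(1, -1, 1, 1) + bias.view(1, -1, 1, 1)
    y = torch.relu(y.half())
    le_mask = y <= 0                      # out_ptr1: mask saved for threshold_backward
    pooled = y.float().mean(dim=(2, 3))   # out_ptr3: 7x7 global average pool -> (128, 1536)
    return le_mask, pooled.half()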
# kernel path: /tmp/torchinductor_shunting/hq/chqge2y77s52vahqshrpnxrmtqywhsxmxfiwmel7jsbqfdldligh.py
# Original ATen: aten._to_copy, aten.t
# aten._to_copy => convert_element_type_401
# aten.t => permute_1
triton_poi_fused__to_copy_t_231 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[2097152], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: '*fp16', 3: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_t_231(in_ptr0, out_ptr0, out_ptr1, xnumel, XBLOCK : tl.constexpr):
xnumel = 1536000
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), None)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, None)
tl.store(out_ptr1 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, None)
def get_args():
arg_0 = rand_strided((1000, 1536), (1536, 1), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((1000, 1536), (1536, 1), device='cuda:0', dtype=torch.float16)
arg_2 = rand_strided((1000, 1536), (1536, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1, arg_2,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_t_231.run(*args, 1536000, grid=grid(1536000), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_t_231.benchmark_all_configs(*args, 1536000, grid=grid(1536000))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
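# Reference note (not generated output): the kernel above only casts the fp32
# classifier weight (1000, 1536) to fp16, writing the same values into two
# buffers; the "t" in the name reflects that one copy is consumed transposed
# (via strides) by the downstream matmul. Roughly: w16 = w.to(torch.float16).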
# kernel path: /tmp/torchinductor_shunting/kk/ckkhmi7qwvpeybxqkogp5ah6kylzhjoxai5hqgfzedjta2pj3hyg.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_400
triton_poi_fused__to_copy_232 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[1024], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_232(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 1000
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((1000,), (1,), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((1000,), (1,), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_232.run(*args, 1000, grid=grid(1000), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_232.benchmark_all_configs(*args, 1000, grid=grid(1000))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
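# Reference note (not generated output): same cast pattern as above, applied
# to the 1000-element classifier bias: b16 = b.to(torch.float16).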
# kernel path: /tmp/torchinductor_shunting/gb/cgbszhl6qhqzdxycxrv6h7svhilojvsjjhnimvf6vwfj77spvald.py
# Original ATen: aten.add
# aten.add => add
triton_poi_fused_add_233 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[1], filename=__file__, meta={'signature': {0: '*i64', 1: '*i64', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1), equal_to_1=())]})
@triton.jit
def triton_poi_fused_add_233(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 1
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
tmp0 = tl.load(in_ptr0 + (0))
tmp1 = tl.broadcast_to(tmp0, [XBLOCK])
tmp2 = 1
tmp3 = tmp1 + tmp2
tl.store(out_ptr0 + (0 + tl.zeros([XBLOCK], tl.int32)), tmp3, None)
def get_args():
arg_0 = rand_strided((), (), device='cuda:0', dtype=torch.int64)
arg_1 = rand_strided((), (), device='cuda:0', dtype=torch.int64)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_add_233.run(*args, 1, grid=grid(1), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_add_233.benchmark_all_configs(*args, 1, grid=grid(1))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
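# Reference note (not generated output): this scalar kernel adds 1 to a 0-d
# int64 tensor. Given the surrounding batch-norm kernels, it presumably
# implements the num_batches_tracked += 1 update performed once per training
# step.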
async_compile.wait(globals())
del async_compile
def call(args):
    primals_1, primals_2, primals_3, primals_4, primals_5, primals_6, primals_7, primals_8, primals_9, primals_10, primals_11, primals_12, primals_13, primals_14, primals_15, primals_16, primals_17, primals_18, primals_19, primals_20, primals_21, primals_22, primals_23, primals_24, primals_25, primals_26, primals_27, primals_28, primals_29, primals_30, primals_31, primals_32, primals_33, primals_34, primals_35, primals_36, primals_37, primals_38, primals_39, primals_40, primals_41, primals_42, primals_43, primals_44, primals_45, primals_46, primals_47, primals_48, primals_49, primals_50, primals_51, primals_52, primals_53, primals_54, primals_55, primals_56, primals_57, primals_58, primals_59, primals_60, primals_61, primals_62, primals_63, primals_64, primals_65, primals_66, primals_67, primals_68, primals_69, primals_70, primals_71, primals_72, primals_73, primals_74, primals_75, primals_76, primals_77, primals_78, primals_79, primals_80, primals_81, primals_82, primals_83, primals_84, primals_85, primals_86, primals_87, primals_88, primals_89, primals_90, primals_91, primals_92, primals_93, primals_94, primals_95, primals_96, primals_97, primals_98, primals_99, primals_100, primals_101, primals_102, primals_103, primals_104, primals_105, primals_106, primals_107, primals_108, primals_109, primals_110, primals_111, primals_112, primals_113, primals_114, primals_115, primals_116, primals_117, primals_118, primals_119, primals_120, primals_121, primals_122, primals_123, primals_124, primals_125, primals_126, primals_127, primals_128, primals_129, primals_130, primals_131, primals_132, primals_133, primals_134, primals_135, primals_136, primals_137, primals_138, primals_139, primals_140, primals_141, primals_142, primals_143, primals_144, primals_145, primals_146, primals_147, primals_148, primals_149, primals_150, primals_151, primals_152, primals_153, primals_154, primals_155, primals_156, primals_157, primals_158, primals_159, primals_160, primals_161, primals_162, primals_163, primals_164, primals_165, primals_166, primals_167, primals_168, primals_169, primals_170, primals_171, primals_172, primals_173, primals_174, primals_175, primals_176, primals_177, primals_178, primals_179, primals_180, primals_181, primals_182, primals_183, primals_184, primals_185, primals_186, primals_187, primals_188, primals_189, primals_190, primals_191, primals_192, primals_193, primals_194, primals_195, primals_196, primals_197, primals_198, primals_199, primals_200, primals_201, primals_202, primals_203, primals_204, primals_205, primals_206, primals_207, primals_208, primals_209, primals_210, primals_211, primals_212, primals_213, primals_214, primals_215, primals_216, primals_217, primals_218, primals_219, primals_220, primals_221, primals_222, primals_223, primals_224, primals_225, primals_226, primals_227, primals_228, primals_229, primals_230, primals_231, primals_232, primals_233, primals_234, primals_235, primals_236, primals_237, primals_238, primals_239, primals_240, primals_241, primals_242, primals_243, primals_244, primals_245, primals_246, primals_247, primals_248, primals_249, primals_250, primals_251, primals_252, primals_253, primals_254, primals_255, primals_256, primals_257, primals_258, primals_259, primals_260, primals_261, primals_262, primals_263, primals_264, primals_265, primals_266, primals_267, primals_268, primals_269, primals_270, primals_271, primals_272, primals_273, primals_274, primals_275, primals_276, primals_277, primals_278, primals_279, primals_280, primals_281, \
primals_282, primals_283, primals_284, primals_285, primals_286, primals_287, primals_288, primals_289, primals_290, primals_291, primals_292, primals_293, primals_294, primals_295, primals_296, primals_297, primals_298, primals_299, primals_300, primals_301, primals_302, primals_303, primals_304, primals_305, primals_306, primals_307, primals_308, primals_309, primals_310, primals_311, primals_312, primals_313, primals_314, primals_315, primals_316, primals_317, primals_318, primals_319, primals_320, primals_321, primals_322, primals_323, primals_324, primals_325, primals_326, primals_327, primals_328, primals_329, primals_330, primals_331, primals_332, primals_333, primals_334, primals_335, primals_336, primals_337, primals_338, primals_339, primals_340, primals_341, primals_342, primals_343, primals_344, primals_345, primals_346, primals_347, primals_348, primals_349, primals_350, primals_351, primals_352, primals_353, primals_354, primals_355, primals_356, primals_357, primals_358, primals_359, primals_360, primals_361, primals_362, primals_363, primals_364, primals_365, primals_366, primals_367, primals_368, primals_369, primals_370, primals_371, primals_372, primals_373, primals_374, primals_375, primals_376, primals_377, primals_378, primals_379, primals_380, primals_381, primals_382, primals_383, primals_384, primals_385, primals_386, primals_387, primals_388, primals_389, primals_390, primals_391, primals_392, primals_393, primals_394, primals_395, primals_396, primals_397, primals_398, primals_399, primals_400, primals_401, primals_402, primals_403, primals_404, primals_405, primals_406, primals_407, primals_408, primals_409, primals_410, primals_411, primals_412, primals_413, primals_414, primals_415, primals_416, primals_417, primals_418, primals_419, primals_420, primals_421, primals_422, primals_423, primals_424, primals_425, primals_426, primals_427, primals_428, primals_429, primals_430, primals_431, primals_432, primals_433, primals_434, primals_435, primals_436, primals_437, primals_438, primals_439, primals_440, primals_441, primals_442, primals_443, primals_444, primals_445, primals_446, primals_447, primals_448, primals_449, primals_450, primals_451, primals_452, primals_453, primals_454, primals_455, primals_456, primals_457, primals_458, primals_459, primals_460, primals_461, primals_462, primals_463, primals_464, primals_465, primals_466, primals_467, primals_468, primals_469, primals_470, primals_471, primals_472, primals_473, primals_474, primals_475, primals_476, primals_477, primals_478, primals_479, primals_480 = args
args.clear()
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0) # no-op to ensure context
buf0 = empty_strided((32, 3, 3, 3), (27, 9, 3, 1), device='cuda', dtype=torch.float16)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_0.run(primals_117, buf0, 864, grid=grid(864), stream=stream0)
del primals_117
buf1 = empty_strided((128, 3, 224, 224), (150528, 50176, 224, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_1.run(primals_480, buf1, 19267584, grid=grid(19267584), stream=stream0)
del primals_480
buf2 = extern_kernels.convolution(buf1, buf0, stride=(2, 2), padding=(1, 1), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None)
assert_size_stride(buf2, (128, 32, 112, 112), (401408, 12544, 112, 1))
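        # Stem convolution: 3 -> 32 channels at stride 2, taking the 224x224
        # input down to 112x112 (shapes per the assert above).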
buf3 = empty_strided((1, 32, 1, 1, 14), (448, 1, 448, 448, 32), device='cuda', dtype=torch.float32)
triton_red_fused__native_batch_norm_legit_functional_2.run(buf2, buf3, 448, 114688, grid=grid(448), stream=stream0)
buf4 = empty_strided((1, 32, 1, 1), (32, 1, 32, 32), device='cuda', dtype=torch.float32)
buf5 = buf4; del buf4 # reuse
buf9 = empty_strided((32, ), (1, ), device='cuda', dtype=torch.float32)
triton_per_fused__native_batch_norm_legit_functional_3.run(buf5, buf3, primals_307, buf9, 32, 14, grid=grid(32), stream=stream0)
del primals_307
buf6 = buf3; del buf3 # reuse
triton_red_fused__native_batch_norm_legit_functional_4.run(buf2, buf5, buf6, 448, 114688, grid=grid(448), stream=stream0)
buf7 = empty_strided((1, 32, 1, 1), (32, 1, 32, 32), device='cuda', dtype=torch.float32)
buf8 = empty_strided((32, ), (1, ), device='cuda', dtype=torch.float32)
buf10 = empty_strided((32, ), (1, ), device='cuda', dtype=torch.float32)
triton_per_fused__native_batch_norm_legit_functional_5.run(buf6, primals_308, buf7, buf8, buf10, 32, 14, grid=grid(32), stream=stream0)
del primals_308
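        # buf3..buf10 appear to compute buf2's batch-norm statistics in two
        # reduction passes (per-chunk sums -> mean in buf5, then squared
        # deviations -> variance in buf7), updating the running stats
        # (buf9/buf10) alongside.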
buf11 = empty_strided((128, 32, 112, 112), (401408, 12544, 112, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__native_batch_norm_legit_functional_relu_6.run(buf2, buf5, buf7, primals_1, primals_2, buf11, 51380224, grid=grid(51380224), stream=stream0)
del primals_2
buf12 = empty_strided((32, 1, 3, 3), (9, 9, 3, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_7.run(primals_118, buf12, 288, grid=grid(288), stream=stream0)
del primals_118
buf13 = extern_kernels.convolution(buf11, buf12, stride=(1, 1), padding=(1, 1), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=32, bias=None)
assert_size_stride(buf13, (128, 32, 112, 112), (401408, 12544, 112, 1))
buf14 = buf6; del buf6 # reuse
triton_red_fused__native_batch_norm_legit_functional_2.run(buf13, buf14, 448, 114688, grid=grid(448), stream=stream0)
buf15 = buf7; del buf7 # reuse
buf16 = buf15; del buf15 # reuse
buf20 = empty_strided((32, ), (1, ), device='cuda', dtype=torch.float32)
triton_per_fused__native_batch_norm_legit_functional_3.run(buf16, buf14, primals_310, buf20, 32, 14, grid=grid(32), stream=stream0)
del primals_310
buf17 = buf14; del buf14 # reuse
triton_red_fused__native_batch_norm_legit_functional_4.run(buf13, buf16, buf17, 448, 114688, grid=grid(448), stream=stream0)
buf18 = empty_strided((1, 32, 1, 1), (32, 1, 32, 32), device='cuda', dtype=torch.float32)
buf19 = empty_strided((32, ), (1, ), device='cuda', dtype=torch.float32)
buf21 = empty_strided((32, ), (1, ), device='cuda', dtype=torch.float32)
triton_per_fused__native_batch_norm_legit_functional_5.run(buf17, primals_311, buf18, buf19, buf21, 32, 14, grid=grid(32), stream=stream0)
del primals_311
buf22 = empty_strided((128, 32, 112, 112), (401408, 12544, 112, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__native_batch_norm_legit_functional_relu_6.run(buf13, buf16, buf18, primals_3, primals_4, buf22, 51380224, grid=grid(51380224), stream=stream0)
del primals_4
buf23 = empty_strided((32, 32, 1, 1), (32, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_8.run(primals_119, buf23, 1024, grid=grid(1024), stream=stream0)
del primals_119
buf24 = extern_kernels.convolution(buf22, buf23, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None)
assert_size_stride(buf24, (128, 32, 112, 112), (401408, 12544, 112, 1))
buf25 = buf17; del buf17 # reuse
triton_red_fused__native_batch_norm_legit_functional_2.run(buf24, buf25, 448, 114688, grid=grid(448), stream=stream0)
buf26 = buf18; del buf18 # reuse
buf27 = buf26; del buf26 # reuse
buf31 = empty_strided((32, ), (1, ), device='cuda', dtype=torch.float32)
triton_per_fused__native_batch_norm_legit_functional_3.run(buf27, buf25, primals_313, buf31, 32, 14, grid=grid(32), stream=stream0)
del primals_313
buf28 = buf25; del buf25 # reuse
triton_red_fused__native_batch_norm_legit_functional_4.run(buf24, buf27, buf28, 448, 114688, grid=grid(448), stream=stream0)
buf29 = empty_strided((1, 32, 1, 1), (32, 1, 32, 32), device='cuda', dtype=torch.float32)
buf30 = empty_strided((32, ), (1, ), device='cuda', dtype=torch.float32)
buf32 = empty_strided((32, ), (1, ), device='cuda', dtype=torch.float32)
triton_per_fused__native_batch_norm_legit_functional_5.run(buf28, primals_314, buf29, buf30, buf32, 32, 14, grid=grid(32), stream=stream0)
del buf28
del primals_314
buf33 = empty_strided((128, 32, 112, 112), (401408, 12544, 112, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__native_batch_norm_legit_functional_add_9.run(buf24, buf27, buf29, primals_5, primals_6, buf11, buf33, 51380224, grid=grid(51380224), stream=stream0)
del buf29
del primals_6
buf34 = empty_strided((128, 16, 112, 112), (200704, 12544, 112, 1), device='cuda', dtype=torch.float16)
triton_poi_fused_split_with_sizes_10.run(buf33, buf34, 25690112, grid=grid(25690112), stream=stream0)
buf35 = empty_strided((128, 16, 112, 112), (200704, 12544, 112, 1), device='cuda', dtype=torch.float16)
triton_poi_fused_split_with_sizes_11.run(buf33, buf35, 25690112, grid=grid(25690112), stream=stream0)
del buf33
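        # buf34/buf35 split buf33's 32 channels into two 16-channel halves,
        # each expanded by its own 1x1 convolution below (a grouped pointwise
        # expansion, MixNet-style).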
buf36 = empty_strided((96, 16, 1, 1), (16, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_12.run(primals_120, buf36, 1536, grid=grid(1536), stream=stream0)
del primals_120
buf37 = extern_kernels.convolution(buf34, buf36, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None)
assert_size_stride(buf37, (128, 96, 112, 112), (1204224, 12544, 112, 1))
buf38 = empty_strided((96, 16, 1, 1), (16, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_12.run(primals_121, buf38, 1536, grid=grid(1536), stream=stream0)
del primals_121
buf39 = extern_kernels.convolution(buf35, buf38, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None)
assert_size_stride(buf39, (128, 96, 112, 112), (1204224, 12544, 112, 1))
buf42 = empty_strided((128, 192, 112, 112), (2408448, 12544, 112, 1), device='cuda', dtype=torch.float16)
buf40 = as_strided(buf42, (128, 96, 112, 112), (2408448, 12544, 112, 1)) # alias
triton_poi_fused_cat_13.run(buf37, buf40, 154140672, grid=grid(154140672), stream=stream0)
del buf37
buf41 = as_strided(buf42, (128, 96, 112, 112), (2408448, 12544, 112, 1), 1204224) # alias
triton_poi_fused_cat_13.run(buf39, buf41, 154140672, grid=grid(154140672), stream=stream0)
del buf39
buf43 = empty_strided((1, 192, 1, 1, 13), (2496, 1, 2496, 2496, 192), device='cuda', dtype=torch.float32)
triton_red_fused__native_batch_norm_legit_functional_14.run(buf42, buf43, 2496, 123511, grid=grid(2496), stream=stream0)
buf44 = empty_strided((1, 192, 1, 1), (192, 1, 192, 192), device='cuda', dtype=torch.float32)
buf45 = buf44; del buf44 # reuse
buf49 = empty_strided((192, ), (1, ), device='cuda', dtype=torch.float32)
triton_per_fused__native_batch_norm_legit_functional_15.run(buf45, buf43, primals_316, buf49, 192, 13, grid=grid(192), stream=stream0)
del primals_316
buf46 = buf43; del buf43 # reuse
triton_red_fused__native_batch_norm_legit_functional_16.run(buf42, buf45, buf46, 2496, 123511, grid=grid(2496), stream=stream0)
buf47 = empty_strided((1, 192, 1, 1), (192, 1, 192, 192), device='cuda', dtype=torch.float32)
buf48 = empty_strided((192, ), (1, ), device='cuda', dtype=torch.float32)
buf50 = empty_strided((192, ), (1, ), device='cuda', dtype=torch.float32)
triton_per_fused__native_batch_norm_legit_functional_17.run(buf46, primals_317, buf47, buf48, buf50, 192, 13, grid=grid(192), stream=stream0)
del buf46
del primals_317
buf51 = empty_strided((128, 192, 112, 112), (2408448, 12544, 112, 1), device='cuda', dtype=torch.float16)
buf1155 = empty_strided((128, 192, 112, 112), (2408448, 12544, 112, 1), device='cuda', dtype=torch.bool)
triton_poi_fused__native_batch_norm_legit_functional_relu_threshold_backward_18.run(buf42, buf45, buf47, primals_7, primals_8, buf51, buf1155, 308281344, grid=grid(308281344), stream=stream0)
del primals_8
buf52 = empty_strided((64, 1, 3, 3), (9, 9, 3, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_19.run(primals_122, buf52, 576, grid=grid(576), stream=stream0)
del primals_122
buf53 = extern_kernels.convolution(as_strided(buf51, (128, 64, 112, 112), (2408448, 12544, 112, 1)), buf52, stride=(2, 2), padding=(1, 1), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=64, bias=None)
assert_size_stride(buf53, (128, 64, 56, 56), (200704, 3136, 56, 1))
buf54 = empty_strided((64, 1, 5, 5), (25, 25, 5, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_20.run(primals_123, buf54, 1600, grid=grid(1600), stream=stream0)
del primals_123
buf55 = extern_kernels.convolution(as_strided(buf51, (128, 64, 112, 112), (2408448, 12544, 112, 1), 802816), buf54, stride=(2, 2), padding=(2, 2), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=64, bias=None)
assert_size_stride(buf55, (128, 64, 56, 56), (200704, 3136, 56, 1))
buf56 = empty_strided((64, 1, 7, 7), (49, 49, 7, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_21.run(primals_124, buf56, 3136, grid=grid(3136), stream=stream0)
del primals_124
buf57 = extern_kernels.convolution(as_strided(buf51, (128, 64, 112, 112), (2408448, 12544, 112, 1), 1605632), buf56, stride=(2, 2), padding=(3, 3), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=64, bias=None)
assert_size_stride(buf57, (128, 64, 56, 56), (200704, 3136, 56, 1))
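        # buf53/buf55/buf57: depthwise convolutions over three 64-channel
        # slices of buf51 with kernel sizes 3, 5, and 7 (mixed depthwise
        # kernels), each downsampling 112x112 -> 56x56 at stride 2.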
buf61 = empty_strided((128, 192, 56, 56), (602112, 3136, 56, 1), device='cuda', dtype=torch.float16)
buf58 = as_strided(buf61, (128, 64, 56, 56), (602112, 3136, 56, 1)) # alias
triton_poi_fused_cat_22.run(buf53, buf58, 25690112, grid=grid(25690112), stream=stream0)
del buf53
buf59 = as_strided(buf61, (128, 64, 56, 56), (602112, 3136, 56, 1), 200704) # alias
triton_poi_fused_cat_22.run(buf55, buf59, 25690112, grid=grid(25690112), stream=stream0)
del buf55
buf60 = as_strided(buf61, (128, 64, 56, 56), (602112, 3136, 56, 1), 401408) # alias
triton_poi_fused_cat_22.run(buf57, buf60, 25690112, grid=grid(25690112), stream=stream0)
del buf57
buf62 = empty_strided((1, 192, 1, 1, 4), (768, 1, 768, 768, 192), device='cuda', dtype=torch.float32)
triton_red_fused__native_batch_norm_legit_functional_23.run(buf61, buf62, 768, 100352, grid=grid(768), stream=stream0)
buf63 = buf47; del buf47 # reuse
buf64 = buf63; del buf63 # reuse
buf68 = empty_strided((192, ), (1, ), device='cuda', dtype=torch.float32)
triton_per_fused__native_batch_norm_legit_functional_24.run(buf64, buf62, primals_319, buf68, 192, 4, grid=grid(192), stream=stream0)
del primals_319
buf65 = buf62; del buf62 # reuse
triton_red_fused__native_batch_norm_legit_functional_25.run(buf61, buf64, buf65, 768, 100352, grid=grid(768), stream=stream0)
buf66 = empty_strided((1, 192, 1, 1), (192, 1, 192, 192), device='cuda', dtype=torch.float32)
buf67 = empty_strided((192, ), (1, ), device='cuda', dtype=torch.float32)
buf69 = empty_strided((192, ), (1, ), device='cuda', dtype=torch.float32)
triton_per_fused__native_batch_norm_legit_functional_26.run(buf65, primals_320, buf66, buf67, buf69, 192, 4, grid=grid(192), stream=stream0)
del buf65
del primals_320
buf70 = empty_strided((128, 192, 56, 56), (602112, 3136, 56, 1), device='cuda', dtype=torch.float16)
buf1154 = empty_strided((128, 192, 56, 56), (602112, 3136, 56, 1), device='cuda', dtype=torch.bool)
triton_poi_fused__native_batch_norm_legit_functional_relu_threshold_backward_27.run(buf61, buf64, buf66, primals_9, primals_10, buf70, buf1154, 77070336, grid=grid(77070336), stream=stream0)
del buf66
del primals_10
buf71 = empty_strided((20, 96, 1, 1), (96, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_28.run(primals_125, buf71, 1920, grid=grid(1920), stream=stream0)
del primals_125
buf72 = extern_kernels.convolution(as_strided(buf70, (128, 96, 56, 56), (602112, 3136, 56, 1)), buf71, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None)
assert_size_stride(buf72, (128, 20, 56, 56), (62720, 3136, 56, 1))
buf73 = empty_strided((20, 96, 1, 1), (96, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_28.run(primals_126, buf73, 1920, grid=grid(1920), stream=stream0)
del primals_126
buf74 = extern_kernels.convolution(as_strided(buf70, (128, 96, 56, 56), (602112, 3136, 56, 1), 301056), buf73, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None)
assert_size_stride(buf74, (128, 20, 56, 56), (62720, 3136, 56, 1))
buf77 = empty_strided((128, 40, 56, 56), (125440, 3136, 56, 1), device='cuda', dtype=torch.float16)
buf75 = as_strided(buf77, (128, 20, 56, 56), (125440, 3136, 56, 1)) # alias
triton_poi_fused_cat_29.run(buf72, buf75, 8028160, grid=grid(8028160), stream=stream0)
del buf72
buf76 = as_strided(buf77, (128, 20, 56, 56), (125440, 3136, 56, 1), 62720) # alias
triton_poi_fused_cat_29.run(buf74, buf76, 8028160, grid=grid(8028160), stream=stream0)
del buf74
buf78 = empty_strided((1, 40, 1, 1, 13), (520, 1, 520, 520, 40), device='cuda', dtype=torch.float32)
triton_red_fused__native_batch_norm_legit_functional_30.run(buf77, buf78, 520, 30878, grid=grid(520), stream=stream0)
buf79 = empty_strided((1, 40, 1, 1), (40, 1, 40, 40), device='cuda', dtype=torch.float32)
buf80 = buf79; del buf79 # reuse
buf84 = empty_strided((40, ), (1, ), device='cuda', dtype=torch.float32)
triton_per_fused__native_batch_norm_legit_functional_31.run(buf80, buf78, primals_322, buf84, 40, 13, grid=grid(40), stream=stream0)
del primals_322
buf81 = buf78; del buf78 # reuse
triton_red_fused__native_batch_norm_legit_functional_32.run(buf77, buf80, buf81, 520, 30878, grid=grid(520), stream=stream0)
buf82 = empty_strided((1, 40, 1, 1), (40, 1, 40, 40), device='cuda', dtype=torch.float32)
buf83 = empty_strided((40, ), (1, ), device='cuda', dtype=torch.float32)
buf85 = empty_strided((40, ), (1, ), device='cuda', dtype=torch.float32)
triton_per_fused__native_batch_norm_legit_functional_33.run(buf81, primals_323, buf82, buf83, buf85, 40, 13, grid=grid(40), stream=stream0)
del primals_323
buf86 = empty_strided((128, 40, 56, 56), (125440, 3136, 56, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__native_batch_norm_legit_functional_34.run(buf77, buf80, buf82, primals_11, primals_12, buf86, 16056320, grid=grid(16056320), stream=stream0)
del primals_12
buf87 = empty_strided((60, 20, 1, 1), (20, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_35.run(primals_127, buf87, 1200, grid=grid(1200), stream=stream0)
del primals_127
buf88 = extern_kernels.convolution(as_strided(buf86, (128, 20, 56, 56), (125440, 3136, 56, 1)), buf87, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None)
assert_size_stride(buf88, (128, 60, 56, 56), (188160, 3136, 56, 1))
buf89 = empty_strided((60, 20, 1, 1), (20, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_35.run(primals_128, buf89, 1200, grid=grid(1200), stream=stream0)
del primals_128
buf90 = extern_kernels.convolution(as_strided(buf86, (128, 20, 56, 56), (125440, 3136, 56, 1), 62720), buf89, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None)
assert_size_stride(buf90, (128, 60, 56, 56), (188160, 3136, 56, 1))
buf93 = empty_strided((128, 120, 56, 56), (376320, 3136, 56, 1), device='cuda', dtype=torch.float16)
buf91 = as_strided(buf93, (128, 60, 56, 56), (376320, 3136, 56, 1)) # alias
triton_poi_fused_cat_36.run(buf88, buf91, 24084480, grid=grid(24084480), stream=stream0)
buf92 = as_strided(buf93, (128, 60, 56, 56), (376320, 3136, 56, 1), 188160) # alias
triton_poi_fused_cat_36.run(buf90, buf92, 24084480, grid=grid(24084480), stream=stream0)
buf94 = empty_strided((1, 120, 1, 1, 4), (480, 1, 480, 480, 120), device='cuda', dtype=torch.float32)
triton_red_fused__native_batch_norm_legit_functional_37.run(buf93, buf94, 480, 100352, grid=grid(480), stream=stream0)
buf95 = empty_strided((1, 120, 1, 1), (120, 1, 120, 120), device='cuda', dtype=torch.float32)
buf96 = buf95; del buf95 # reuse
buf100 = empty_strided((120, ), (1, ), device='cuda', dtype=torch.float32)
triton_per_fused__native_batch_norm_legit_functional_38.run(buf96, buf94, primals_325, buf100, 120, 4, grid=grid(120), stream=stream0)
del primals_325
buf97 = buf94; del buf94 # reuse
triton_red_fused__native_batch_norm_legit_functional_39.run(buf93, buf96, buf97, 480, 100352, grid=grid(480), stream=stream0)
buf98 = empty_strided((1, 120, 1, 1), (120, 1, 120, 120), device='cuda', dtype=torch.float32)
buf99 = empty_strided((120, ), (1, ), device='cuda', dtype=torch.float32)
buf101 = empty_strided((120, ), (1, ), device='cuda', dtype=torch.float32)
triton_per_fused__native_batch_norm_legit_functional_40.run(buf97, primals_326, buf98, buf99, buf101, 120, 4, grid=grid(120), stream=stream0)
del primals_326
buf102 = empty_strided((128, 120, 56, 56), (376320, 3136, 56, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__native_batch_norm_legit_functional_relu_41.run(buf93, buf96, buf98, primals_13, primals_14, buf102, 48168960, grid=grid(48168960), stream=stream0)
del primals_14
buf103 = empty_strided((120, 1, 3, 3), (9, 9, 3, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_42.run(primals_129, buf103, 1080, grid=grid(1080), stream=stream0)
del primals_129
buf104 = extern_kernels.convolution(buf102, buf103, stride=(1, 1), padding=(1, 1), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=120, bias=None)
assert_size_stride(buf104, (128, 120, 56, 56), (376320, 3136, 56, 1))
buf105 = buf97; del buf97 # reuse
triton_red_fused__native_batch_norm_legit_functional_37.run(buf104, buf105, 480, 100352, grid=grid(480), stream=stream0)
buf106 = buf98; del buf98 # reuse
buf107 = buf106; del buf106 # reuse
buf111 = empty_strided((120, ), (1, ), device='cuda', dtype=torch.float32)
triton_per_fused__native_batch_norm_legit_functional_38.run(buf107, buf105, primals_328, buf111, 120, 4, grid=grid(120), stream=stream0)
del primals_328
buf108 = buf105; del buf105 # reuse
triton_red_fused__native_batch_norm_legit_functional_39.run(buf104, buf107, buf108, 480, 100352, grid=grid(480), stream=stream0)
buf109 = empty_strided((1, 120, 1, 1), (120, 1, 120, 120), device='cuda', dtype=torch.float32)
buf110 = empty_strided((120, ), (1, ), device='cuda', dtype=torch.float32)
buf112 = empty_strided((120, ), (1, ), device='cuda', dtype=torch.float32)
triton_per_fused__native_batch_norm_legit_functional_40.run(buf108, primals_329, buf109, buf110, buf112, 120, 4, grid=grid(120), stream=stream0)
del primals_329
buf113 = empty_strided((128, 120, 56, 56), (376320, 3136, 56, 1), device='cuda', dtype=torch.float16)
buf1153 = empty_strided((128, 120, 56, 56), (376320, 3136, 56, 1), device='cuda', dtype=torch.bool)
triton_poi_fused__native_batch_norm_legit_functional_relu_threshold_backward_43.run(buf104, buf107, buf109, primals_15, primals_16, buf113, buf1153, 48168960, grid=grid(48168960), stream=stream0)
del buf109
del primals_16
buf114 = empty_strided((20, 60, 1, 1), (60, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_44.run(primals_130, buf114, 1200, grid=grid(1200), stream=stream0)
del primals_130
buf115 = extern_kernels.convolution(as_strided(buf113, (128, 60, 56, 56), (376320, 3136, 56, 1)), buf114, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None)
assert_size_stride(buf115, (128, 20, 56, 56), (62720, 3136, 56, 1))
buf116 = empty_strided((20, 60, 1, 1), (60, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_44.run(primals_131, buf116, 1200, grid=grid(1200), stream=stream0)
del primals_131
buf117 = extern_kernels.convolution(as_strided(buf113, (128, 60, 56, 56), (376320, 3136, 56, 1), 188160), buf116, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None)
assert_size_stride(buf117, (128, 20, 56, 56), (62720, 3136, 56, 1))
buf120 = empty_strided((128, 40, 56, 56), (125440, 3136, 56, 1), device='cuda', dtype=torch.float16)
buf118 = as_strided(buf120, (128, 20, 56, 56), (125440, 3136, 56, 1)) # alias
triton_poi_fused_cat_29.run(buf115, buf118, 8028160, grid=grid(8028160), stream=stream0)
del buf115
buf119 = as_strided(buf120, (128, 20, 56, 56), (125440, 3136, 56, 1), 62720) # alias
triton_poi_fused_cat_29.run(buf117, buf119, 8028160, grid=grid(8028160), stream=stream0)
del buf117
buf121 = buf81; del buf81 # reuse
triton_red_fused__native_batch_norm_legit_functional_30.run(buf120, buf121, 520, 30878, grid=grid(520), stream=stream0)
buf122 = buf82; del buf82 # reuse
buf123 = buf122; del buf122 # reuse
buf127 = empty_strided((40, ), (1, ), device='cuda', dtype=torch.float32)
triton_per_fused__native_batch_norm_legit_functional_31.run(buf123, buf121, primals_331, buf127, 40, 13, grid=grid(40), stream=stream0)
del primals_331
buf124 = buf121; del buf121 # reuse
triton_red_fused__native_batch_norm_legit_functional_32.run(buf120, buf123, buf124, 520, 30878, grid=grid(520), stream=stream0)
buf125 = empty_strided((1, 40, 1, 1), (40, 1, 40, 40), device='cuda', dtype=torch.float32)
buf126 = empty_strided((40, ), (1, ), device='cuda', dtype=torch.float32)
buf128 = empty_strided((40, ), (1, ), device='cuda', dtype=torch.float32)
triton_per_fused__native_batch_norm_legit_functional_33.run(buf124, primals_332, buf125, buf126, buf128, 40, 13, grid=grid(40), stream=stream0)
del buf124
del primals_332
buf129 = empty_strided((128, 40, 56, 56), (125440, 3136, 56, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__native_batch_norm_legit_functional_add_45.run(buf120, buf123, buf125, primals_17, primals_18, buf86, buf129, 16056320, grid=grid(16056320), stream=stream0)
del buf125
del primals_18
buf130 = empty_strided((240, 40, 1, 1), (40, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_46.run(primals_132, buf130, 9600, grid=grid(9600), stream=stream0)
del primals_132
buf131 = extern_kernels.convolution(buf129, buf130, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None)
assert_size_stride(buf131, (128, 240, 56, 56), (752640, 3136, 56, 1))
buf132 = empty_strided((1, 240, 1, 1), (240, 1, 240, 240), device='cuda', dtype=torch.float32)
buf133 = buf132; del buf132 # reuse
buf136 = empty_strided((240, ), (1, ), device='cuda', dtype=torch.float32)
buf134 = empty_strided((1, 240, 1, 1), (240, 1, 240, 240), device='cuda', dtype=torch.float32)
buf135 = empty_strided((240, ), (1, ), device='cuda', dtype=torch.float32)
buf137 = empty_strided((240, ), (1, ), device='cuda', dtype=torch.float32)
triton_red_fused__native_batch_norm_legit_functional_47.run(buf133, buf131, primals_334, primals_335, buf136, buf134, buf135, buf137, 240, 401408, grid=grid(240), stream=stream0)
del primals_334
del primals_335
buf138 = empty_strided((128, 240, 56, 56), (752640, 3136, 56, 1), device='cuda', dtype=torch.float16)
buf1152 = empty_strided((128, 240, 56, 56), (752640, 3136, 56, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__native_batch_norm_legit_functional_add_clone_fill_mul_sigmoid_sub_48.run(buf131, buf133, buf134, primals_19, primals_20, buf138, buf1152, 96337920, grid=grid(96337920), stream=stream0)
del primals_20
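        # buf138 appears to be the batch-normalized activation passed through
        # SiLU (the mul_sigmoid in the kernel name), with buf1152 materialized
        # for reuse in the backward pass.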
buf139 = empty_strided((60, 1, 3, 3), (9, 9, 3, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_49.run(primals_133, buf139, 540, grid=grid(540), stream=stream0)
del primals_133
buf140 = buf90; del buf90 # reuse
triton_poi_fused_split_with_sizes_50.run(buf138, buf140, 24084480, grid=grid(24084480), stream=stream0)
buf141 = extern_kernels.convolution(buf140, buf139, stride=(2, 2), padding=(1, 1), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=60, bias=None)
assert_size_stride(buf141, (128, 60, 28, 28), (47040, 784, 28, 1))
buf142 = empty_strided((60, 1, 5, 5), (25, 25, 5, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_51.run(primals_134, buf142, 1500, grid=grid(1500), stream=stream0)
del primals_134
buf143 = buf88; del buf88 # reuse
triton_poi_fused_split_with_sizes_52.run(buf138, buf143, 24084480, grid=grid(24084480), stream=stream0)
buf144 = extern_kernels.convolution(buf143, buf142, stride=(2, 2), padding=(2, 2), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=60, bias=None)
assert_size_stride(buf144, (128, 60, 28, 28), (47040, 784, 28, 1))
buf145 = empty_strided((60, 1, 7, 7), (49, 49, 7, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_53.run(primals_135, buf145, 2940, grid=grid(2940), stream=stream0)
del primals_135
buf146 = empty_strided((128, 60, 56, 56), (188160, 3136, 56, 1), device='cuda', dtype=torch.float16)
triton_poi_fused_split_with_sizes_54.run(buf138, buf146, 24084480, grid=grid(24084480), stream=stream0)
buf147 = extern_kernels.convolution(buf146, buf145, stride=(2, 2), padding=(3, 3), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=60, bias=None)
assert_size_stride(buf147, (128, 60, 28, 28), (47040, 784, 28, 1))
buf148 = empty_strided((60, 1, 9, 9), (81, 81, 9, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_55.run(primals_136, buf148, 4860, grid=grid(4860), stream=stream0)
del primals_136
buf149 = empty_strided((128, 60, 56, 56), (188160, 3136, 56, 1), device='cuda', dtype=torch.float16)
triton_poi_fused_split_with_sizes_56.run(buf138, buf149, 24084480, grid=grid(24084480), stream=stream0)
del buf138
buf150 = extern_kernels.convolution(buf149, buf148, stride=(2, 2), padding=(4, 4), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=60, bias=None)
assert_size_stride(buf150, (128, 60, 28, 28), (47040, 784, 28, 1))
buf155 = empty_strided((128, 240, 28, 28), (188160, 784, 28, 1), device='cuda', dtype=torch.float16)
buf151 = as_strided(buf155, (128, 60, 28, 28), (188160, 784, 28, 1)) # alias
triton_poi_fused_cat_57.run(buf141, buf151, 6021120, grid=grid(6021120), stream=stream0)
buf152 = as_strided(buf155, (128, 60, 28, 28), (188160, 784, 28, 1), 47040) # alias
triton_poi_fused_cat_57.run(buf144, buf152, 6021120, grid=grid(6021120), stream=stream0)
buf153 = as_strided(buf155, (128, 60, 28, 28), (188160, 784, 28, 1), 94080) # alias
triton_poi_fused_cat_57.run(buf147, buf153, 6021120, grid=grid(6021120), stream=stream0)
buf154 = as_strided(buf155, (128, 60, 28, 28), (188160, 784, 28, 1), 141120) # alias
triton_poi_fused_cat_57.run(buf150, buf154, 6021120, grid=grid(6021120), stream=stream0)
buf156 = buf134; del buf134 # reuse
buf157 = buf156; del buf156 # reuse
buf160 = empty_strided((240, ), (1, ), device='cuda', dtype=torch.float32)
buf158 = empty_strided((1, 240, 1, 1), (240, 1, 240, 240), device='cuda', dtype=torch.float32)
buf159 = empty_strided((240, ), (1, ), device='cuda', dtype=torch.float32)
buf161 = empty_strided((240, ), (1, ), device='cuda', dtype=torch.float32)
triton_red_fused__native_batch_norm_legit_functional_58.run(buf157, buf155, primals_337, primals_338, buf160, buf158, buf159, buf161, 240, 100352, grid=grid(240), stream=stream0)
del primals_337
del primals_338
buf162 = empty_strided((128, 240, 28, 28), (188160, 784, 28, 1), device='cuda', dtype=torch.float16)
buf164 = empty_strided((128, 240, 1, 1), (240, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_per_fused__native_batch_norm_legit_functional_mean_silu_59.run(buf155, buf157, buf158, primals_21, primals_22, buf162, buf164, 30720, 784, grid=grid(30720), stream=stream0)
del buf158
del primals_22
buf165 = empty_strided((20, 240, 1, 1), (240, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_60.run(primals_137, buf165, 4800, grid=grid(4800), stream=stream0)
del primals_137
buf166 = empty_strided((20, ), (1, ), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_convolution_61.run(primals_138, buf166, 20, grid=grid(20), stream=stream0)
del primals_138
buf167 = extern_kernels.convolution(buf164, buf165, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None)
assert_size_stride(buf167, (128, 20, 1, 1), (20, 1, 1, 1))
buf168 = buf167; del buf167 # reuse
buf169 = empty_strided((128, 20, 1, 1), (20, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_convolution_silu_62.run(buf168, buf166, buf169, 2560, grid=grid(2560), stream=stream0)
del buf166
buf170 = empty_strided((240, 20, 1, 1), (20, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_63.run(primals_139, buf170, 4800, grid=grid(4800), stream=stream0)
del primals_139
buf171 = empty_strided((240, ), (1, ), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_convolution_64.run(primals_140, buf171, 240, grid=grid(240), stream=stream0)
del primals_140
buf172 = extern_kernels.convolution(buf169, buf170, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None)
assert_size_stride(buf172, (128, 240, 1, 1), (240, 1, 1, 1))
buf173 = buf172; del buf172 # reuse
triton_poi_fused__to_copy_convolution_65.run(buf173, buf171, 30720, grid=grid(30720), stream=stream0)
del buf171
buf174 = empty_strided((128, 240, 28, 28), (188160, 784, 28, 1), device='cuda', dtype=torch.float16)
triton_poi_fused_mul_sigmoid_silu_66.run(buf162, buf173, buf174, 24084480, grid=grid(24084480), stream=stream0)
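        # buf164..buf174 form a squeeze-and-excitation block: global average
        # pool (buf164), a 240 -> 20 reduction conv plus SiLU (buf169), a
        # 20 -> 240 expansion conv (buf173), and a sigmoid gate multiplied
        # back onto the activation (buf174).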
buf175 = empty_strided((56, 240, 1, 1), (240, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_67.run(primals_141, buf175, 13440, grid=grid(13440), stream=stream0)
del primals_141
buf176 = extern_kernels.convolution(buf174, buf175, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None)
assert_size_stride(buf176, (128, 56, 28, 28), (43904, 784, 28, 1))
buf177 = empty_strided((1, 56, 1, 1, 13), (728, 1, 728, 728, 56), device='cuda', dtype=torch.float32)
triton_red_fused__native_batch_norm_legit_functional_68.run(buf176, buf177, 728, 7720, grid=grid(728), stream=stream0)
buf178 = empty_strided((1, 56, 1, 1), (56, 1, 56, 56), device='cuda', dtype=torch.float32)
buf179 = buf178; del buf178 # reuse
buf183 = empty_strided((56, ), (1, ), device='cuda', dtype=torch.float32)
triton_per_fused__native_batch_norm_legit_functional_69.run(buf179, buf177, primals_340, buf183, 56, 13, grid=grid(56), stream=stream0)
del primals_340
buf180 = buf177; del buf177 # reuse
triton_red_fused__native_batch_norm_legit_functional_70.run(buf176, buf179, buf180, 728, 7720, grid=grid(728), stream=stream0)
buf181 = empty_strided((1, 56, 1, 1), (56, 1, 56, 56), device='cuda', dtype=torch.float32)
buf182 = empty_strided((56, ), (1, ), device='cuda', dtype=torch.float32)
buf184 = empty_strided((56, ), (1, ), device='cuda', dtype=torch.float32)
triton_per_fused__native_batch_norm_legit_functional_71.run(buf180, primals_341, buf181, buf182, buf184, 56, 13, grid=grid(56), stream=stream0)
del primals_341
buf185 = empty_strided((128, 56, 28, 28), (43904, 784, 28, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__native_batch_norm_legit_functional_72.run(buf176, buf179, buf181, primals_23, primals_24, buf185, 5619712, grid=grid(5619712), stream=stream0)
del primals_24
buf186 = empty_strided((168, 28, 1, 1), (28, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_73.run(primals_142, buf186, 4704, grid=grid(4704), stream=stream0)
del primals_142
buf187 = extern_kernels.convolution(as_strided(buf185, (128, 28, 28, 28), (43904, 784, 28, 1)), buf186, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None)
assert_size_stride(buf187, (128, 168, 28, 28), (131712, 784, 28, 1))
buf188 = empty_strided((168, 28, 1, 1), (28, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_73.run(primals_143, buf188, 4704, grid=grid(4704), stream=stream0)
del primals_143
buf189 = extern_kernels.convolution(as_strided(buf185, (128, 28, 28, 28), (43904, 784, 28, 1), 21952), buf188, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None)
assert_size_stride(buf189, (128, 168, 28, 28), (131712, 784, 28, 1))
buf192 = empty_strided((128, 336, 28, 28), (263424, 784, 28, 1), device='cuda', dtype=torch.float16)
buf190 = as_strided(buf192, (128, 168, 28, 28), (263424, 784, 28, 1)) # alias
triton_poi_fused_cat_74.run(buf187, buf190, 16859136, grid=grid(16859136), stream=stream0)
buf191 = as_strided(buf192, (128, 168, 28, 28), (263424, 784, 28, 1), 131712) # alias
triton_poi_fused_cat_74.run(buf189, buf191, 16859136, grid=grid(16859136), stream=stream0)
buf193 = empty_strided((1, 336, 1, 1), (336, 1, 336, 336), device='cuda', dtype=torch.float32)
buf194 = buf193; del buf193 # reuse
buf197 = empty_strided((336, ), (1, ), device='cuda', dtype=torch.float32)
buf195 = empty_strided((1, 336, 1, 1), (336, 1, 336, 336), device='cuda', dtype=torch.float32)
buf196 = empty_strided((336, ), (1, ), device='cuda', dtype=torch.float32)
buf198 = empty_strided((336, ), (1, ), device='cuda', dtype=torch.float32)
triton_red_fused__native_batch_norm_legit_functional_75.run(buf194, buf192, primals_343, primals_344, buf197, buf195, buf196, buf198, 336, 100352, grid=grid(336), stream=stream0)
del primals_343
del primals_344
buf199 = empty_strided((128, 336, 28, 28), (263424, 784, 28, 1), device='cuda', dtype=torch.float16)
buf1150 = empty_strided((128, 336, 28, 28), (263424, 784, 28, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__native_batch_norm_legit_functional_add_clone_fill_mul_sigmoid_sub_76.run(buf192, buf194, buf195, primals_25, primals_26, buf199, buf1150, 33718272, grid=grid(33718272), stream=stream0)
del primals_26
buf200 = empty_strided((168, 1, 3, 3), (9, 9, 3, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_77.run(primals_144, buf200, 1512, grid=grid(1512), stream=stream0)
del primals_144
buf201 = buf189; del buf189 # reuse
triton_poi_fused_split_with_sizes_78.run(buf199, buf201, 16859136, grid=grid(16859136), stream=stream0)
buf202 = extern_kernels.convolution(buf201, buf200, stride=(1, 1), padding=(1, 1), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=168, bias=None)
assert_size_stride(buf202, (128, 168, 28, 28), (131712, 784, 28, 1))
buf203 = empty_strided((168, 1, 5, 5), (25, 25, 5, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_79.run(primals_145, buf203, 4200, grid=grid(4200), stream=stream0)
del primals_145
buf204 = buf187; del buf187 # reuse
triton_poi_fused_split_with_sizes_80.run(buf199, buf204, 16859136, grid=grid(16859136), stream=stream0)
buf205 = extern_kernels.convolution(buf204, buf203, stride=(1, 1), padding=(2, 2), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=168, bias=None)
assert_size_stride(buf205, (128, 168, 28, 28), (131712, 784, 28, 1))
buf208 = buf199; del buf199 # reuse
buf206 = as_strided(buf208, (128, 168, 28, 28), (263424, 784, 28, 1)) # alias
triton_poi_fused_cat_74.run(buf202, buf206, 16859136, grid=grid(16859136), stream=stream0)
buf207 = as_strided(buf208, (128, 168, 28, 28), (263424, 784, 28, 1), 131712) # alias
triton_poi_fused_cat_74.run(buf205, buf207, 16859136, grid=grid(16859136), stream=stream0)
buf209 = buf195; del buf195 # reuse
buf210 = buf209; del buf209 # reuse
buf213 = empty_strided((336, ), (1, ), device='cuda', dtype=torch.float32)
buf211 = empty_strided((1, 336, 1, 1), (336, 1, 336, 336), device='cuda', dtype=torch.float32)
buf212 = empty_strided((336, ), (1, ), device='cuda', dtype=torch.float32)
buf214 = empty_strided((336, ), (1, ), device='cuda', dtype=torch.float32)
triton_red_fused__native_batch_norm_legit_functional_75.run(buf210, buf208, primals_346, primals_347, buf213, buf211, buf212, buf214, 336, 100352, grid=grid(336), stream=stream0)
del primals_346
del primals_347
buf215 = empty_strided((128, 336, 28, 28), (263424, 784, 28, 1), device='cuda', dtype=torch.float16)
buf217 = empty_strided((128, 336, 1, 1), (336, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_per_fused__native_batch_norm_legit_functional_mean_silu_81.run(buf208, buf210, buf211, primals_27, primals_28, buf215, buf217, 43008, 784, grid=grid(43008), stream=stream0)
del primals_28
buf218 = empty_strided((28, 336, 1, 1), (336, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_82.run(primals_146, buf218, 9408, grid=grid(9408), stream=stream0)
del primals_146
buf219 = empty_strided((28, ), (1, ), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_convolution_83.run(primals_147, buf219, 28, grid=grid(28), stream=stream0)
del primals_147
buf220 = extern_kernels.convolution(buf217, buf218, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None)
assert_size_stride(buf220, (128, 28, 1, 1), (28, 1, 1, 1))
buf221 = buf220; del buf220 # reuse
buf222 = empty_strided((128, 28, 1, 1), (28, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_convolution_silu_84.run(buf221, buf219, buf222, 3584, grid=grid(3584), stream=stream0)
buf223 = empty_strided((336, 28, 1, 1), (28, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_85.run(primals_148, buf223, 9408, grid=grid(9408), stream=stream0)
del primals_148
buf224 = empty_strided((336, ), (1, ), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_convolution_86.run(primals_149, buf224, 336, grid=grid(336), stream=stream0)
del primals_149
buf225 = extern_kernels.convolution(buf222, buf223, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None)
assert_size_stride(buf225, (128, 336, 1, 1), (336, 1, 1, 1))
buf226 = buf225; del buf225 # reuse
triton_poi_fused__to_copy_convolution_87.run(buf226, buf224, 43008, grid=grid(43008), stream=stream0)
buf227 = empty_strided((128, 336, 28, 28), (263424, 784, 28, 1), device='cuda', dtype=torch.float16)
triton_poi_fused_mul_sigmoid_silu_88.run(buf215, buf226, buf227, 33718272, grid=grid(33718272), stream=stream0)
buf228 = buf205; del buf205 # reuse
triton_poi_fused_split_with_sizes_89.run(buf227, buf228, 16859136, grid=grid(16859136), stream=stream0)
buf229 = buf202; del buf202 # reuse
triton_poi_fused_split_with_sizes_90.run(buf227, buf229, 16859136, grid=grid(16859136), stream=stream0)
buf230 = empty_strided((28, 168, 1, 1), (168, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_91.run(primals_150, buf230, 4704, grid=grid(4704), stream=stream0)
del primals_150
buf231 = extern_kernels.convolution(buf228, buf230, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None)
assert_size_stride(buf231, (128, 28, 28, 28), (21952, 784, 28, 1))
buf232 = empty_strided((28, 168, 1, 1), (168, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_91.run(primals_151, buf232, 4704, grid=grid(4704), stream=stream0)
del primals_151
buf233 = extern_kernels.convolution(buf229, buf232, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None)
assert_size_stride(buf233, (128, 28, 28, 28), (21952, 784, 28, 1))
buf236 = empty_strided((128, 56, 28, 28), (43904, 784, 28, 1), device='cuda', dtype=torch.float16)
buf234 = as_strided(buf236, (128, 28, 28, 28), (43904, 784, 28, 1)) # alias
triton_poi_fused_cat_92.run(buf231, buf234, 2809856, grid=grid(2809856), stream=stream0)
del buf231
buf235 = as_strided(buf236, (128, 28, 28, 28), (43904, 784, 28, 1), 21952) # alias
triton_poi_fused_cat_92.run(buf233, buf235, 2809856, grid=grid(2809856), stream=stream0)
del buf233
buf237 = buf180; del buf180 # reuse
triton_red_fused__native_batch_norm_legit_functional_68.run(buf236, buf237, 728, 7720, grid=grid(728), stream=stream0)
buf238 = buf181; del buf181 # reuse
buf239 = buf238; del buf238 # reuse
buf243 = empty_strided((56, ), (1, ), device='cuda', dtype=torch.float32)
triton_per_fused__native_batch_norm_legit_functional_69.run(buf239, buf237, primals_349, buf243, 56, 13, grid=grid(56), stream=stream0)
del primals_349
buf240 = buf237; del buf237 # reuse
triton_red_fused__native_batch_norm_legit_functional_70.run(buf236, buf239, buf240, 728, 7720, grid=grid(728), stream=stream0)
buf241 = empty_strided((1, 56, 1, 1), (56, 1, 56, 56), device='cuda', dtype=torch.float32)
buf242 = empty_strided((56, ), (1, ), device='cuda', dtype=torch.float32)
buf244 = empty_strided((56, ), (1, ), device='cuda', dtype=torch.float32)
triton_per_fused__native_batch_norm_legit_functional_71.run(buf240, primals_350, buf241, buf242, buf244, 56, 13, grid=grid(56), stream=stream0)
del primals_350
buf245 = empty_strided((128, 56, 28, 28), (43904, 784, 28, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__native_batch_norm_legit_functional_add_93.run(buf236, buf239, buf241, primals_29, primals_30, buf185, buf245, 5619712, grid=grid(5619712), stream=stream0)
del primals_30
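# Next block of the stage (a MixNet-style pattern, hedged reading): grouped
# pointwise expansion, where each 28-channel half of buf245 runs through its
# own 1x1 conv to 168 channels and the halves are concatenated into the
# 336-channel buf252.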
buf246 = empty_strided((168, 28, 1, 1), (28, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_73.run(primals_152, buf246, 4704, grid=grid(4704), stream=stream0)
del primals_152
buf247 = extern_kernels.convolution(as_strided(buf245, (128, 28, 28, 28), (43904, 784, 28, 1)), buf246, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None)
assert_size_stride(buf247, (128, 168, 28, 28), (131712, 784, 28, 1))
buf248 = empty_strided((168, 28, 1, 1), (28, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_73.run(primals_153, buf248, 4704, grid=grid(4704), stream=stream0)
del primals_153
buf249 = extern_kernels.convolution(as_strided(buf245, (128, 28, 28, 28), (43904, 784, 28, 1), 21952), buf248, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None)
assert_size_stride(buf249, (128, 168, 28, 28), (131712, 784, 28, 1))
buf252 = buf227; del buf227 # reuse
buf250 = as_strided(buf252, (128, 168, 28, 28), (263424, 784, 28, 1)) # alias
triton_poi_fused_cat_74.run(buf247, buf250, 16859136, grid=grid(16859136), stream=stream0)
buf251 = as_strided(buf252, (128, 168, 28, 28), (263424, 784, 28, 1), 131712) # alias
triton_poi_fused_cat_74.run(buf249, buf251, 16859136, grid=grid(16859136), stream=stream0)
buf253 = buf211; del buf211 # reuse
buf254 = buf253; del buf253 # reuse
buf257 = empty_strided((336, ), (1, ), device='cuda', dtype=torch.float32)
buf255 = empty_strided((1, 336, 1, 1), (336, 1, 336, 336), device='cuda', dtype=torch.float32)
buf256 = empty_strided((336, ), (1, ), device='cuda', dtype=torch.float32)
buf258 = empty_strided((336, ), (1, ), device='cuda', dtype=torch.float32)
triton_red_fused__native_batch_norm_legit_functional_75.run(buf254, buf252, primals_352, primals_353, buf257, buf255, buf256, buf258, 336, 100352, grid=grid(336), stream=stream0)
del primals_352
del primals_353
buf259 = empty_strided((128, 336, 28, 28), (263424, 784, 28, 1), device='cuda', dtype=torch.float16)
buf1148 = empty_strided((128, 336, 28, 28), (263424, 784, 28, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__native_batch_norm_legit_functional_add_clone_fill_mul_sigmoid_sub_76.run(buf252, buf254, buf255, primals_31, primals_32, buf259, buf1148, 33718272, grid=grid(33718272), stream=stream0)
del primals_32
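# Mixed depthwise convolution: the 336 channels are split in two, with a 3x3
# depthwise conv over the first 168 channels and a 5x5 over the second (both
# groups=168), then re-concatenated. The buf1148 clone written above appears
# to be stashed for the backward pass.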
buf260 = empty_strided((168, 1, 3, 3), (9, 9, 3, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_77.run(primals_154, buf260, 1512, grid=grid(1512), stream=stream0)
del primals_154
buf261 = buf249; del buf249 # reuse
triton_poi_fused_split_with_sizes_78.run(buf259, buf261, 16859136, grid=grid(16859136), stream=stream0)
buf262 = extern_kernels.convolution(buf261, buf260, stride=(1, 1), padding=(1, 1), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=168, bias=None)
assert_size_stride(buf262, (128, 168, 28, 28), (131712, 784, 28, 1))
buf263 = empty_strided((168, 1, 5, 5), (25, 25, 5, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_79.run(primals_155, buf263, 4200, grid=grid(4200), stream=stream0)
del primals_155
buf264 = buf247; del buf247 # reuse
triton_poi_fused_split_with_sizes_80.run(buf259, buf264, 16859136, grid=grid(16859136), stream=stream0)
buf265 = extern_kernels.convolution(buf264, buf263, stride=(1, 1), padding=(2, 2), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=168, bias=None)
assert_size_stride(buf265, (128, 168, 28, 28), (131712, 784, 28, 1))
buf268 = buf259; del buf259 # reuse
buf266 = as_strided(buf268, (128, 168, 28, 28), (263424, 784, 28, 1)) # alias
triton_poi_fused_cat_74.run(buf262, buf266, 16859136, grid=grid(16859136), stream=stream0)
buf267 = as_strided(buf268, (128, 168, 28, 28), (263424, 784, 28, 1), 131712) # alias
triton_poi_fused_cat_74.run(buf265, buf267, 16859136, grid=grid(16859136), stream=stream0)
buf269 = buf255; del buf255 # reuse
buf270 = buf269; del buf269 # reuse
buf273 = empty_strided((336, ), (1, ), device='cuda', dtype=torch.float32)
buf271 = empty_strided((1, 336, 1, 1), (336, 1, 336, 336), device='cuda', dtype=torch.float32)
buf272 = empty_strided((336, ), (1, ), device='cuda', dtype=torch.float32)
buf274 = empty_strided((336, ), (1, ), device='cuda', dtype=torch.float32)
triton_red_fused__native_batch_norm_legit_functional_75.run(buf270, buf268, primals_355, primals_356, buf273, buf271, buf272, buf274, 336, 100352, grid=grid(336), stream=stream0)
del primals_355
del primals_356
buf275 = empty_strided((128, 336, 28, 28), (263424, 784, 28, 1), device='cuda', dtype=torch.float16)
buf277 = empty_strided((128, 336, 1, 1), (336, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_per_fused__native_batch_norm_legit_functional_mean_silu_81.run(buf268, buf270, buf271, primals_33, primals_34, buf275, buf277, 43008, 784, grid=grid(43008), stream=stream0)
del primals_34
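# Squeeze-and-excitation: buf277 is the global average pool of the activated
# features; a 336 -> 28 1x1 conv + bias + SiLU (buf282) followed by a
# 28 -> 336 1x1 conv + bias (buf286) produces the excitation used by the
# gating kernel further down. Roughly, in eager terms (hypothetical names):
#   s = x.mean(dim=(2, 3), keepdim=True)
#   x = x * torch.sigmoid(conv_expand(F.silu(conv_reduce(s))))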
buf278 = empty_strided((28, 336, 1, 1), (336, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_82.run(primals_156, buf278, 9408, grid=grid(9408), stream=stream0)
del primals_156
buf279 = buf219; del buf219 # reuse
triton_poi_fused__to_copy_convolution_83.run(primals_157, buf279, 28, grid=grid(28), stream=stream0)
del primals_157
buf280 = extern_kernels.convolution(buf277, buf278, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None)
assert_size_stride(buf280, (128, 28, 1, 1), (28, 1, 1, 1))
buf281 = buf280; del buf280 # reuse
buf282 = empty_strided((128, 28, 1, 1), (28, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_convolution_silu_84.run(buf281, buf279, buf282, 3584, grid=grid(3584), stream=stream0)
buf283 = empty_strided((336, 28, 1, 1), (28, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_85.run(primals_158, buf283, 9408, grid=grid(9408), stream=stream0)
del primals_158
buf284 = buf224; del buf224 # reuse
triton_poi_fused__to_copy_convolution_86.run(primals_159, buf284, 336, grid=grid(336), stream=stream0)
del primals_159
buf285 = extern_kernels.convolution(buf282, buf283, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None)
assert_size_stride(buf285, (128, 336, 1, 1), (336, 1, 1, 1))
buf286 = buf285; del buf285 # reuse
triton_poi_fused__to_copy_convolution_87.run(buf286, buf284, 43008, grid=grid(43008), stream=stream0)
buf287 = empty_strided((128, 336, 28, 28), (263424, 784, 28, 1), device='cuda', dtype=torch.float16)
triton_poi_fused_mul_sigmoid_silu_88.run(buf275, buf286, buf287, 33718272, grid=grid(33718272), stream=stream0)
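# The projection below closes out this block (residual into buf305); the next
# 28x28 block (primals_162 onward) repeats the same expand / mixed-depthwise /
# squeeze-excite / project sequence, ending in buf365.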
buf288 = buf265; del buf265 # reuse
triton_poi_fused_split_with_sizes_89.run(buf287, buf288, 16859136, grid=grid(16859136), stream=stream0)
buf289 = buf262; del buf262 # reuse
triton_poi_fused_split_with_sizes_90.run(buf287, buf289, 16859136, grid=grid(16859136), stream=stream0)
buf290 = empty_strided((28, 168, 1, 1), (168, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_91.run(primals_160, buf290, 4704, grid=grid(4704), stream=stream0)
del primals_160
buf291 = extern_kernels.convolution(buf288, buf290, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None)
assert_size_stride(buf291, (128, 28, 28, 28), (21952, 784, 28, 1))
buf292 = empty_strided((28, 168, 1, 1), (168, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_91.run(primals_161, buf292, 4704, grid=grid(4704), stream=stream0)
del primals_161
buf293 = extern_kernels.convolution(buf289, buf292, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None)
assert_size_stride(buf293, (128, 28, 28, 28), (21952, 784, 28, 1))
buf296 = empty_strided((128, 56, 28, 28), (43904, 784, 28, 1), device='cuda', dtype=torch.float16)
buf294 = as_strided(buf296, (128, 28, 28, 28), (43904, 784, 28, 1)) # alias
triton_poi_fused_cat_92.run(buf291, buf294, 2809856, grid=grid(2809856), stream=stream0)
del buf291
buf295 = as_strided(buf296, (128, 28, 28, 28), (43904, 784, 28, 1), 21952) # alias
triton_poi_fused_cat_92.run(buf293, buf295, 2809856, grid=grid(2809856), stream=stream0)
del buf293
buf297 = buf240; del buf240 # reuse
triton_red_fused__native_batch_norm_legit_functional_68.run(buf296, buf297, 728, 7720, grid=grid(728), stream=stream0)
buf298 = buf241; del buf241 # reuse
buf299 = buf298; del buf298 # reuse
buf303 = empty_strided((56, ), (1, ), device='cuda', dtype=torch.float32)
triton_per_fused__native_batch_norm_legit_functional_69.run(buf299, buf297, primals_358, buf303, 56, 13, grid=grid(56), stream=stream0)
del primals_358
buf300 = buf297; del buf297 # reuse
triton_red_fused__native_batch_norm_legit_functional_70.run(buf296, buf299, buf300, 728, 7720, grid=grid(728), stream=stream0)
buf301 = empty_strided((1, 56, 1, 1), (56, 1, 56, 56), device='cuda', dtype=torch.float32)
buf302 = empty_strided((56, ), (1, ), device='cuda', dtype=torch.float32)
buf304 = empty_strided((56, ), (1, ), device='cuda', dtype=torch.float32)
triton_per_fused__native_batch_norm_legit_functional_71.run(buf300, primals_359, buf301, buf302, buf304, 56, 13, grid=grid(56), stream=stream0)
del primals_359
buf305 = empty_strided((128, 56, 28, 28), (43904, 784, 28, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__native_batch_norm_legit_functional_add_93.run(buf296, buf299, buf301, primals_35, primals_36, buf245, buf305, 5619712, grid=grid(5619712), stream=stream0)
del primals_36
buf306 = empty_strided((168, 28, 1, 1), (28, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_73.run(primals_162, buf306, 4704, grid=grid(4704), stream=stream0)
del primals_162
buf307 = extern_kernels.convolution(as_strided(buf305, (128, 28, 28, 28), (43904, 784, 28, 1)), buf306, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None)
assert_size_stride(buf307, (128, 168, 28, 28), (131712, 784, 28, 1))
buf308 = empty_strided((168, 28, 1, 1), (28, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_73.run(primals_163, buf308, 4704, grid=grid(4704), stream=stream0)
del primals_163
buf309 = extern_kernels.convolution(as_strided(buf305, (128, 28, 28, 28), (43904, 784, 28, 1), 21952), buf308, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None)
assert_size_stride(buf309, (128, 168, 28, 28), (131712, 784, 28, 1))
buf312 = buf287; del buf287 # reuse
buf310 = as_strided(buf312, (128, 168, 28, 28), (263424, 784, 28, 1)) # alias
triton_poi_fused_cat_74.run(buf307, buf310, 16859136, grid=grid(16859136), stream=stream0)
buf311 = as_strided(buf312, (128, 168, 28, 28), (263424, 784, 28, 1), 131712) # alias
triton_poi_fused_cat_74.run(buf309, buf311, 16859136, grid=grid(16859136), stream=stream0)
buf313 = buf271; del buf271 # reuse
buf314 = buf313; del buf313 # reuse
buf317 = empty_strided((336, ), (1, ), device='cuda', dtype=torch.float32)
buf315 = empty_strided((1, 336, 1, 1), (336, 1, 336, 336), device='cuda', dtype=torch.float32)
buf316 = empty_strided((336, ), (1, ), device='cuda', dtype=torch.float32)
buf318 = empty_strided((336, ), (1, ), device='cuda', dtype=torch.float32)
triton_red_fused__native_batch_norm_legit_functional_75.run(buf314, buf312, primals_361, primals_362, buf317, buf315, buf316, buf318, 336, 100352, grid=grid(336), stream=stream0)
del primals_361
del primals_362
buf319 = empty_strided((128, 336, 28, 28), (263424, 784, 28, 1), device='cuda', dtype=torch.float16)
buf1146 = empty_strided((128, 336, 28, 28), (263424, 784, 28, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__native_batch_norm_legit_functional_add_clone_fill_mul_sigmoid_sub_76.run(buf312, buf314, buf315, primals_37, primals_38, buf319, buf1146, 33718272, grid=grid(33718272), stream=stream0)
del primals_38
buf320 = empty_strided((168, 1, 3, 3), (9, 9, 3, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_77.run(primals_164, buf320, 1512, grid=grid(1512), stream=stream0)
del primals_164
buf321 = buf309; del buf309 # reuse
triton_poi_fused_split_with_sizes_78.run(buf319, buf321, 16859136, grid=grid(16859136), stream=stream0)
buf322 = extern_kernels.convolution(buf321, buf320, stride=(1, 1), padding=(1, 1), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=168, bias=None)
assert_size_stride(buf322, (128, 168, 28, 28), (131712, 784, 28, 1))
buf323 = empty_strided((168, 1, 5, 5), (25, 25, 5, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_79.run(primals_165, buf323, 4200, grid=grid(4200), stream=stream0)
del primals_165
buf324 = buf307; del buf307 # reuse
triton_poi_fused_split_with_sizes_80.run(buf319, buf324, 16859136, grid=grid(16859136), stream=stream0)
buf325 = extern_kernels.convolution(buf324, buf323, stride=(1, 1), padding=(2, 2), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=168, bias=None)
assert_size_stride(buf325, (128, 168, 28, 28), (131712, 784, 28, 1))
buf328 = buf319; del buf319 # reuse
buf326 = as_strided(buf328, (128, 168, 28, 28), (263424, 784, 28, 1)) # alias
triton_poi_fused_cat_74.run(buf322, buf326, 16859136, grid=grid(16859136), stream=stream0)
buf327 = as_strided(buf328, (128, 168, 28, 28), (263424, 784, 28, 1), 131712) # alias
triton_poi_fused_cat_74.run(buf325, buf327, 16859136, grid=grid(16859136), stream=stream0)
buf329 = buf315; del buf315 # reuse
buf330 = buf329; del buf329 # reuse
buf333 = empty_strided((336, ), (1, ), device='cuda', dtype=torch.float32)
buf331 = empty_strided((1, 336, 1, 1), (336, 1, 336, 336), device='cuda', dtype=torch.float32)
buf332 = empty_strided((336, ), (1, ), device='cuda', dtype=torch.float32)
buf334 = empty_strided((336, ), (1, ), device='cuda', dtype=torch.float32)
triton_red_fused__native_batch_norm_legit_functional_75.run(buf330, buf328, primals_364, primals_365, buf333, buf331, buf332, buf334, 336, 100352, grid=grid(336), stream=stream0)
del primals_364
del primals_365
buf335 = empty_strided((128, 336, 28, 28), (263424, 784, 28, 1), device='cuda', dtype=torch.float16)
buf337 = empty_strided((128, 336, 1, 1), (336, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_per_fused__native_batch_norm_legit_functional_mean_silu_81.run(buf328, buf330, buf331, primals_39, primals_40, buf335, buf337, 43008, 784, grid=grid(43008), stream=stream0)
del primals_40
buf338 = empty_strided((28, 336, 1, 1), (336, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_82.run(primals_166, buf338, 9408, grid=grid(9408), stream=stream0)
del primals_166
buf339 = buf279; del buf279 # reuse
triton_poi_fused__to_copy_convolution_83.run(primals_167, buf339, 28, grid=grid(28), stream=stream0)
del primals_167
buf340 = extern_kernels.convolution(buf337, buf338, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None)
assert_size_stride(buf340, (128, 28, 1, 1), (28, 1, 1, 1))
buf341 = buf340; del buf340 # reuse
buf342 = empty_strided((128, 28, 1, 1), (28, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_convolution_silu_84.run(buf341, buf339, buf342, 3584, grid=grid(3584), stream=stream0)
del buf339
buf343 = empty_strided((336, 28, 1, 1), (28, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_85.run(primals_168, buf343, 9408, grid=grid(9408), stream=stream0)
del primals_168
buf344 = buf284; del buf284 # reuse
triton_poi_fused__to_copy_convolution_86.run(primals_169, buf344, 336, grid=grid(336), stream=stream0)
del primals_169
buf345 = extern_kernels.convolution(buf342, buf343, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None)
assert_size_stride(buf345, (128, 336, 1, 1), (336, 1, 1, 1))
buf346 = buf345; del buf345 # reuse
triton_poi_fused__to_copy_convolution_87.run(buf346, buf344, 43008, grid=grid(43008), stream=stream0)
buf347 = empty_strided((128, 336, 28, 28), (263424, 784, 28, 1), device='cuda', dtype=torch.float16)
triton_poi_fused_mul_sigmoid_silu_88.run(buf335, buf346, buf347, 33718272, grid=grid(33718272), stream=stream0)
buf348 = buf325; del buf325 # reuse
triton_poi_fused_split_with_sizes_89.run(buf347, buf348, 16859136, grid=grid(16859136), stream=stream0)
buf349 = buf322; del buf322 # reuse
triton_poi_fused_split_with_sizes_90.run(buf347, buf349, 16859136, grid=grid(16859136), stream=stream0)
buf350 = empty_strided((28, 168, 1, 1), (168, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_91.run(primals_170, buf350, 4704, grid=grid(4704), stream=stream0)
del primals_170
buf351 = extern_kernels.convolution(buf348, buf350, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None)
assert_size_stride(buf351, (128, 28, 28, 28), (21952, 784, 28, 1))
buf352 = empty_strided((28, 168, 1, 1), (168, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_91.run(primals_171, buf352, 4704, grid=grid(4704), stream=stream0)
del primals_171
buf353 = extern_kernels.convolution(buf349, buf352, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None)
assert_size_stride(buf353, (128, 28, 28, 28), (21952, 784, 28, 1))
buf356 = empty_strided((128, 56, 28, 28), (43904, 784, 28, 1), device='cuda', dtype=torch.float16)
buf354 = as_strided(buf356, (128, 28, 28, 28), (43904, 784, 28, 1)) # alias
triton_poi_fused_cat_92.run(buf351, buf354, 2809856, grid=grid(2809856), stream=stream0)
del buf351
buf355 = as_strided(buf356, (128, 28, 28, 28), (43904, 784, 28, 1), 21952) # alias
triton_poi_fused_cat_92.run(buf353, buf355, 2809856, grid=grid(2809856), stream=stream0)
del buf353
buf357 = buf300; del buf300 # reuse
triton_red_fused__native_batch_norm_legit_functional_68.run(buf356, buf357, 728, 7720, grid=grid(728), stream=stream0)
buf358 = buf301; del buf301 # reuse
buf359 = buf358; del buf358 # reuse
buf363 = empty_strided((56, ), (1, ), device='cuda', dtype=torch.float32)
triton_per_fused__native_batch_norm_legit_functional_69.run(buf359, buf357, primals_367, buf363, 56, 13, grid=grid(56), stream=stream0)
del primals_367
buf360 = buf357; del buf357 # reuse
triton_red_fused__native_batch_norm_legit_functional_70.run(buf356, buf359, buf360, 728, 7720, grid=grid(728), stream=stream0)
buf361 = empty_strided((1, 56, 1, 1), (56, 1, 56, 56), device='cuda', dtype=torch.float32)
buf362 = empty_strided((56, ), (1, ), device='cuda', dtype=torch.float32)
buf364 = empty_strided((56, ), (1, ), device='cuda', dtype=torch.float32)
triton_per_fused__native_batch_norm_legit_functional_71.run(buf360, primals_368, buf361, buf362, buf364, 56, 13, grid=grid(56), stream=stream0)
del buf360
del primals_368
buf365 = empty_strided((128, 56, 28, 28), (43904, 784, 28, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__native_batch_norm_legit_functional_add_93.run(buf356, buf359, buf361, primals_41, primals_42, buf305, buf365, 5619712, grid=grid(5619712), stream=stream0)
del buf361
del primals_42
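# Stage transition: a full (non-split) 1x1 conv expands the 56-channel trunk
# to 336 channels ahead of the stride-2 downsample below.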
buf366 = empty_strided((336, 56, 1, 1), (56, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_94.run(primals_172, buf366, 18816, grid=grid(18816), stream=stream0)
del primals_172
buf367 = extern_kernels.convolution(buf365, buf366, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None)
assert_size_stride(buf367, (128, 336, 28, 28), (263424, 784, 28, 1))
buf368 = buf331; del buf331 # reuse
buf369 = buf368; del buf368 # reuse
buf372 = empty_strided((336, ), (1, ), device='cuda', dtype=torch.float32)
buf370 = empty_strided((1, 336, 1, 1), (336, 1, 336, 336), device='cuda', dtype=torch.float32)
buf371 = empty_strided((336, ), (1, ), device='cuda', dtype=torch.float32)
buf373 = empty_strided((336, ), (1, ), device='cuda', dtype=torch.float32)
triton_red_fused__native_batch_norm_legit_functional_75.run(buf369, buf367, primals_370, primals_371, buf372, buf370, buf371, buf373, 336, 100352, grid=grid(336), stream=stream0)
del primals_370
del primals_371
buf374 = buf347; del buf347 # reuse
buf1144 = empty_strided((128, 336, 28, 28), (263424, 784, 28, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__native_batch_norm_legit_functional_add_clone_fill_mul_sigmoid_sub_76.run(buf367, buf369, buf370, primals_43, primals_44, buf374, buf1144, 33718272, grid=grid(33718272), stream=stream0)
del primals_44
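# Stride-2 mixed depthwise downsample: the 336 channels are split into three
# 112-channel groups processed by 3x3, 5x5 and 7x7 depthwise convs with
# stride=(2, 2), taking the feature map from 28x28 to 14x14. Roughly
# (hypothetical names):
#   xs = x.split(112, dim=1)
#   out = torch.cat([dw3(xs[0]), dw5(xs[1]), dw7(xs[2])], dim=1)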
buf375 = empty_strided((112, 1, 3, 3), (9, 9, 3, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_95.run(primals_173, buf375, 1008, grid=grid(1008), stream=stream0)
del primals_173
buf376 = empty_strided((128, 112, 28, 28), (87808, 784, 28, 1), device='cuda', dtype=torch.float16)
triton_poi_fused_split_with_sizes_96.run(buf374, buf376, 11239424, grid=grid(11239424), stream=stream0)
buf377 = extern_kernels.convolution(buf376, buf375, stride=(2, 2), padding=(1, 1), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=112, bias=None)
assert_size_stride(buf377, (128, 112, 14, 14), (21952, 196, 14, 1))
buf378 = empty_strided((112, 1, 5, 5), (25, 25, 5, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_97.run(primals_174, buf378, 2800, grid=grid(2800), stream=stream0)
del primals_174
buf379 = empty_strided((128, 112, 28, 28), (87808, 784, 28, 1), device='cuda', dtype=torch.float16)
triton_poi_fused_split_with_sizes_98.run(buf374, buf379, 11239424, grid=grid(11239424), stream=stream0)
buf380 = extern_kernels.convolution(buf379, buf378, stride=(2, 2), padding=(2, 2), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=112, bias=None)
assert_size_stride(buf380, (128, 112, 14, 14), (21952, 196, 14, 1))
buf381 = empty_strided((112, 1, 7, 7), (49, 49, 7, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_99.run(primals_175, buf381, 5488, grid=grid(5488), stream=stream0)
del primals_175
buf382 = empty_strided((128, 112, 28, 28), (87808, 784, 28, 1), device='cuda', dtype=torch.float16)
triton_poi_fused_split_with_sizes_100.run(buf374, buf382, 11239424, grid=grid(11239424), stream=stream0)
del buf374
buf383 = extern_kernels.convolution(buf382, buf381, stride=(2, 2), padding=(3, 3), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=112, bias=None)
assert_size_stride(buf383, (128, 112, 14, 14), (21952, 196, 14, 1))
buf387 = empty_strided((128, 336, 14, 14), (65856, 196, 14, 1), device='cuda', dtype=torch.float16)
buf384 = as_strided(buf387, (128, 112, 14, 14), (65856, 196, 14, 1)) # alias
triton_poi_fused_cat_101.run(buf377, buf384, 2809856, grid=grid(2809856), stream=stream0)
del buf377
buf385 = as_strided(buf387, (128, 112, 14, 14), (65856, 196, 14, 1), 21952) # alias
triton_poi_fused_cat_101.run(buf380, buf385, 2809856, grid=grid(2809856), stream=stream0)
del buf380
buf386 = as_strided(buf387, (128, 112, 14, 14), (65856, 196, 14, 1), 43904) # alias
triton_poi_fused_cat_101.run(buf383, buf386, 2809856, grid=grid(2809856), stream=stream0)
del buf383
buf388 = buf370; del buf370 # reuse
buf389 = buf388; del buf388 # reuse
buf392 = empty_strided((336, ), (1, ), device='cuda', dtype=torch.float32)
buf390 = empty_strided((1, 336, 1, 1), (336, 1, 336, 336), device='cuda', dtype=torch.float32)
buf391 = empty_strided((336, ), (1, ), device='cuda', dtype=torch.float32)
buf393 = empty_strided((336, ), (1, ), device='cuda', dtype=torch.float32)
triton_red_fused__native_batch_norm_legit_functional_102.run(buf389, buf387, primals_373, primals_374, buf392, buf390, buf391, buf393, 336, 25088, grid=grid(336), stream=stream0)
del primals_373
del primals_374
buf394 = empty_strided((128, 336, 14, 14), (65856, 196, 14, 1), device='cuda', dtype=torch.float16)
buf396 = empty_strided((128, 336, 1, 1), (336, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_per_fused__native_batch_norm_legit_functional_mean_silu_103.run(buf387, buf389, buf390, primals_45, primals_46, buf394, buf396, 43008, 196, grid=grid(43008), stream=stream0)
del buf390
del primals_46
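# Squeeze-excite for the downsample block, with a tighter reduction
# (336 -> 14 -> 336) than the 336 -> 28 -> 336 used in the earlier blocks.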
buf397 = empty_strided((14, 336, 1, 1), (336, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_104.run(primals_176, buf397, 4704, grid=grid(4704), stream=stream0)
del primals_176
buf398 = empty_strided((14, ), (1, ), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_convolution_105.run(primals_177, buf398, 14, grid=grid(14), stream=stream0)
del primals_177
buf399 = extern_kernels.convolution(buf396, buf397, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None)
assert_size_stride(buf399, (128, 14, 1, 1), (14, 1, 1, 1))
buf400 = buf399; del buf399 # reuse
buf401 = empty_strided((128, 14, 1, 1), (14, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_convolution_silu_106.run(buf400, buf398, buf401, 1792, grid=grid(1792), stream=stream0)
del buf398
buf402 = empty_strided((336, 14, 1, 1), (14, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_107.run(primals_178, buf402, 4704, grid=grid(4704), stream=stream0)
del primals_178
buf403 = buf344; del buf344 # reuse
triton_poi_fused__to_copy_convolution_86.run(primals_179, buf403, 336, grid=grid(336), stream=stream0)
del primals_179
buf404 = extern_kernels.convolution(buf401, buf402, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None)
assert_size_stride(buf404, (128, 336, 1, 1), (336, 1, 1, 1))
buf405 = buf404; del buf404 # reuse
triton_poi_fused__to_copy_convolution_87.run(buf405, buf403, 43008, grid=grid(43008), stream=stream0)
del buf403
buf406 = empty_strided((128, 336, 14, 14), (65856, 196, 14, 1), device='cuda', dtype=torch.float16)
triton_poi_fused_mul_sigmoid_silu_108.run(buf394, buf405, buf406, 8429568, grid=grid(8429568), stream=stream0)
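# Pointwise projection 336 -> 104 with plain batch norm; no residual is added
# here, consistent with the spatial resolution having just changed.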
buf407 = empty_strided((104, 336, 1, 1), (336, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_109.run(primals_180, buf407, 34944, grid=grid(34944), stream=stream0)
del primals_180
buf408 = extern_kernels.convolution(buf406, buf407, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None)
assert_size_stride(buf408, (128, 104, 14, 14), (20384, 196, 14, 1))
buf409 = empty_strided((1, 104, 1, 1, 4), (416, 1, 416, 416, 104), device='cuda', dtype=torch.float32)
triton_red_fused__native_batch_norm_legit_functional_110.run(buf408, buf409, 416, 6272, grid=grid(416), stream=stream0)
buf410 = empty_strided((1, 104, 1, 1), (104, 1, 104, 104), device='cuda', dtype=torch.float32)
buf411 = buf410; del buf410 # reuse
buf415 = empty_strided((104, ), (1, ), device='cuda', dtype=torch.float32)
triton_per_fused__native_batch_norm_legit_functional_111.run(buf411, buf409, primals_376, buf415, 104, 4, grid=grid(104), stream=stream0)
del primals_376
buf412 = buf409; del buf409 # reuse
triton_red_fused__native_batch_norm_legit_functional_112.run(buf408, buf411, buf412, 416, 6272, grid=grid(416), stream=stream0)
buf413 = empty_strided((1, 104, 1, 1), (104, 1, 104, 104), device='cuda', dtype=torch.float32)
buf414 = empty_strided((104, ), (1, ), device='cuda', dtype=torch.float32)
buf416 = empty_strided((104, ), (1, ), device='cuda', dtype=torch.float32)
triton_per_fused__native_batch_norm_legit_functional_113.run(buf412, primals_377, buf413, buf414, buf416, 104, 4, grid=grid(104), stream=stream0)
del primals_377
buf417 = empty_strided((128, 104, 14, 14), (20384, 196, 14, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__native_batch_norm_legit_functional_114.run(buf408, buf411, buf413, primals_47, primals_48, buf417, 2609152, grid=grid(2609152), stream=stream0)
del primals_48
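# New 14x14 stage: grouped pointwise expansion of the 104-channel trunk via
# two 52 -> 312 1x1 convs whose outputs are concatenated into 624 channels.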
buf418 = empty_strided((312, 52, 1, 1), (52, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_115.run(primals_181, buf418, 16224, grid=grid(16224), stream=stream0)
del primals_181
buf419 = extern_kernels.convolution(as_strided(buf417, (128, 52, 14, 14), (20384, 196, 14, 1)), buf418, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None)
assert_size_stride(buf419, (128, 312, 14, 14), (61152, 196, 14, 1))
buf420 = empty_strided((312, 52, 1, 1), (52, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_115.run(primals_182, buf420, 16224, grid=grid(16224), stream=stream0)
del primals_182
buf421 = extern_kernels.convolution(as_strided(buf417, (128, 52, 14, 14), (20384, 196, 14, 1), 10192), buf420, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None)
assert_size_stride(buf421, (128, 312, 14, 14), (61152, 196, 14, 1))
buf424 = empty_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda', dtype=torch.float16)
buf422 = as_strided(buf424, (128, 312, 14, 14), (122304, 196, 14, 1)) # alias
triton_poi_fused_cat_116.run(buf419, buf422, 7827456, grid=grid(7827456), stream=stream0)
buf423 = as_strided(buf424, (128, 312, 14, 14), (122304, 196, 14, 1), 61152) # alias
triton_poi_fused_cat_116.run(buf421, buf423, 7827456, grid=grid(7827456), stream=stream0)
buf425 = empty_strided((1, 624, 1, 1), (624, 1, 624, 624), device='cuda', dtype=torch.float32)
buf426 = buf425; del buf425 # reuse
buf429 = empty_strided((624, ), (1, ), device='cuda', dtype=torch.float32)
buf427 = empty_strided((1, 624, 1, 1), (624, 1, 624, 624), device='cuda', dtype=torch.float32)
buf428 = empty_strided((624, ), (1, ), device='cuda', dtype=torch.float32)
buf430 = empty_strided((624, ), (1, ), device='cuda', dtype=torch.float32)
triton_red_fused__native_batch_norm_legit_functional_117.run(buf426, buf424, primals_379, primals_380, buf429, buf427, buf428, buf430, 624, 25088, grid=grid(624), stream=stream0)
del primals_379
del primals_380
buf431 = empty_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda', dtype=torch.float16)
buf1142 = empty_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__native_batch_norm_legit_functional_add_clone_fill_mul_sigmoid_sub_118.run(buf424, buf426, buf427, primals_49, primals_50, buf431, buf1142, 15654912, grid=grid(15654912), stream=stream0)
del primals_50
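# Four-way mixed depthwise convolution: the 624 channels are split into four
# 156-channel groups with 3x3, 5x5, 7x7 and 9x9 depthwise kernels (all
# groups=156), then re-concatenated into buf448.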
buf432 = empty_strided((156, 1, 3, 3), (9, 9, 3, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_119.run(primals_183, buf432, 1404, grid=grid(1404), stream=stream0)
del primals_183
buf433 = empty_strided((128, 156, 14, 14), (30576, 196, 14, 1), device='cuda', dtype=torch.float16)
triton_poi_fused_split_with_sizes_120.run(buf431, buf433, 3913728, grid=grid(3913728), stream=stream0)
buf434 = extern_kernels.convolution(buf433, buf432, stride=(1, 1), padding=(1, 1), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=156, bias=None)
assert_size_stride(buf434, (128, 156, 14, 14), (30576, 196, 14, 1))
buf435 = empty_strided((156, 1, 5, 5), (25, 25, 5, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_121.run(primals_184, buf435, 3900, grid=grid(3900), stream=stream0)
del primals_184
buf436 = empty_strided((128, 156, 14, 14), (30576, 196, 14, 1), device='cuda', dtype=torch.float16)
triton_poi_fused_split_with_sizes_122.run(buf431, buf436, 3913728, grid=grid(3913728), stream=stream0)
buf437 = extern_kernels.convolution(buf436, buf435, stride=(1, 1), padding=(2, 2), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=156, bias=None)
assert_size_stride(buf437, (128, 156, 14, 14), (30576, 196, 14, 1))
buf438 = empty_strided((156, 1, 7, 7), (49, 49, 7, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_123.run(primals_185, buf438, 7644, grid=grid(7644), stream=stream0)
del primals_185
buf439 = empty_strided((128, 156, 14, 14), (30576, 196, 14, 1), device='cuda', dtype=torch.float16)
triton_poi_fused_split_with_sizes_124.run(buf431, buf439, 3913728, grid=grid(3913728), stream=stream0)
buf440 = extern_kernels.convolution(buf439, buf438, stride=(1, 1), padding=(3, 3), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=156, bias=None)
assert_size_stride(buf440, (128, 156, 14, 14), (30576, 196, 14, 1))
buf441 = empty_strided((156, 1, 9, 9), (81, 81, 9, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_125.run(primals_186, buf441, 12636, grid=grid(12636), stream=stream0)
del primals_186
buf442 = empty_strided((128, 156, 14, 14), (30576, 196, 14, 1), device='cuda', dtype=torch.float16)
triton_poi_fused_split_with_sizes_126.run(buf431, buf442, 3913728, grid=grid(3913728), stream=stream0)
buf443 = extern_kernels.convolution(buf442, buf441, stride=(1, 1), padding=(4, 4), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=156, bias=None)
assert_size_stride(buf443, (128, 156, 14, 14), (30576, 196, 14, 1))
buf448 = buf431; del buf431 # reuse
buf444 = as_strided(buf448, (128, 156, 14, 14), (122304, 196, 14, 1)) # alias
triton_poi_fused_cat_127.run(buf434, buf444, 3913728, grid=grid(3913728), stream=stream0)
buf445 = as_strided(buf448, (128, 156, 14, 14), (122304, 196, 14, 1), 30576) # alias
triton_poi_fused_cat_127.run(buf437, buf445, 3913728, grid=grid(3913728), stream=stream0)
buf446 = as_strided(buf448, (128, 156, 14, 14), (122304, 196, 14, 1), 61152) # alias
triton_poi_fused_cat_127.run(buf440, buf446, 3913728, grid=grid(3913728), stream=stream0)
buf447 = as_strided(buf448, (128, 156, 14, 14), (122304, 196, 14, 1), 91728) # alias
triton_poi_fused_cat_127.run(buf443, buf447, 3913728, grid=grid(3913728), stream=stream0)
buf449 = buf427; del buf427 # reuse
buf450 = buf449; del buf449 # reuse
buf453 = empty_strided((624, ), (1, ), device='cuda', dtype=torch.float32)
buf451 = empty_strided((1, 624, 1, 1), (624, 1, 624, 624), device='cuda', dtype=torch.float32)
buf452 = empty_strided((624, ), (1, ), device='cuda', dtype=torch.float32)
buf454 = empty_strided((624, ), (1, ), device='cuda', dtype=torch.float32)
triton_red_fused__native_batch_norm_legit_functional_117.run(buf450, buf448, primals_382, primals_383, buf453, buf451, buf452, buf454, 624, 25088, grid=grid(624), stream=stream0)
del primals_382
del primals_383
buf455 = empty_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda', dtype=torch.float16)
buf457 = empty_strided((128, 624, 1, 1), (624, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_per_fused__native_batch_norm_legit_functional_mean_silu_128.run(buf448, buf450, buf451, primals_51, primals_52, buf455, buf457, 79872, 196, grid=grid(79872), stream=stream0)
del primals_52
buf458 = empty_strided((26, 624, 1, 1), (624, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_129.run(primals_187, buf458, 16224, grid=grid(16224), stream=stream0)
del primals_187
buf459 = empty_strided((26, ), (1, ), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_convolution_130.run(primals_188, buf459, 26, grid=grid(26), stream=stream0)
del primals_188
buf460 = extern_kernels.convolution(buf457, buf458, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None)
assert_size_stride(buf460, (128, 26, 1, 1), (26, 1, 1, 1))
buf461 = buf460; del buf460 # reuse
buf462 = empty_strided((128, 26, 1, 1), (26, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_convolution_silu_131.run(buf461, buf459, buf462, 3328, grid=grid(3328), stream=stream0)
buf463 = empty_strided((624, 26, 1, 1), (26, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_132.run(primals_189, buf463, 16224, grid=grid(16224), stream=stream0)
del primals_189
buf464 = empty_strided((624, ), (1, ), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_convolution_133.run(primals_190, buf464, 624, grid=grid(624), stream=stream0)
del primals_190
buf465 = extern_kernels.convolution(buf462, buf463, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None)
assert_size_stride(buf465, (128, 624, 1, 1), (624, 1, 1, 1))
buf466 = buf465; del buf465 # reuse
triton_poi_fused__to_copy_convolution_134.run(buf466, buf464, 79872, grid=grid(79872), stream=stream0)
buf467 = empty_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda', dtype=torch.float16)
triton_poi_fused_mul_sigmoid_silu_135.run(buf455, buf466, buf467, 15654912, grid=grid(15654912), stream=stream0)
buf468 = buf421; del buf421 # reuse
triton_poi_fused_split_with_sizes_136.run(buf467, buf468, 7827456, grid=grid(7827456), stream=stream0)
buf469 = buf419; del buf419 # reuse
triton_poi_fused_split_with_sizes_137.run(buf467, buf469, 7827456, grid=grid(7827456), stream=stream0)
buf470 = empty_strided((52, 312, 1, 1), (312, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_138.run(primals_191, buf470, 16224, grid=grid(16224), stream=stream0)
del primals_191
buf471 = extern_kernels.convolution(buf468, buf470, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None)
assert_size_stride(buf471, (128, 52, 14, 14), (10192, 196, 14, 1))
buf472 = empty_strided((52, 312, 1, 1), (312, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_138.run(primals_192, buf472, 16224, grid=grid(16224), stream=stream0)
del primals_192
buf473 = extern_kernels.convolution(buf469, buf472, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None)
assert_size_stride(buf473, (128, 52, 14, 14), (10192, 196, 14, 1))
buf476 = empty_strided((128, 104, 14, 14), (20384, 196, 14, 1), device='cuda', dtype=torch.float16)
buf474 = as_strided(buf476, (128, 52, 14, 14), (20384, 196, 14, 1)) # alias
triton_poi_fused_cat_139.run(buf471, buf474, 1304576, grid=grid(1304576), stream=stream0)
del buf471
buf475 = as_strided(buf476, (128, 52, 14, 14), (20384, 196, 14, 1), 10192) # alias
triton_poi_fused_cat_139.run(buf473, buf475, 1304576, grid=grid(1304576), stream=stream0)
del buf473
buf477 = buf412; del buf412 # reuse
triton_red_fused__native_batch_norm_legit_functional_110.run(buf476, buf477, 416, 6272, grid=grid(416), stream=stream0)
buf478 = buf413; del buf413 # reuse
buf479 = buf478; del buf478 # reuse
buf483 = empty_strided((104, ), (1, ), device='cuda', dtype=torch.float32)
triton_per_fused__native_batch_norm_legit_functional_111.run(buf479, buf477, primals_385, buf483, 104, 4, grid=grid(104), stream=stream0)
del primals_385
buf480 = buf477; del buf477 # reuse
triton_red_fused__native_batch_norm_legit_functional_112.run(buf476, buf479, buf480, 416, 6272, grid=grid(416), stream=stream0)
buf481 = empty_strided((1, 104, 1, 1), (104, 1, 104, 104), device='cuda', dtype=torch.float32)
buf482 = empty_strided((104, ), (1, ), device='cuda', dtype=torch.float32)
buf484 = empty_strided((104, ), (1, ), device='cuda', dtype=torch.float32)
triton_per_fused__native_batch_norm_legit_functional_113.run(buf480, primals_386, buf481, buf482, buf484, 104, 4, grid=grid(104), stream=stream0)
del primals_386
buf485 = empty_strided((128, 104, 14, 14), (20384, 196, 14, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__native_batch_norm_legit_functional_add_140.run(buf476, buf479, buf481, primals_53, primals_54, buf417, buf485, 2609152, grid=grid(2609152), stream=stream0)
del primals_54
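# The 624-channel expand / four-way mixed-depthwise / squeeze-excite
# (624 -> 26 -> 624) / project pattern repeats for the remaining blocks of
# this stage, residual-adding into the 104-channel trunk
# (buf417 -> buf485 -> buf553 -> ...).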
buf486 = empty_strided((312, 52, 1, 1), (52, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_115.run(primals_193, buf486, 16224, grid=grid(16224), stream=stream0)
del primals_193
buf487 = extern_kernels.convolution(as_strided(buf485, (128, 52, 14, 14), (20384, 196, 14, 1)), buf486, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None)
assert_size_stride(buf487, (128, 312, 14, 14), (61152, 196, 14, 1))
buf488 = empty_strided((312, 52, 1, 1), (52, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_115.run(primals_194, buf488, 16224, grid=grid(16224), stream=stream0)
del primals_194
buf489 = extern_kernels.convolution(as_strided(buf485, (128, 52, 14, 14), (20384, 196, 14, 1), 10192), buf488, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None)
assert_size_stride(buf489, (128, 312, 14, 14), (61152, 196, 14, 1))
buf492 = buf467; del buf467 # reuse
buf490 = as_strided(buf492, (128, 312, 14, 14), (122304, 196, 14, 1)) # alias
triton_poi_fused_cat_116.run(buf487, buf490, 7827456, grid=grid(7827456), stream=stream0)
buf491 = as_strided(buf492, (128, 312, 14, 14), (122304, 196, 14, 1), 61152) # alias
triton_poi_fused_cat_116.run(buf489, buf491, 7827456, grid=grid(7827456), stream=stream0)
buf493 = buf451; del buf451 # reuse
buf494 = buf493; del buf493 # reuse
buf497 = empty_strided((624, ), (1, ), device='cuda', dtype=torch.float32)
buf495 = empty_strided((1, 624, 1, 1), (624, 1, 624, 624), device='cuda', dtype=torch.float32)
buf496 = empty_strided((624, ), (1, ), device='cuda', dtype=torch.float32)
buf498 = empty_strided((624, ), (1, ), device='cuda', dtype=torch.float32)
triton_red_fused__native_batch_norm_legit_functional_117.run(buf494, buf492, primals_388, primals_389, buf497, buf495, buf496, buf498, 624, 25088, grid=grid(624), stream=stream0)
del primals_388
del primals_389
buf499 = empty_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda', dtype=torch.float16)
buf1140 = empty_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__native_batch_norm_legit_functional_add_clone_fill_mul_sigmoid_sub_118.run(buf492, buf494, buf495, primals_55, primals_56, buf499, buf1140, 15654912, grid=grid(15654912), stream=stream0)
del primals_56
buf500 = empty_strided((156, 1, 3, 3), (9, 9, 3, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_119.run(primals_195, buf500, 1404, grid=grid(1404), stream=stream0)
del primals_195
buf501 = buf443; del buf443 # reuse
triton_poi_fused_split_with_sizes_120.run(buf499, buf501, 3913728, grid=grid(3913728), stream=stream0)
buf502 = extern_kernels.convolution(buf501, buf500, stride=(1, 1), padding=(1, 1), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=156, bias=None)
assert_size_stride(buf502, (128, 156, 14, 14), (30576, 196, 14, 1))
buf503 = empty_strided((156, 1, 5, 5), (25, 25, 5, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_121.run(primals_196, buf503, 3900, grid=grid(3900), stream=stream0)
del primals_196
buf504 = buf440; del buf440 # reuse
triton_poi_fused_split_with_sizes_122.run(buf499, buf504, 3913728, grid=grid(3913728), stream=stream0)
buf505 = extern_kernels.convolution(buf504, buf503, stride=(1, 1), padding=(2, 2), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=156, bias=None)
assert_size_stride(buf505, (128, 156, 14, 14), (30576, 196, 14, 1))
buf506 = empty_strided((156, 1, 7, 7), (49, 49, 7, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_123.run(primals_197, buf506, 7644, grid=grid(7644), stream=stream0)
del primals_197
buf507 = buf437; del buf437 # reuse
triton_poi_fused_split_with_sizes_124.run(buf499, buf507, 3913728, grid=grid(3913728), stream=stream0)
buf508 = extern_kernels.convolution(buf507, buf506, stride=(1, 1), padding=(3, 3), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=156, bias=None)
assert_size_stride(buf508, (128, 156, 14, 14), (30576, 196, 14, 1))
buf509 = empty_strided((156, 1, 9, 9), (81, 81, 9, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_125.run(primals_198, buf509, 12636, grid=grid(12636), stream=stream0)
del primals_198
buf510 = buf434; del buf434 # reuse
triton_poi_fused_split_with_sizes_126.run(buf499, buf510, 3913728, grid=grid(3913728), stream=stream0)
buf511 = extern_kernels.convolution(buf510, buf509, stride=(1, 1), padding=(4, 4), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=156, bias=None)
assert_size_stride(buf511, (128, 156, 14, 14), (30576, 196, 14, 1))
buf516 = buf499; del buf499 # reuse
buf512 = as_strided(buf516, (128, 156, 14, 14), (122304, 196, 14, 1)) # alias
triton_poi_fused_cat_127.run(buf502, buf512, 3913728, grid=grid(3913728), stream=stream0)
buf513 = as_strided(buf516, (128, 156, 14, 14), (122304, 196, 14, 1), 30576) # alias
triton_poi_fused_cat_127.run(buf505, buf513, 3913728, grid=grid(3913728), stream=stream0)
buf514 = as_strided(buf516, (128, 156, 14, 14), (122304, 196, 14, 1), 61152) # alias
triton_poi_fused_cat_127.run(buf508, buf514, 3913728, grid=grid(3913728), stream=stream0)
buf515 = as_strided(buf516, (128, 156, 14, 14), (122304, 196, 14, 1), 91728) # alias
triton_poi_fused_cat_127.run(buf511, buf515, 3913728, grid=grid(3913728), stream=stream0)
buf517 = buf495; del buf495 # reuse
buf518 = buf517; del buf517 # reuse
buf521 = empty_strided((624, ), (1, ), device='cuda', dtype=torch.float32)
buf519 = empty_strided((1, 624, 1, 1), (624, 1, 624, 624), device='cuda', dtype=torch.float32)
buf520 = empty_strided((624, ), (1, ), device='cuda', dtype=torch.float32)
buf522 = empty_strided((624, ), (1, ), device='cuda', dtype=torch.float32)
triton_red_fused__native_batch_norm_legit_functional_117.run(buf518, buf516, primals_391, primals_392, buf521, buf519, buf520, buf522, 624, 25088, grid=grid(624), stream=stream0)
del primals_391
del primals_392
buf523 = empty_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda', dtype=torch.float16)
buf525 = empty_strided((128, 624, 1, 1), (624, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_per_fused__native_batch_norm_legit_functional_mean_silu_128.run(buf516, buf518, buf519, primals_57, primals_58, buf523, buf525, 79872, 196, grid=grid(79872), stream=stream0)
del primals_58
buf526 = empty_strided((26, 624, 1, 1), (624, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_129.run(primals_199, buf526, 16224, grid=grid(16224), stream=stream0)
del primals_199
buf527 = buf459; del buf459 # reuse
triton_poi_fused__to_copy_convolution_130.run(primals_200, buf527, 26, grid=grid(26), stream=stream0)
del primals_200
buf528 = extern_kernels.convolution(buf525, buf526, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None)
assert_size_stride(buf528, (128, 26, 1, 1), (26, 1, 1, 1))
buf529 = buf528; del buf528 # reuse
buf530 = empty_strided((128, 26, 1, 1), (26, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_convolution_silu_131.run(buf529, buf527, buf530, 3328, grid=grid(3328), stream=stream0)
buf531 = empty_strided((624, 26, 1, 1), (26, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_132.run(primals_201, buf531, 16224, grid=grid(16224), stream=stream0)
del primals_201
buf532 = buf464; del buf464 # reuse
triton_poi_fused__to_copy_convolution_133.run(primals_202, buf532, 624, grid=grid(624), stream=stream0)
del primals_202
buf533 = extern_kernels.convolution(buf530, buf531, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None)
assert_size_stride(buf533, (128, 624, 1, 1), (624, 1, 1, 1))
buf534 = buf533; del buf533 # reuse
triton_poi_fused__to_copy_convolution_134.run(buf534, buf532, 79872, grid=grid(79872), stream=stream0)
buf535 = empty_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda', dtype=torch.float16)
triton_poi_fused_mul_sigmoid_silu_135.run(buf523, buf534, buf535, 15654912, grid=grid(15654912), stream=stream0)
buf536 = buf489; del buf489 # reuse
triton_poi_fused_split_with_sizes_136.run(buf535, buf536, 7827456, grid=grid(7827456), stream=stream0)
buf537 = buf487; del buf487 # reuse
triton_poi_fused_split_with_sizes_137.run(buf535, buf537, 7827456, grid=grid(7827456), stream=stream0)
buf538 = empty_strided((52, 312, 1, 1), (312, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_138.run(primals_203, buf538, 16224, grid=grid(16224), stream=stream0)
del primals_203
buf539 = extern_kernels.convolution(buf536, buf538, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None)
assert_size_stride(buf539, (128, 52, 14, 14), (10192, 196, 14, 1))
buf540 = empty_strided((52, 312, 1, 1), (312, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_138.run(primals_204, buf540, 16224, grid=grid(16224), stream=stream0)
del primals_204
buf541 = extern_kernels.convolution(buf537, buf540, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None)
assert_size_stride(buf541, (128, 52, 14, 14), (10192, 196, 14, 1))
buf544 = empty_strided((128, 104, 14, 14), (20384, 196, 14, 1), device='cuda', dtype=torch.float16)
buf542 = as_strided(buf544, (128, 52, 14, 14), (20384, 196, 14, 1)) # alias
triton_poi_fused_cat_139.run(buf539, buf542, 1304576, grid=grid(1304576), stream=stream0)
del buf539
buf543 = as_strided(buf544, (128, 52, 14, 14), (20384, 196, 14, 1), 10192) # alias
triton_poi_fused_cat_139.run(buf541, buf543, 1304576, grid=grid(1304576), stream=stream0)
del buf541
buf545 = buf480; del buf480 # reuse
triton_red_fused__native_batch_norm_legit_functional_110.run(buf544, buf545, 416, 6272, grid=grid(416), stream=stream0)
buf546 = buf481; del buf481 # reuse
buf547 = buf546; del buf546 # reuse
buf551 = empty_strided((104, ), (1, ), device='cuda', dtype=torch.float32)
triton_per_fused__native_batch_norm_legit_functional_111.run(buf547, buf545, primals_394, buf551, 104, 4, grid=grid(104), stream=stream0)
del primals_394
buf548 = buf545; del buf545 # reuse
triton_red_fused__native_batch_norm_legit_functional_112.run(buf544, buf547, buf548, 416, 6272, grid=grid(416), stream=stream0)
buf549 = empty_strided((1, 104, 1, 1), (104, 1, 104, 104), device='cuda', dtype=torch.float32)
buf550 = empty_strided((104, ), (1, ), device='cuda', dtype=torch.float32)
buf552 = empty_strided((104, ), (1, ), device='cuda', dtype=torch.float32)
triton_per_fused__native_batch_norm_legit_functional_113.run(buf548, primals_395, buf549, buf550, buf552, 104, 4, grid=grid(104), stream=stream0)
del primals_395
buf553 = empty_strided((128, 104, 14, 14), (20384, 196, 14, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__native_batch_norm_legit_functional_add_140.run(buf544, buf547, buf549, primals_59, primals_60, buf485, buf553, 2609152, grid=grid(2609152), stream=stream0)
del primals_60
buf554 = empty_strided((312, 52, 1, 1), (52, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_115.run(primals_205, buf554, 16224, grid=grid(16224), stream=stream0)
del primals_205
buf555 = extern_kernels.convolution(as_strided(buf553, (128, 52, 14, 14), (20384, 196, 14, 1)), buf554, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None)
assert_size_stride(buf555, (128, 312, 14, 14), (61152, 196, 14, 1))
buf556 = empty_strided((312, 52, 1, 1), (52, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_115.run(primals_206, buf556, 16224, grid=grid(16224), stream=stream0)
del primals_206
buf557 = extern_kernels.convolution(as_strided(buf553, (128, 52, 14, 14), (20384, 196, 14, 1), 10192), buf556, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None)
assert_size_stride(buf557, (128, 312, 14, 14), (61152, 196, 14, 1))
buf560 = buf535; del buf535 # reuse
buf558 = as_strided(buf560, (128, 312, 14, 14), (122304, 196, 14, 1)) # alias
triton_poi_fused_cat_116.run(buf555, buf558, 7827456, grid=grid(7827456), stream=stream0)
buf559 = as_strided(buf560, (128, 312, 14, 14), (122304, 196, 14, 1), 61152) # alias
triton_poi_fused_cat_116.run(buf557, buf559, 7827456, grid=grid(7827456), stream=stream0)
buf561 = buf519; del buf519 # reuse
buf562 = buf561; del buf561 # reuse
buf565 = empty_strided((624, ), (1, ), device='cuda', dtype=torch.float32)
buf563 = empty_strided((1, 624, 1, 1), (624, 1, 624, 624), device='cuda', dtype=torch.float32)
buf564 = empty_strided((624, ), (1, ), device='cuda', dtype=torch.float32)
buf566 = empty_strided((624, ), (1, ), device='cuda', dtype=torch.float32)
triton_red_fused__native_batch_norm_legit_functional_117.run(buf562, buf560, primals_397, primals_398, buf565, buf563, buf564, buf566, 624, 25088, grid=grid(624), stream=stream0)
del primals_397
del primals_398
buf567 = empty_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda', dtype=torch.float16)
buf1138 = empty_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__native_batch_norm_legit_functional_add_clone_fill_mul_sigmoid_sub_118.run(buf560, buf562, buf563, primals_61, primals_62, buf567, buf1138, 15654912, grid=grid(15654912), stream=stream0)
del primals_62
buf568 = empty_strided((156, 1, 3, 3), (9, 9, 3, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_119.run(primals_207, buf568, 1404, grid=grid(1404), stream=stream0)
del primals_207
buf569 = buf511; del buf511 # reuse
triton_poi_fused_split_with_sizes_120.run(buf567, buf569, 3913728, grid=grid(3913728), stream=stream0)
buf570 = extern_kernels.convolution(buf569, buf568, stride=(1, 1), padding=(1, 1), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=156, bias=None)
assert_size_stride(buf570, (128, 156, 14, 14), (30576, 196, 14, 1))
buf571 = empty_strided((156, 1, 5, 5), (25, 25, 5, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_121.run(primals_208, buf571, 3900, grid=grid(3900), stream=stream0)
del primals_208
buf572 = buf508; del buf508 # reuse
triton_poi_fused_split_with_sizes_122.run(buf567, buf572, 3913728, grid=grid(3913728), stream=stream0)
buf573 = extern_kernels.convolution(buf572, buf571, stride=(1, 1), padding=(2, 2), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=156, bias=None)
assert_size_stride(buf573, (128, 156, 14, 14), (30576, 196, 14, 1))
buf574 = empty_strided((156, 1, 7, 7), (49, 49, 7, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_123.run(primals_209, buf574, 7644, grid=grid(7644), stream=stream0)
del primals_209
buf575 = buf505; del buf505 # reuse
triton_poi_fused_split_with_sizes_124.run(buf567, buf575, 3913728, grid=grid(3913728), stream=stream0)
buf576 = extern_kernels.convolution(buf575, buf574, stride=(1, 1), padding=(3, 3), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=156, bias=None)
assert_size_stride(buf576, (128, 156, 14, 14), (30576, 196, 14, 1))
buf577 = empty_strided((156, 1, 9, 9), (81, 81, 9, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_125.run(primals_210, buf577, 12636, grid=grid(12636), stream=stream0)
del primals_210
buf578 = buf502; del buf502 # reuse
triton_poi_fused_split_with_sizes_126.run(buf567, buf578, 3913728, grid=grid(3913728), stream=stream0)
buf579 = extern_kernels.convolution(buf578, buf577, stride=(1, 1), padding=(4, 4), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=156, bias=None)
assert_size_stride(buf579, (128, 156, 14, 14), (30576, 196, 14, 1))
buf584 = buf567; del buf567 # reuse
buf580 = as_strided(buf584, (128, 156, 14, 14), (122304, 196, 14, 1)) # alias
triton_poi_fused_cat_127.run(buf570, buf580, 3913728, grid=grid(3913728), stream=stream0)
del buf570
buf581 = as_strided(buf584, (128, 156, 14, 14), (122304, 196, 14, 1), 30576) # alias
triton_poi_fused_cat_127.run(buf573, buf581, 3913728, grid=grid(3913728), stream=stream0)
del buf573
buf582 = as_strided(buf584, (128, 156, 14, 14), (122304, 196, 14, 1), 61152) # alias
triton_poi_fused_cat_127.run(buf576, buf582, 3913728, grid=grid(3913728), stream=stream0)
del buf576
buf583 = as_strided(buf584, (128, 156, 14, 14), (122304, 196, 14, 1), 91728) # alias
triton_poi_fused_cat_127.run(buf579, buf583, 3913728, grid=grid(3913728), stream=stream0)
del buf579
buf585 = buf563; del buf563 # reuse
buf586 = buf585; del buf585 # reuse
buf589 = empty_strided((624, ), (1, ), device='cuda', dtype=torch.float32)
buf587 = empty_strided((1, 624, 1, 1), (624, 1, 624, 624), device='cuda', dtype=torch.float32)
buf588 = empty_strided((624, ), (1, ), device='cuda', dtype=torch.float32)
buf590 = empty_strided((624, ), (1, ), device='cuda', dtype=torch.float32)
triton_red_fused__native_batch_norm_legit_functional_117.run(buf586, buf584, primals_400, primals_401, buf589, buf587, buf588, buf590, 624, 25088, grid=grid(624), stream=stream0)
del primals_400
del primals_401
buf591 = empty_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda', dtype=torch.float16)
buf593 = empty_strided((128, 624, 1, 1), (624, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_per_fused__native_batch_norm_legit_functional_mean_silu_128.run(buf584, buf586, buf587, primals_63, primals_64, buf591, buf593, 79872, 196, grid=grid(79872), stream=stream0)
del primals_64
buf594 = empty_strided((26, 624, 1, 1), (624, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_129.run(primals_211, buf594, 16224, grid=grid(16224), stream=stream0)
del primals_211
buf595 = buf527; del buf527 # reuse
triton_poi_fused__to_copy_convolution_130.run(primals_212, buf595, 26, grid=grid(26), stream=stream0)
del primals_212
buf596 = extern_kernels.convolution(buf593, buf594, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None)
assert_size_stride(buf596, (128, 26, 1, 1), (26, 1, 1, 1))
buf597 = buf596; del buf596 # reuse
buf598 = empty_strided((128, 26, 1, 1), (26, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_convolution_silu_131.run(buf597, buf595, buf598, 3328, grid=grid(3328), stream=stream0)
del buf595
buf599 = empty_strided((624, 26, 1, 1), (26, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_132.run(primals_213, buf599, 16224, grid=grid(16224), stream=stream0)
del primals_213
buf600 = buf532; del buf532 # reuse
triton_poi_fused__to_copy_convolution_133.run(primals_214, buf600, 624, grid=grid(624), stream=stream0)
del primals_214
buf601 = extern_kernels.convolution(buf598, buf599, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None)
assert_size_stride(buf601, (128, 624, 1, 1), (624, 1, 1, 1))
buf602 = buf601; del buf601 # reuse
triton_poi_fused__to_copy_convolution_134.run(buf602, buf600, 79872, grid=grid(79872), stream=stream0)
buf603 = empty_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda', dtype=torch.float16)
triton_poi_fused_mul_sigmoid_silu_135.run(buf591, buf602, buf603, 15654912, grid=grid(15654912), stream=stream0)
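# buf591..buf603 form a squeeze-and-excitation block: the fused ..._mean_silu_128
# kernel emits both the SiLU activation buf591 and its per-channel global
# average buf593 (squeeze), two 1x1 convolutions reduce 624 -> 26 and expand
# 26 -> 624 with a SiLU in between (excite), and ..._mul_sigmoid_silu_135 gates
# buf591 with sigmoid(buf602). A hedged eager-mode sketch (conv_reduce and
# conv_expand are hypothetical names for the buf594/buf599 weights):
#   s = x.mean(dim=(2, 3), keepdim=True)           # squeeze
#   s = F.silu(conv_reduce(s))                     # 624 -> 26
#   x = x * torch.sigmoid(conv_expand(s))          # 26 -> 624, excite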
buf604 = buf557; del buf557 # reuse
triton_poi_fused_split_with_sizes_136.run(buf603, buf604, 7827456, grid=grid(7827456), stream=stream0)
buf605 = buf555; del buf555 # reuse
triton_poi_fused_split_with_sizes_137.run(buf603, buf605, 7827456, grid=grid(7827456), stream=stream0)
buf606 = empty_strided((52, 312, 1, 1), (312, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_138.run(primals_215, buf606, 16224, grid=grid(16224), stream=stream0)
del primals_215
buf607 = extern_kernels.convolution(buf604, buf606, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None)
assert_size_stride(buf607, (128, 52, 14, 14), (10192, 196, 14, 1))
buf608 = empty_strided((52, 312, 1, 1), (312, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_138.run(primals_216, buf608, 16224, grid=grid(16224), stream=stream0)
del primals_216
buf609 = extern_kernels.convolution(buf605, buf608, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None)
assert_size_stride(buf609, (128, 52, 14, 14), (10192, 196, 14, 1))
buf612 = empty_strided((128, 104, 14, 14), (20384, 196, 14, 1), device='cuda', dtype=torch.float16)
buf610 = as_strided(buf612, (128, 52, 14, 14), (20384, 196, 14, 1)) # alias
triton_poi_fused_cat_139.run(buf607, buf610, 1304576, grid=grid(1304576), stream=stream0)
del buf607
buf611 = as_strided(buf612, (128, 52, 14, 14), (20384, 196, 14, 1), 10192) # alias
triton_poi_fused_cat_139.run(buf609, buf611, 1304576, grid=grid(1304576), stream=stream0)
del buf609
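# Pointwise projection back down: buf603 (624 channels) is split into two
# 312-channel halves, each reduced by its own 52-channel 1x1 conv
# (buf606/buf608), and the results are concatenated into the 104-channel
# buf612 -- effectively a 2-group pointwise convolution 624 -> 104.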
buf613 = buf548; del buf548 # reuse
triton_red_fused__native_batch_norm_legit_functional_110.run(buf612, buf613, 416, 6272, grid=grid(416), stream=stream0)
buf614 = buf549; del buf549 # reuse
buf615 = buf614; del buf614 # reuse
buf619 = empty_strided((104, ), (1, ), device='cuda', dtype=torch.float32)
triton_per_fused__native_batch_norm_legit_functional_111.run(buf615, buf613, primals_403, buf619, 104, 4, grid=grid(104), stream=stream0)
del primals_403
buf616 = buf613; del buf613 # reuse
triton_red_fused__native_batch_norm_legit_functional_112.run(buf612, buf615, buf616, 416, 6272, grid=grid(416), stream=stream0)
buf617 = empty_strided((1, 104, 1, 1), (104, 1, 104, 104), device='cuda', dtype=torch.float32)
buf618 = empty_strided((104, ), (1, ), device='cuda', dtype=torch.float32)
buf620 = empty_strided((104, ), (1, ), device='cuda', dtype=torch.float32)
triton_per_fused__native_batch_norm_legit_functional_113.run(buf616, primals_404, buf617, buf618, buf620, 104, 4, grid=grid(104), stream=stream0)
del buf616
del primals_404
buf621 = empty_strided((128, 104, 14, 14), (20384, 196, 14, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__native_batch_norm_legit_functional_add_140.run(buf612, buf615, buf617, primals_65, primals_66, buf553, buf621, 2609152, grid=grid(2609152), stream=stream0)
del buf617
del primals_66
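# The fused ..._add_140 kernel batch-normalizes buf612 and adds buf553, the
# saved output of the previous block with the same (128, 104, 14, 14) shape --
# the residual skip connection of this inverted-residual block.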
buf622 = empty_strided((624, 104, 1, 1), (104, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_141.run(primals_217, buf622, 64896, grid=grid(64896), stream=stream0)
del primals_217
buf623 = extern_kernels.convolution(buf621, buf622, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None)
assert_size_stride(buf623, (128, 624, 14, 14), (122304, 196, 14, 1))
buf624 = buf587; del buf587 # reuse
buf625 = buf624; del buf624 # reuse
buf628 = empty_strided((624, ), (1, ), device='cuda', dtype=torch.float32)
buf626 = empty_strided((1, 624, 1, 1), (624, 1, 624, 624), device='cuda', dtype=torch.float32)
buf627 = empty_strided((624, ), (1, ), device='cuda', dtype=torch.float32)
buf629 = empty_strided((624, ), (1, ), device='cuda', dtype=torch.float32)
triton_red_fused__native_batch_norm_legit_functional_117.run(buf625, buf623, primals_406, primals_407, buf628, buf626, buf627, buf629, 624, 25088, grid=grid(624), stream=stream0)
del primals_406
del primals_407
buf631 = buf603; del buf603 # reuse
buf1136 = empty_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__native_batch_norm_legit_functional_add_clone_fill_mul_sigmoid_silu_sub_142.run(buf623, buf625, buf626, primals_67, primals_68, buf631, buf1136, 15654912, grid=grid(15654912), stream=stream0)
del primals_68
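# Note the second output buf1136: the kernels whose names contain
# clone_fill_..._sub emit an extra fp16 tensor (buf1126..buf1136 in this
# region) that the forward code below never reads; these appear to be
# activations stashed for the backward graph.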
buf632 = empty_strided((624, 1, 3, 3), (9, 9, 3, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_143.run(primals_218, buf632, 5616, grid=grid(5616), stream=stream0)
del primals_218
buf633 = extern_kernels.convolution(buf631, buf632, stride=(1, 1), padding=(1, 1), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=624, bias=None)
assert_size_stride(buf633, (128, 624, 14, 14), (122304, 196, 14, 1))
buf634 = buf626; del buf626 # reuse
buf635 = buf634; del buf634 # reuse
buf638 = empty_strided((624, ), (1, ), device='cuda', dtype=torch.float32)
buf636 = empty_strided((1, 624, 1, 1), (624, 1, 624, 624), device='cuda', dtype=torch.float32)
buf637 = empty_strided((624, ), (1, ), device='cuda', dtype=torch.float32)
buf639 = empty_strided((624, ), (1, ), device='cuda', dtype=torch.float32)
triton_red_fused__native_batch_norm_legit_functional_117.run(buf635, buf633, primals_409, primals_410, buf638, buf636, buf637, buf639, 624, 25088, grid=grid(624), stream=stream0)
del primals_409
del primals_410
buf640 = empty_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda', dtype=torch.float16)
buf642 = empty_strided((128, 624, 1, 1), (624, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_per_fused__native_batch_norm_legit_functional_mean_silu_128.run(buf633, buf635, buf636, primals_69, primals_70, buf640, buf642, 79872, 196, grid=grid(79872), stream=stream0)
del buf636
del primals_70
buf643 = empty_strided((52, 624, 1, 1), (624, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_144.run(primals_219, buf643, 32448, grid=grid(32448), stream=stream0)
del primals_219
buf644 = empty_strided((52, ), (1, ), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_convolution_145.run(primals_220, buf644, 52, grid=grid(52), stream=stream0)
del primals_220
buf645 = extern_kernels.convolution(buf642, buf643, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None)
assert_size_stride(buf645, (128, 52, 1, 1), (52, 1, 1, 1))
buf646 = buf645; del buf645 # reuse
buf647 = empty_strided((128, 52, 1, 1), (52, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_convolution_silu_146.run(buf646, buf644, buf647, 6656, grid=grid(6656), stream=stream0)
del buf644
buf648 = empty_strided((624, 52, 1, 1), (52, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_147.run(primals_221, buf648, 32448, grid=grid(32448), stream=stream0)
del primals_221
buf649 = buf600; del buf600 # reuse
triton_poi_fused__to_copy_convolution_133.run(primals_222, buf649, 624, grid=grid(624), stream=stream0)
del primals_222
buf650 = extern_kernels.convolution(buf647, buf648, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None)
assert_size_stride(buf650, (128, 624, 1, 1), (624, 1, 1, 1))
buf651 = buf650; del buf650 # reuse
triton_poi_fused__to_copy_convolution_134.run(buf651, buf649, 79872, grid=grid(79872), stream=stream0)
del buf649
buf652 = empty_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda', dtype=torch.float16)
triton_poi_fused_mul_sigmoid_silu_135.run(buf640, buf651, buf652, 15654912, grid=grid(15654912), stream=stream0)
buf653 = empty_strided((160, 624, 1, 1), (624, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_148.run(primals_223, buf653, 99840, grid=grid(99840), stream=stream0)
del primals_223
buf654 = extern_kernels.convolution(buf652, buf653, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None)
assert_size_stride(buf654, (128, 160, 14, 14), (31360, 196, 14, 1))
buf655 = empty_strided((1, 160, 1, 1, 4), (640, 1, 640, 640, 160), device='cuda', dtype=torch.float32)
triton_red_fused__native_batch_norm_legit_functional_149.run(buf654, buf655, 640, 6272, grid=grid(640), stream=stream0)
buf656 = empty_strided((1, 160, 1, 1), (160, 1, 160, 160), device='cuda', dtype=torch.float32)
buf657 = buf656; del buf656 # reuse
buf661 = empty_strided((160, ), (1, ), device='cuda', dtype=torch.float32)
triton_per_fused__native_batch_norm_legit_functional_150.run(buf657, buf655, primals_412, buf661, 160, 4, grid=grid(160), stream=stream0)
del primals_412
buf658 = buf655; del buf655 # reuse
triton_red_fused__native_batch_norm_legit_functional_151.run(buf654, buf657, buf658, 640, 6272, grid=grid(640), stream=stream0)
buf659 = empty_strided((1, 160, 1, 1), (160, 1, 160, 160), device='cuda', dtype=torch.float32)
buf660 = empty_strided((160, ), (1, ), device='cuda', dtype=torch.float32)
buf662 = empty_strided((160, ), (1, ), device='cuda', dtype=torch.float32)
triton_per_fused__native_batch_norm_legit_functional_152.run(buf658, primals_413, buf659, buf660, buf662, 160, 4, grid=grid(160), stream=stream0)
del primals_413
buf663 = empty_strided((128, 160, 14, 14), (31360, 196, 14, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__native_batch_norm_legit_functional_153.run(buf654, buf657, buf659, primals_71, primals_72, buf663, 4014080, grid=grid(4014080), stream=stream0)
del primals_72
buf664 = empty_strided((240, 80, 1, 1), (80, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_154.run(primals_224, buf664, 19200, grid=grid(19200), stream=stream0)
del primals_224
buf665 = extern_kernels.convolution(as_strided(buf663, (128, 80, 14, 14), (31360, 196, 14, 1)), buf664, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None)
assert_size_stride(buf665, (128, 240, 14, 14), (47040, 196, 14, 1))
buf666 = empty_strided((240, 80, 1, 1), (80, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_154.run(primals_225, buf666, 19200, grid=grid(19200), stream=stream0)
del primals_225
buf667 = extern_kernels.convolution(as_strided(buf663, (128, 80, 14, 14), (31360, 196, 14, 1), 15680), buf666, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None)
assert_size_stride(buf667, (128, 240, 14, 14), (47040, 196, 14, 1))
buf670 = empty_strided((128, 480, 14, 14), (94080, 196, 14, 1), device='cuda', dtype=torch.float16)
buf668 = as_strided(buf670, (128, 240, 14, 14), (94080, 196, 14, 1)) # alias
triton_poi_fused_cat_155.run(buf665, buf668, 6021120, grid=grid(6021120), stream=stream0)
buf669 = as_strided(buf670, (128, 240, 14, 14), (94080, 196, 14, 1), 47040) # alias
triton_poi_fused_cat_155.run(buf667, buf669, 6021120, grid=grid(6021120), stream=stream0)
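# Mirror of the grouped reduction above: buf663 (160 channels) is viewed as two
# 80-channel halves via as_strided (offsets 0 and 15680 = 80 * 196), each
# expanded by its own 240-channel 1x1 conv, and concatenated into the
# 480-channel buf670 -- a 2-group pointwise expansion 160 -> 480.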
buf671 = as_strided(buf108, (1, 480, 1, 1), (480, 1, 480, 480)); del buf108 # reuse
buf672 = buf671; del buf671 # reuse
buf675 = empty_strided((480, ), (1, ), device='cuda', dtype=torch.float32)
buf673 = empty_strided((1, 480, 1, 1), (480, 1, 480, 480), device='cuda', dtype=torch.float32)
buf674 = empty_strided((480, ), (1, ), device='cuda', dtype=torch.float32)
buf676 = empty_strided((480, ), (1, ), device='cuda', dtype=torch.float32)
triton_red_fused__native_batch_norm_legit_functional_156.run(buf672, buf670, primals_415, primals_416, buf675, buf673, buf674, buf676, 480, 25088, grid=grid(480), stream=stream0)
del primals_415
del primals_416
buf677 = empty_strided((128, 480, 14, 14), (94080, 196, 14, 1), device='cuda', dtype=torch.float16)
buf1134 = empty_strided((128, 480, 14, 14), (94080, 196, 14, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__native_batch_norm_legit_functional_add_clone_fill_mul_sigmoid_sub_157.run(buf670, buf672, buf673, primals_73, primals_74, buf677, buf1134, 12042240, grid=grid(12042240), stream=stream0)
del primals_74
buf678 = empty_strided((120, 1, 3, 3), (9, 9, 3, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_42.run(primals_226, buf678, 1080, grid=grid(1080), stream=stream0)
del primals_226
buf679 = empty_strided((128, 120, 14, 14), (23520, 196, 14, 1), device='cuda', dtype=torch.float16)
triton_poi_fused_split_with_sizes_158.run(buf677, buf679, 3010560, grid=grid(3010560), stream=stream0)
buf680 = extern_kernels.convolution(buf679, buf678, stride=(1, 1), padding=(1, 1), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=120, bias=None)
assert_size_stride(buf680, (128, 120, 14, 14), (23520, 196, 14, 1))
buf681 = empty_strided((120, 1, 5, 5), (25, 25, 5, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_159.run(primals_227, buf681, 3000, grid=grid(3000), stream=stream0)
del primals_227
buf682 = empty_strided((128, 120, 14, 14), (23520, 196, 14, 1), device='cuda', dtype=torch.float16)
triton_poi_fused_split_with_sizes_160.run(buf677, buf682, 3010560, grid=grid(3010560), stream=stream0)
buf683 = extern_kernels.convolution(buf682, buf681, stride=(1, 1), padding=(2, 2), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=120, bias=None)
assert_size_stride(buf683, (128, 120, 14, 14), (23520, 196, 14, 1))
buf684 = empty_strided((120, 1, 7, 7), (49, 49, 7, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_161.run(primals_228, buf684, 5880, grid=grid(5880), stream=stream0)
del primals_228
buf685 = empty_strided((128, 120, 14, 14), (23520, 196, 14, 1), device='cuda', dtype=torch.float16)
triton_poi_fused_split_with_sizes_162.run(buf677, buf685, 3010560, grid=grid(3010560), stream=stream0)
buf686 = extern_kernels.convolution(buf685, buf684, stride=(1, 1), padding=(3, 3), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=120, bias=None)
assert_size_stride(buf686, (128, 120, 14, 14), (23520, 196, 14, 1))
buf687 = empty_strided((120, 1, 9, 9), (81, 81, 9, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_163.run(primals_229, buf687, 9720, grid=grid(9720), stream=stream0)
del primals_229
buf688 = empty_strided((128, 120, 14, 14), (23520, 196, 14, 1), device='cuda', dtype=torch.float16)
triton_poi_fused_split_with_sizes_164.run(buf677, buf688, 3010560, grid=grid(3010560), stream=stream0)
buf689 = extern_kernels.convolution(buf688, buf687, stride=(1, 1), padding=(4, 4), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=120, bias=None)
assert_size_stride(buf689, (128, 120, 14, 14), (23520, 196, 14, 1))
buf694 = buf677; del buf677 # reuse
buf690 = as_strided(buf694, (128, 120, 14, 14), (94080, 196, 14, 1)) # alias
triton_poi_fused_cat_165.run(buf680, buf690, 3010560, grid=grid(3010560), stream=stream0)
buf691 = as_strided(buf694, (128, 120, 14, 14), (94080, 196, 14, 1), 23520) # alias
triton_poi_fused_cat_165.run(buf683, buf691, 3010560, grid=grid(3010560), stream=stream0)
buf692 = as_strided(buf694, (128, 120, 14, 14), (94080, 196, 14, 1), 47040) # alias
triton_poi_fused_cat_165.run(buf686, buf692, 3010560, grid=grid(3010560), stream=stream0)
buf693 = as_strided(buf694, (128, 120, 14, 14), (94080, 196, 14, 1), 70560) # alias
triton_poi_fused_cat_165.run(buf689, buf693, 3010560, grid=grid(3010560), stream=stream0)
buf695 = buf673; del buf673 # reuse
buf696 = buf695; del buf695 # reuse
buf699 = empty_strided((480, ), (1, ), device='cuda', dtype=torch.float32)
buf697 = empty_strided((1, 480, 1, 1), (480, 1, 480, 480), device='cuda', dtype=torch.float32)
buf698 = empty_strided((480, ), (1, ), device='cuda', dtype=torch.float32)
buf700 = empty_strided((480, ), (1, ), device='cuda', dtype=torch.float32)
triton_red_fused__native_batch_norm_legit_functional_156.run(buf696, buf694, primals_418, primals_419, buf699, buf697, buf698, buf700, 480, 25088, grid=grid(480), stream=stream0)
del primals_418
del primals_419
buf701 = empty_strided((128, 480, 14, 14), (94080, 196, 14, 1), device='cuda', dtype=torch.float16)
buf703 = empty_strided((128, 480, 1, 1), (480, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_per_fused__native_batch_norm_legit_functional_mean_silu_166.run(buf694, buf696, buf697, primals_75, primals_76, buf701, buf703, 61440, 196, grid=grid(61440), stream=stream0)
del primals_76
buf704 = empty_strided((80, 480, 1, 1), (480, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_167.run(primals_230, buf704, 38400, grid=grid(38400), stream=stream0)
del primals_230
buf705 = empty_strided((80, ), (1, ), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_convolution_168.run(primals_231, buf705, 80, grid=grid(80), stream=stream0)
del primals_231
buf706 = extern_kernels.convolution(buf703, buf704, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None)
assert_size_stride(buf706, (128, 80, 1, 1), (80, 1, 1, 1))
buf707 = buf706; del buf706 # reuse
buf708 = empty_strided((128, 80, 1, 1), (80, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_convolution_silu_169.run(buf707, buf705, buf708, 10240, grid=grid(10240), stream=stream0)
buf709 = empty_strided((480, 80, 1, 1), (80, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_170.run(primals_232, buf709, 38400, grid=grid(38400), stream=stream0)
del primals_232
buf710 = empty_strided((480, ), (1, ), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_convolution_171.run(primals_233, buf710, 480, grid=grid(480), stream=stream0)
del primals_233
buf711 = extern_kernels.convolution(buf708, buf709, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None)
assert_size_stride(buf711, (128, 480, 1, 1), (480, 1, 1, 1))
buf712 = buf711; del buf711 # reuse
triton_poi_fused__to_copy_convolution_172.run(buf712, buf710, 61440, grid=grid(61440), stream=stream0)
buf713 = empty_strided((128, 480, 14, 14), (94080, 196, 14, 1), device='cuda', dtype=torch.float16)
triton_poi_fused_mul_sigmoid_silu_173.run(buf701, buf712, buf713, 12042240, grid=grid(12042240), stream=stream0)
buf714 = buf667; del buf667 # reuse
triton_poi_fused_split_with_sizes_174.run(buf713, buf714, 6021120, grid=grid(6021120), stream=stream0)
buf715 = buf665; del buf665 # reuse
triton_poi_fused_split_with_sizes_175.run(buf713, buf715, 6021120, grid=grid(6021120), stream=stream0)
buf716 = empty_strided((80, 240, 1, 1), (240, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_176.run(primals_234, buf716, 19200, grid=grid(19200), stream=stream0)
del primals_234
buf717 = extern_kernels.convolution(buf714, buf716, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None)
assert_size_stride(buf717, (128, 80, 14, 14), (15680, 196, 14, 1))
buf718 = empty_strided((80, 240, 1, 1), (240, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_176.run(primals_235, buf718, 19200, grid=grid(19200), stream=stream0)
del primals_235
buf719 = extern_kernels.convolution(buf715, buf718, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None)
assert_size_stride(buf719, (128, 80, 14, 14), (15680, 196, 14, 1))
buf722 = empty_strided((128, 160, 14, 14), (31360, 196, 14, 1), device='cuda', dtype=torch.float16)
buf720 = as_strided(buf722, (128, 80, 14, 14), (31360, 196, 14, 1)) # alias
triton_poi_fused_cat_177.run(buf717, buf720, 2007040, grid=grid(2007040), stream=stream0)
del buf717
buf721 = as_strided(buf722, (128, 80, 14, 14), (31360, 196, 14, 1), 15680) # alias
triton_poi_fused_cat_177.run(buf719, buf721, 2007040, grid=grid(2007040), stream=stream0)
del buf719
buf723 = buf658; del buf658 # reuse
triton_red_fused__native_batch_norm_legit_functional_149.run(buf722, buf723, 640, 6272, grid=grid(640), stream=stream0)
buf724 = buf659; del buf659 # reuse
buf725 = buf724; del buf724 # reuse
buf729 = empty_strided((160, ), (1, ), device='cuda', dtype=torch.float32)
triton_per_fused__native_batch_norm_legit_functional_150.run(buf725, buf723, primals_421, buf729, 160, 4, grid=grid(160), stream=stream0)
del primals_421
buf726 = buf723; del buf723 # reuse
triton_red_fused__native_batch_norm_legit_functional_151.run(buf722, buf725, buf726, 640, 6272, grid=grid(640), stream=stream0)
buf727 = empty_strided((1, 160, 1, 1), (160, 1, 160, 160), device='cuda', dtype=torch.float32)
buf728 = empty_strided((160, ), (1, ), device='cuda', dtype=torch.float32)
buf730 = empty_strided((160, ), (1, ), device='cuda', dtype=torch.float32)
triton_per_fused__native_batch_norm_legit_functional_152.run(buf726, primals_422, buf727, buf728, buf730, 160, 4, grid=grid(160), stream=stream0)
del primals_422
buf731 = empty_strided((128, 160, 14, 14), (31360, 196, 14, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__native_batch_norm_legit_functional_add_178.run(buf722, buf725, buf727, primals_77, primals_78, buf663, buf731, 4014080, grid=grid(4014080), stream=stream0)
del primals_78
buf732 = empty_strided((240, 80, 1, 1), (80, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_154.run(primals_236, buf732, 19200, grid=grid(19200), stream=stream0)
del primals_236
buf733 = extern_kernels.convolution(as_strided(buf731, (128, 80, 14, 14), (31360, 196, 14, 1)), buf732, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None)
assert_size_stride(buf733, (128, 240, 14, 14), (47040, 196, 14, 1))
buf734 = empty_strided((240, 80, 1, 1), (80, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_154.run(primals_237, buf734, 19200, grid=grid(19200), stream=stream0)
del primals_237
buf735 = extern_kernels.convolution(as_strided(buf731, (128, 80, 14, 14), (31360, 196, 14, 1), 15680), buf734, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None)
assert_size_stride(buf735, (128, 240, 14, 14), (47040, 196, 14, 1))
buf738 = buf713; del buf713 # reuse
buf736 = as_strided(buf738, (128, 240, 14, 14), (94080, 196, 14, 1)) # alias
triton_poi_fused_cat_155.run(buf733, buf736, 6021120, grid=grid(6021120), stream=stream0)
buf737 = as_strided(buf738, (128, 240, 14, 14), (94080, 196, 14, 1), 47040) # alias
triton_poi_fused_cat_155.run(buf735, buf737, 6021120, grid=grid(6021120), stream=stream0)
buf739 = buf697; del buf697 # reuse
buf740 = buf739; del buf739 # reuse
buf743 = empty_strided((480, ), (1, ), device='cuda', dtype=torch.float32)
buf741 = empty_strided((1, 480, 1, 1), (480, 1, 480, 480), device='cuda', dtype=torch.float32)
buf742 = empty_strided((480, ), (1, ), device='cuda', dtype=torch.float32)
buf744 = empty_strided((480, ), (1, ), device='cuda', dtype=torch.float32)
triton_red_fused__native_batch_norm_legit_functional_156.run(buf740, buf738, primals_424, primals_425, buf743, buf741, buf742, buf744, 480, 25088, grid=grid(480), stream=stream0)
del primals_424
del primals_425
buf745 = empty_strided((128, 480, 14, 14), (94080, 196, 14, 1), device='cuda', dtype=torch.float16)
buf1132 = empty_strided((128, 480, 14, 14), (94080, 196, 14, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__native_batch_norm_legit_functional_add_clone_fill_mul_sigmoid_sub_157.run(buf738, buf740, buf741, primals_79, primals_80, buf745, buf1132, 12042240, grid=grid(12042240), stream=stream0)
del primals_80
buf746 = empty_strided((120, 1, 3, 3), (9, 9, 3, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_42.run(primals_238, buf746, 1080, grid=grid(1080), stream=stream0)
del primals_238
buf747 = buf689; del buf689 # reuse
triton_poi_fused_split_with_sizes_158.run(buf745, buf747, 3010560, grid=grid(3010560), stream=stream0)
buf748 = extern_kernels.convolution(buf747, buf746, stride=(1, 1), padding=(1, 1), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=120, bias=None)
assert_size_stride(buf748, (128, 120, 14, 14), (23520, 196, 14, 1))
buf749 = empty_strided((120, 1, 5, 5), (25, 25, 5, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_159.run(primals_239, buf749, 3000, grid=grid(3000), stream=stream0)
del primals_239
buf750 = buf686; del buf686 # reuse
triton_poi_fused_split_with_sizes_160.run(buf745, buf750, 3010560, grid=grid(3010560), stream=stream0)
buf751 = extern_kernels.convolution(buf750, buf749, stride=(1, 1), padding=(2, 2), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=120, bias=None)
assert_size_stride(buf751, (128, 120, 14, 14), (23520, 196, 14, 1))
buf752 = empty_strided((120, 1, 7, 7), (49, 49, 7, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_161.run(primals_240, buf752, 5880, grid=grid(5880), stream=stream0)
del primals_240
buf753 = buf683; del buf683 # reuse
triton_poi_fused_split_with_sizes_162.run(buf745, buf753, 3010560, grid=grid(3010560), stream=stream0)
buf754 = extern_kernels.convolution(buf753, buf752, stride=(1, 1), padding=(3, 3), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=120, bias=None)
assert_size_stride(buf754, (128, 120, 14, 14), (23520, 196, 14, 1))
buf755 = empty_strided((120, 1, 9, 9), (81, 81, 9, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_163.run(primals_241, buf755, 9720, grid=grid(9720), stream=stream0)
del primals_241
buf756 = buf680; del buf680 # reuse
triton_poi_fused_split_with_sizes_164.run(buf745, buf756, 3010560, grid=grid(3010560), stream=stream0)
buf757 = extern_kernels.convolution(buf756, buf755, stride=(1, 1), padding=(4, 4), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=120, bias=None)
assert_size_stride(buf757, (128, 120, 14, 14), (23520, 196, 14, 1))
buf762 = buf745; del buf745 # reuse
buf758 = as_strided(buf762, (128, 120, 14, 14), (94080, 196, 14, 1)) # alias
triton_poi_fused_cat_165.run(buf748, buf758, 3010560, grid=grid(3010560), stream=stream0)
buf759 = as_strided(buf762, (128, 120, 14, 14), (94080, 196, 14, 1), 23520) # alias
triton_poi_fused_cat_165.run(buf751, buf759, 3010560, grid=grid(3010560), stream=stream0)
buf760 = as_strided(buf762, (128, 120, 14, 14), (94080, 196, 14, 1), 47040) # alias
triton_poi_fused_cat_165.run(buf754, buf760, 3010560, grid=grid(3010560), stream=stream0)
buf761 = as_strided(buf762, (128, 120, 14, 14), (94080, 196, 14, 1), 70560) # alias
triton_poi_fused_cat_165.run(buf757, buf761, 3010560, grid=grid(3010560), stream=stream0)
buf763 = buf741; del buf741 # reuse
buf764 = buf763; del buf763 # reuse
buf767 = empty_strided((480, ), (1, ), device='cuda', dtype=torch.float32)
buf765 = empty_strided((1, 480, 1, 1), (480, 1, 480, 480), device='cuda', dtype=torch.float32)
buf766 = empty_strided((480, ), (1, ), device='cuda', dtype=torch.float32)
buf768 = empty_strided((480, ), (1, ), device='cuda', dtype=torch.float32)
triton_red_fused__native_batch_norm_legit_functional_156.run(buf764, buf762, primals_427, primals_428, buf767, buf765, buf766, buf768, 480, 25088, grid=grid(480), stream=stream0)
del primals_427
del primals_428
buf769 = empty_strided((128, 480, 14, 14), (94080, 196, 14, 1), device='cuda', dtype=torch.float16)
buf771 = empty_strided((128, 480, 1, 1), (480, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_per_fused__native_batch_norm_legit_functional_mean_silu_166.run(buf762, buf764, buf765, primals_81, primals_82, buf769, buf771, 61440, 196, grid=grid(61440), stream=stream0)
del primals_82
buf772 = empty_strided((80, 480, 1, 1), (480, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_167.run(primals_242, buf772, 38400, grid=grid(38400), stream=stream0)
del primals_242
buf773 = buf705; del buf705 # reuse
triton_poi_fused__to_copy_convolution_168.run(primals_243, buf773, 80, grid=grid(80), stream=stream0)
del primals_243
buf774 = extern_kernels.convolution(buf771, buf772, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None)
assert_size_stride(buf774, (128, 80, 1, 1), (80, 1, 1, 1))
buf775 = buf774; del buf774 # reuse
buf776 = empty_strided((128, 80, 1, 1), (80, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_convolution_silu_169.run(buf775, buf773, buf776, 10240, grid=grid(10240), stream=stream0)
buf777 = empty_strided((480, 80, 1, 1), (80, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_170.run(primals_244, buf777, 38400, grid=grid(38400), stream=stream0)
del primals_244
buf778 = buf710; del buf710 # reuse
triton_poi_fused__to_copy_convolution_171.run(primals_245, buf778, 480, grid=grid(480), stream=stream0)
del primals_245
buf779 = extern_kernels.convolution(buf776, buf777, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None)
assert_size_stride(buf779, (128, 480, 1, 1), (480, 1, 1, 1))
buf780 = buf779; del buf779 # reuse
triton_poi_fused__to_copy_convolution_172.run(buf780, buf778, 61440, grid=grid(61440), stream=stream0)
buf781 = empty_strided((128, 480, 14, 14), (94080, 196, 14, 1), device='cuda', dtype=torch.float16)
triton_poi_fused_mul_sigmoid_silu_173.run(buf769, buf780, buf781, 12042240, grid=grid(12042240), stream=stream0)
buf782 = buf735; del buf735 # reuse
triton_poi_fused_split_with_sizes_174.run(buf781, buf782, 6021120, grid=grid(6021120), stream=stream0)
buf783 = buf733; del buf733 # reuse
triton_poi_fused_split_with_sizes_175.run(buf781, buf783, 6021120, grid=grid(6021120), stream=stream0)
buf784 = empty_strided((80, 240, 1, 1), (240, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_176.run(primals_246, buf784, 19200, grid=grid(19200), stream=stream0)
del primals_246
buf785 = extern_kernels.convolution(buf782, buf784, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None)
assert_size_stride(buf785, (128, 80, 14, 14), (15680, 196, 14, 1))
buf786 = empty_strided((80, 240, 1, 1), (240, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_176.run(primals_247, buf786, 19200, grid=grid(19200), stream=stream0)
del primals_247
buf787 = extern_kernels.convolution(buf783, buf786, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None)
assert_size_stride(buf787, (128, 80, 14, 14), (15680, 196, 14, 1))
buf790 = empty_strided((128, 160, 14, 14), (31360, 196, 14, 1), device='cuda', dtype=torch.float16)
buf788 = as_strided(buf790, (128, 80, 14, 14), (31360, 196, 14, 1)) # alias
triton_poi_fused_cat_177.run(buf785, buf788, 2007040, grid=grid(2007040), stream=stream0)
del buf785
buf789 = as_strided(buf790, (128, 80, 14, 14), (31360, 196, 14, 1), 15680) # alias
triton_poi_fused_cat_177.run(buf787, buf789, 2007040, grid=grid(2007040), stream=stream0)
del buf787
buf791 = buf726; del buf726 # reuse
triton_red_fused__native_batch_norm_legit_functional_149.run(buf790, buf791, 640, 6272, grid=grid(640), stream=stream0)
buf792 = buf727; del buf727 # reuse
buf793 = buf792; del buf792 # reuse
buf797 = empty_strided((160, ), (1, ), device='cuda', dtype=torch.float32)
triton_per_fused__native_batch_norm_legit_functional_150.run(buf793, buf791, primals_430, buf797, 160, 4, grid=grid(160), stream=stream0)
del primals_430
buf794 = buf791; del buf791 # reuse
triton_red_fused__native_batch_norm_legit_functional_151.run(buf790, buf793, buf794, 640, 6272, grid=grid(640), stream=stream0)
buf795 = empty_strided((1, 160, 1, 1), (160, 1, 160, 160), device='cuda', dtype=torch.float32)
buf796 = empty_strided((160, ), (1, ), device='cuda', dtype=torch.float32)
buf798 = empty_strided((160, ), (1, ), device='cuda', dtype=torch.float32)
triton_per_fused__native_batch_norm_legit_functional_152.run(buf794, primals_431, buf795, buf796, buf798, 160, 4, grid=grid(160), stream=stream0)
del primals_431
buf799 = empty_strided((128, 160, 14, 14), (31360, 196, 14, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__native_batch_norm_legit_functional_add_178.run(buf790, buf793, buf795, primals_83, primals_84, buf731, buf799, 4014080, grid=grid(4014080), stream=stream0)
del primals_84
buf800 = empty_strided((240, 80, 1, 1), (80, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_154.run(primals_248, buf800, 19200, grid=grid(19200), stream=stream0)
del primals_248
buf801 = extern_kernels.convolution(as_strided(buf799, (128, 80, 14, 14), (31360, 196, 14, 1)), buf800, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None)
assert_size_stride(buf801, (128, 240, 14, 14), (47040, 196, 14, 1))
buf802 = empty_strided((240, 80, 1, 1), (80, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_154.run(primals_249, buf802, 19200, grid=grid(19200), stream=stream0)
del primals_249
buf803 = extern_kernels.convolution(as_strided(buf799, (128, 80, 14, 14), (31360, 196, 14, 1), 15680), buf802, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None)
assert_size_stride(buf803, (128, 240, 14, 14), (47040, 196, 14, 1))
buf806 = buf781; del buf781 # reuse
buf804 = as_strided(buf806, (128, 240, 14, 14), (94080, 196, 14, 1)) # alias
triton_poi_fused_cat_155.run(buf801, buf804, 6021120, grid=grid(6021120), stream=stream0)
buf805 = as_strided(buf806, (128, 240, 14, 14), (94080, 196, 14, 1), 47040) # alias
triton_poi_fused_cat_155.run(buf803, buf805, 6021120, grid=grid(6021120), stream=stream0)
buf807 = buf765; del buf765 # reuse
buf808 = buf807; del buf807 # reuse
buf811 = empty_strided((480, ), (1, ), device='cuda', dtype=torch.float32)
buf809 = empty_strided((1, 480, 1, 1), (480, 1, 480, 480), device='cuda', dtype=torch.float32)
buf810 = empty_strided((480, ), (1, ), device='cuda', dtype=torch.float32)
buf812 = empty_strided((480, ), (1, ), device='cuda', dtype=torch.float32)
triton_red_fused__native_batch_norm_legit_functional_156.run(buf808, buf806, primals_433, primals_434, buf811, buf809, buf810, buf812, 480, 25088, grid=grid(480), stream=stream0)
del primals_433
del primals_434
buf813 = empty_strided((128, 480, 14, 14), (94080, 196, 14, 1), device='cuda', dtype=torch.float16)
buf1130 = empty_strided((128, 480, 14, 14), (94080, 196, 14, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__native_batch_norm_legit_functional_add_clone_fill_mul_sigmoid_sub_157.run(buf806, buf808, buf809, primals_85, primals_86, buf813, buf1130, 12042240, grid=grid(12042240), stream=stream0)
del primals_86
buf814 = empty_strided((120, 1, 3, 3), (9, 9, 3, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_42.run(primals_250, buf814, 1080, grid=grid(1080), stream=stream0)
del primals_250
buf815 = buf757; del buf757 # reuse
triton_poi_fused_split_with_sizes_158.run(buf813, buf815, 3010560, grid=grid(3010560), stream=stream0)
buf816 = extern_kernels.convolution(buf815, buf814, stride=(1, 1), padding=(1, 1), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=120, bias=None)
assert_size_stride(buf816, (128, 120, 14, 14), (23520, 196, 14, 1))
buf817 = empty_strided((120, 1, 5, 5), (25, 25, 5, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_159.run(primals_251, buf817, 3000, grid=grid(3000), stream=stream0)
del primals_251
buf818 = buf754; del buf754 # reuse
triton_poi_fused_split_with_sizes_160.run(buf813, buf818, 3010560, grid=grid(3010560), stream=stream0)
buf819 = extern_kernels.convolution(buf818, buf817, stride=(1, 1), padding=(2, 2), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=120, bias=None)
assert_size_stride(buf819, (128, 120, 14, 14), (23520, 196, 14, 1))
buf820 = empty_strided((120, 1, 7, 7), (49, 49, 7, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_161.run(primals_252, buf820, 5880, grid=grid(5880), stream=stream0)
del primals_252
buf821 = buf751; del buf751 # reuse
triton_poi_fused_split_with_sizes_162.run(buf813, buf821, 3010560, grid=grid(3010560), stream=stream0)
buf822 = extern_kernels.convolution(buf821, buf820, stride=(1, 1), padding=(3, 3), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=120, bias=None)
assert_size_stride(buf822, (128, 120, 14, 14), (23520, 196, 14, 1))
buf823 = empty_strided((120, 1, 9, 9), (81, 81, 9, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_163.run(primals_253, buf823, 9720, grid=grid(9720), stream=stream0)
del primals_253
buf824 = buf748; del buf748 # reuse
triton_poi_fused_split_with_sizes_164.run(buf813, buf824, 3010560, grid=grid(3010560), stream=stream0)
buf825 = extern_kernels.convolution(buf824, buf823, stride=(1, 1), padding=(4, 4), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=120, bias=None)
assert_size_stride(buf825, (128, 120, 14, 14), (23520, 196, 14, 1))
buf830 = buf813; del buf813 # reuse
buf826 = as_strided(buf830, (128, 120, 14, 14), (94080, 196, 14, 1)) # alias
triton_poi_fused_cat_165.run(buf816, buf826, 3010560, grid=grid(3010560), stream=stream0)
del buf816
buf827 = as_strided(buf830, (128, 120, 14, 14), (94080, 196, 14, 1), 23520) # alias
triton_poi_fused_cat_165.run(buf819, buf827, 3010560, grid=grid(3010560), stream=stream0)
del buf819
buf828 = as_strided(buf830, (128, 120, 14, 14), (94080, 196, 14, 1), 47040) # alias
triton_poi_fused_cat_165.run(buf822, buf828, 3010560, grid=grid(3010560), stream=stream0)
del buf822
buf829 = as_strided(buf830, (128, 120, 14, 14), (94080, 196, 14, 1), 70560) # alias
triton_poi_fused_cat_165.run(buf825, buf829, 3010560, grid=grid(3010560), stream=stream0)
del buf825
buf831 = buf809; del buf809 # reuse
buf832 = buf831; del buf831 # reuse
buf835 = empty_strided((480, ), (1, ), device='cuda', dtype=torch.float32)
buf833 = empty_strided((1, 480, 1, 1), (480, 1, 480, 480), device='cuda', dtype=torch.float32)
buf834 = empty_strided((480, ), (1, ), device='cuda', dtype=torch.float32)
buf836 = empty_strided((480, ), (1, ), device='cuda', dtype=torch.float32)
triton_red_fused__native_batch_norm_legit_functional_156.run(buf832, buf830, primals_436, primals_437, buf835, buf833, buf834, buf836, 480, 25088, grid=grid(480), stream=stream0)
del primals_436
del primals_437
buf837 = empty_strided((128, 480, 14, 14), (94080, 196, 14, 1), device='cuda', dtype=torch.float16)
buf839 = empty_strided((128, 480, 1, 1), (480, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_per_fused__native_batch_norm_legit_functional_mean_silu_166.run(buf830, buf832, buf833, primals_87, primals_88, buf837, buf839, 61440, 196, grid=grid(61440), stream=stream0)
del buf833
del primals_88
buf840 = empty_strided((80, 480, 1, 1), (480, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_167.run(primals_254, buf840, 38400, grid=grid(38400), stream=stream0)
del primals_254
buf841 = buf773; del buf773 # reuse
triton_poi_fused__to_copy_convolution_168.run(primals_255, buf841, 80, grid=grid(80), stream=stream0)
del primals_255
buf842 = extern_kernels.convolution(buf839, buf840, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None)
assert_size_stride(buf842, (128, 80, 1, 1), (80, 1, 1, 1))
buf843 = buf842; del buf842 # reuse
buf844 = empty_strided((128, 80, 1, 1), (80, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_convolution_silu_169.run(buf843, buf841, buf844, 10240, grid=grid(10240), stream=stream0)
buf845 = empty_strided((480, 80, 1, 1), (80, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_170.run(primals_256, buf845, 38400, grid=grid(38400), stream=stream0)
del primals_256
buf846 = buf778; del buf778 # reuse
triton_poi_fused__to_copy_convolution_171.run(primals_257, buf846, 480, grid=grid(480), stream=stream0)
del primals_257
buf847 = extern_kernels.convolution(buf844, buf845, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None)
assert_size_stride(buf847, (128, 480, 1, 1), (480, 1, 1, 1))
buf848 = buf847; del buf847 # reuse
triton_poi_fused__to_copy_convolution_172.run(buf848, buf846, 61440, grid=grid(61440), stream=stream0)
del buf846
buf849 = empty_strided((128, 480, 14, 14), (94080, 196, 14, 1), device='cuda', dtype=torch.float16)
triton_poi_fused_mul_sigmoid_silu_173.run(buf837, buf848, buf849, 12042240, grid=grid(12042240), stream=stream0)
buf850 = buf803; del buf803 # reuse
triton_poi_fused_split_with_sizes_174.run(buf849, buf850, 6021120, grid=grid(6021120), stream=stream0)
buf851 = buf801; del buf801 # reuse
triton_poi_fused_split_with_sizes_175.run(buf849, buf851, 6021120, grid=grid(6021120), stream=stream0)
del buf849
buf852 = empty_strided((80, 240, 1, 1), (240, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_176.run(primals_258, buf852, 19200, grid=grid(19200), stream=stream0)
del primals_258
buf853 = extern_kernels.convolution(buf850, buf852, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None)
assert_size_stride(buf853, (128, 80, 14, 14), (15680, 196, 14, 1))
buf854 = empty_strided((80, 240, 1, 1), (240, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_176.run(primals_259, buf854, 19200, grid=grid(19200), stream=stream0)
del primals_259
buf855 = extern_kernels.convolution(buf851, buf854, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None)
assert_size_stride(buf855, (128, 80, 14, 14), (15680, 196, 14, 1))
buf858 = empty_strided((128, 160, 14, 14), (31360, 196, 14, 1), device='cuda', dtype=torch.float16)
buf856 = as_strided(buf858, (128, 80, 14, 14), (31360, 196, 14, 1)) # alias
triton_poi_fused_cat_177.run(buf853, buf856, 2007040, grid=grid(2007040), stream=stream0)
del buf853
buf857 = as_strided(buf858, (128, 80, 14, 14), (31360, 196, 14, 1), 15680) # alias
triton_poi_fused_cat_177.run(buf855, buf857, 2007040, grid=grid(2007040), stream=stream0)
del buf855
buf859 = buf794; del buf794 # reuse
triton_red_fused__native_batch_norm_legit_functional_149.run(buf858, buf859, 640, 6272, grid=grid(640), stream=stream0)
buf860 = buf795; del buf795 # reuse
buf861 = buf860; del buf860 # reuse
buf865 = empty_strided((160, ), (1, ), device='cuda', dtype=torch.float32)
triton_per_fused__native_batch_norm_legit_functional_150.run(buf861, buf859, primals_439, buf865, 160, 4, grid=grid(160), stream=stream0)
del primals_439
buf862 = buf859; del buf859 # reuse
triton_red_fused__native_batch_norm_legit_functional_151.run(buf858, buf861, buf862, 640, 6272, grid=grid(640), stream=stream0)
buf863 = empty_strided((1, 160, 1, 1), (160, 1, 160, 160), device='cuda', dtype=torch.float32)
buf864 = empty_strided((160, ), (1, ), device='cuda', dtype=torch.float32)
buf866 = empty_strided((160, ), (1, ), device='cuda', dtype=torch.float32)
triton_per_fused__native_batch_norm_legit_functional_152.run(buf862, primals_440, buf863, buf864, buf866, 160, 4, grid=grid(160), stream=stream0)
del buf862
del primals_440
buf867 = empty_strided((128, 160, 14, 14), (31360, 196, 14, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__native_batch_norm_legit_functional_add_178.run(buf858, buf861, buf863, primals_89, primals_90, buf799, buf867, 4014080, grid=grid(4014080), stream=stream0)
del buf863
del primals_90
buf868 = empty_strided((960, 160, 1, 1), (160, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_179.run(primals_260, buf868, 153600, grid=grid(153600), stream=stream0)
del primals_260
buf869 = extern_kernels.convolution(buf867, buf868, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None)
assert_size_stride(buf869, (128, 960, 14, 14), (188160, 196, 14, 1))
buf870 = empty_strided((1, 960, 1, 1), (960, 1, 960, 960), device='cuda', dtype=torch.float32)
buf871 = buf870; del buf870 # reuse
buf874 = empty_strided((960, ), (1, ), device='cuda', dtype=torch.float32)
buf872 = empty_strided((1, 960, 1, 1), (960, 1, 960, 960), device='cuda', dtype=torch.float32)
buf873 = empty_strided((960, ), (1, ), device='cuda', dtype=torch.float32)
buf875 = empty_strided((960, ), (1, ), device='cuda', dtype=torch.float32)
triton_red_fused__native_batch_norm_legit_functional_180.run(buf871, buf869, primals_442, primals_443, buf874, buf872, buf873, buf875, 960, 25088, grid=grid(960), stream=stream0)
del primals_442
del primals_443
buf876 = empty_strided((128, 960, 14, 14), (188160, 196, 14, 1), device='cuda', dtype=torch.float16)
buf1128 = empty_strided((128, 960, 14, 14), (188160, 196, 14, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__native_batch_norm_legit_functional_add_clone_fill_mul_sigmoid_sub_181.run(buf869, buf871, buf872, primals_91, primals_92, buf876, buf1128, 24084480, grid=grid(24084480), stream=stream0)
del primals_92
buf877 = empty_strided((240, 1, 3, 3), (9, 9, 3, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_182.run(primals_261, buf877, 2160, grid=grid(2160), stream=stream0)
del primals_261
buf878 = as_strided(buf150, (128, 240, 14, 14), (47040, 196, 14, 1)); del buf150 # reuse
triton_poi_fused_split_with_sizes_183.run(buf876, buf878, 6021120, grid=grid(6021120), stream=stream0)
buf879 = extern_kernels.convolution(buf878, buf877, stride=(2, 2), padding=(1, 1), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=240, bias=None)
assert_size_stride(buf879, (128, 240, 7, 7), (11760, 49, 7, 1))
buf880 = empty_strided((240, 1, 5, 5), (25, 25, 5, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_184.run(primals_262, buf880, 6000, grid=grid(6000), stream=stream0)
del primals_262
buf881 = as_strided(buf147, (128, 240, 14, 14), (47040, 196, 14, 1)); del buf147 # reuse
triton_poi_fused_split_with_sizes_185.run(buf876, buf881, 6021120, grid=grid(6021120), stream=stream0)
buf882 = extern_kernels.convolution(buf881, buf880, stride=(2, 2), padding=(2, 2), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=240, bias=None)
assert_size_stride(buf882, (128, 240, 7, 7), (11760, 49, 7, 1))
buf883 = empty_strided((240, 1, 7, 7), (49, 49, 7, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_186.run(primals_263, buf883, 11760, grid=grid(11760), stream=stream0)
del primals_263
buf884 = as_strided(buf144, (128, 240, 14, 14), (47040, 196, 14, 1)); del buf144 # reuse
triton_poi_fused_split_with_sizes_187.run(buf876, buf884, 6021120, grid=grid(6021120), stream=stream0)
buf885 = extern_kernels.convolution(buf884, buf883, stride=(2, 2), padding=(3, 3), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=240, bias=None)
assert_size_stride(buf885, (128, 240, 7, 7), (11760, 49, 7, 1))
buf886 = empty_strided((240, 1, 9, 9), (81, 81, 9, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_188.run(primals_264, buf886, 19440, grid=grid(19440), stream=stream0)
del primals_264
buf887 = as_strided(buf141, (128, 240, 14, 14), (47040, 196, 14, 1)); del buf141 # reuse
triton_poi_fused_split_with_sizes_189.run(buf876, buf887, 6021120, grid=grid(6021120), stream=stream0)
del buf876
buf888 = extern_kernels.convolution(buf887, buf886, stride=(2, 2), padding=(4, 4), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=240, bias=None)
assert_size_stride(buf888, (128, 240, 7, 7), (11760, 49, 7, 1))
buf893 = empty_strided((128, 960, 7, 7), (47040, 49, 7, 1), device='cuda', dtype=torch.float16)
buf889 = as_strided(buf893, (128, 240, 7, 7), (47040, 49, 7, 1)) # alias
triton_poi_fused_cat_190.run(buf879, buf889, 1505280, grid=grid(1505280), stream=stream0)
del buf879
buf890 = as_strided(buf893, (128, 240, 7, 7), (47040, 49, 7, 1), 11760) # alias
triton_poi_fused_cat_190.run(buf882, buf890, 1505280, grid=grid(1505280), stream=stream0)
del buf882
buf891 = as_strided(buf893, (128, 240, 7, 7), (47040, 49, 7, 1), 23520) # alias
triton_poi_fused_cat_190.run(buf885, buf891, 1505280, grid=grid(1505280), stream=stream0)
del buf885
buf892 = as_strided(buf893, (128, 240, 7, 7), (47040, 49, 7, 1), 35280) # alias
triton_poi_fused_cat_190.run(buf888, buf892, 1505280, grid=grid(1505280), stream=stream0)
del buf888
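# Downsampling variant of the mixed-kernel block: the same four-way channel
# split (960 -> 4 x 240) with 3x3/5x5/7x7/9x9 depthwise kernels, but with
# stride=(2, 2), taking the feature map from 14x14 to 7x7 (buf893 is
# (128, 960, 7, 7)).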
buf894 = buf872; del buf872 # reuse
buf895 = buf894; del buf894 # reuse
buf898 = empty_strided((960, ), (1, ), device='cuda', dtype=torch.float32)
buf896 = empty_strided((1, 960, 1, 1), (960, 1, 960, 960), device='cuda', dtype=torch.float32)
buf897 = empty_strided((960, ), (1, ), device='cuda', dtype=torch.float32)
buf899 = empty_strided((960, ), (1, ), device='cuda', dtype=torch.float32)
triton_red_fused__native_batch_norm_legit_functional_191.run(buf895, buf893, primals_445, primals_446, buf898, buf896, buf897, buf899, 960, 6272, grid=grid(960), stream=stream0)
del primals_445
del primals_446
buf900 = empty_strided((128, 960, 7, 7), (47040, 49, 7, 1), device='cuda', dtype=torch.float16)
buf902 = empty_strided((128, 960, 1, 1), (960, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_per_fused__native_batch_norm_legit_functional_mean_silu_192.run(buf893, buf895, buf896, primals_93, primals_94, buf900, buf902, 122880, 49, grid=grid(122880), stream=stream0)
del buf896
del primals_94
buf903 = empty_strided((80, 960, 1, 1), (960, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_193.run(primals_265, buf903, 76800, grid=grid(76800), stream=stream0)
del primals_265
buf904 = buf841; del buf841 # reuse
triton_poi_fused__to_copy_convolution_168.run(primals_266, buf904, 80, grid=grid(80), stream=stream0)
del primals_266
buf905 = extern_kernels.convolution(buf902, buf903, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None)
assert_size_stride(buf905, (128, 80, 1, 1), (80, 1, 1, 1))
buf906 = buf905; del buf905 # reuse
buf907 = empty_strided((128, 80, 1, 1), (80, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_convolution_silu_169.run(buf906, buf904, buf907, 10240, grid=grid(10240), stream=stream0)
del buf904
buf908 = empty_strided((960, 80, 1, 1), (80, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_194.run(primals_267, buf908, 76800, grid=grid(76800), stream=stream0)
del primals_267
buf909 = empty_strided((960, ), (1, ), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_convolution_195.run(primals_268, buf909, 960, grid=grid(960), stream=stream0)
del primals_268
buf910 = extern_kernels.convolution(buf907, buf908, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None)
assert_size_stride(buf910, (128, 960, 1, 1), (960, 1, 1, 1))
buf911 = buf910; del buf910 # reuse
triton_poi_fused__to_copy_convolution_196.run(buf911, buf909, 122880, grid=grid(122880), stream=stream0)
del buf909
buf912 = empty_strided((128, 960, 7, 7), (47040, 49, 7, 1), device='cuda', dtype=torch.float16)
triton_poi_fused_mul_sigmoid_silu_197.run(buf900, buf911, buf912, 6021120, grid=grid(6021120), stream=stream0)
buf913 = empty_strided((264, 960, 1, 1), (960, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_198.run(primals_269, buf913, 253440, grid=grid(253440), stream=stream0)
del primals_269
buf914 = extern_kernels.convolution(buf912, buf913, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None)
assert_size_stride(buf914, (128, 264, 7, 7), (12936, 49, 7, 1))
buf915 = empty_strided((1, 264, 1, 1), (264, 1, 264, 264), device='cuda', dtype=torch.float32)
buf916 = buf915; del buf915 # reuse
buf919 = empty_strided((264, ), (1, ), device='cuda', dtype=torch.float32)
buf917 = empty_strided((1, 264, 1, 1), (264, 1, 264, 264), device='cuda', dtype=torch.float32)
buf918 = empty_strided((264, ), (1, ), device='cuda', dtype=torch.float32)
buf920 = empty_strided((264, ), (1, ), device='cuda', dtype=torch.float32)
triton_red_fused__native_batch_norm_legit_functional_199.run(buf916, buf914, primals_448, primals_449, buf919, buf917, buf918, buf920, 264, 6272, grid=grid(264), stream=stream0)
del primals_448
del primals_449
buf921 = empty_strided((128, 264, 7, 7), (12936, 49, 7, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__native_batch_norm_legit_functional_200.run(buf914, buf916, buf917, primals_95, primals_96, buf921, 1655808, grid=grid(1655808), stream=stream0)
del primals_96
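# Stage transition: the 7x7 features are projected 960 -> 264 by a plain 1x1
# conv + batch norm with no activation and no residual add (the channel count
# changes here, so there is no matching skip input).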
buf922 = empty_strided((1584, 264, 1, 1), (264, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_201.run(primals_270, buf922, 418176, grid=grid(418176), stream=stream0)
del primals_270
buf923 = extern_kernels.convolution(buf921, buf922, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None)
assert_size_stride(buf923, (128, 1584, 7, 7), (77616, 49, 7, 1))
buf924 = empty_strided((1, 1584, 1, 1), (1584, 1, 1584, 1584), device='cuda', dtype=torch.float32)
buf925 = buf924; del buf924 # reuse
buf928 = empty_strided((1584, ), (1, ), device='cuda', dtype=torch.float32)
buf926 = empty_strided((1, 1584, 1, 1), (1584, 1, 1584, 1584), device='cuda', dtype=torch.float32)
buf927 = empty_strided((1584, ), (1, ), device='cuda', dtype=torch.float32)
buf929 = empty_strided((1584, ), (1, ), device='cuda', dtype=torch.float32)
triton_red_fused__native_batch_norm_legit_functional_202.run(buf925, buf923, primals_451, primals_452, buf928, buf926, buf927, buf929, 1584, 6272, grid=grid(1584), stream=stream0)
del primals_451
del primals_452
buf930 = empty_strided((128, 1584, 7, 7), (77616, 49, 7, 1), device='cuda', dtype=torch.float16)
buf1126 = empty_strided((128, 1584, 7, 7), (77616, 49, 7, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__native_batch_norm_legit_functional_add_clone_fill_mul_sigmoid_sub_203.run(buf923, buf925, buf926, primals_97, primals_98, buf930, buf1126, 9934848, grid=grid(9934848), stream=stream0)
del primals_98
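# Next block: 1x1 expansion 264 -> 1584 (6x expansion ratio) with batch norm
# and the fused SiLU kernel (buf1126 again saved for backward), followed by
# another four-way mixed depthwise convolution over 396-channel groups.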
buf931 = empty_strided((396, 1, 3, 3), (9, 9, 3, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_204.run(primals_271, buf931, 3564, grid=grid(3564), stream=stream0)
del primals_271
buf932 = empty_strided((128, 396, 7, 7), (19404, 49, 7, 1), device='cuda', dtype=torch.float16)
triton_poi_fused_split_with_sizes_205.run(buf930, buf932, 2483712, grid=grid(2483712), stream=stream0)
buf933 = extern_kernels.convolution(buf932, buf931, stride=(1, 1), padding=(1, 1), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=396, bias=None)
assert_size_stride(buf933, (128, 396, 7, 7), (19404, 49, 7, 1))
buf934 = empty_strided((396, 1, 5, 5), (25, 25, 5, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_206.run(primals_272, buf934, 9900, grid=grid(9900), stream=stream0)
del primals_272
buf935 = empty_strided((128, 396, 7, 7), (19404, 49, 7, 1), device='cuda', dtype=torch.float16)
triton_poi_fused_split_with_sizes_207.run(buf930, buf935, 2483712, grid=grid(2483712), stream=stream0)
buf936 = extern_kernels.convolution(buf935, buf934, stride=(1, 1), padding=(2, 2), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=396, bias=None)
assert_size_stride(buf936, (128, 396, 7, 7), (19404, 49, 7, 1))
buf937 = empty_strided((396, 1, 7, 7), (49, 49, 7, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_208.run(primals_273, buf937, 19404, grid=grid(19404), stream=stream0)
del primals_273
buf938 = empty_strided((128, 396, 7, 7), (19404, 49, 7, 1), device='cuda', dtype=torch.float16)
triton_poi_fused_split_with_sizes_209.run(buf930, buf938, 2483712, grid=grid(2483712), stream=stream0)
buf939 = extern_kernels.convolution(buf938, buf937, stride=(1, 1), padding=(3, 3), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=396, bias=None)
assert_size_stride(buf939, (128, 396, 7, 7), (19404, 49, 7, 1))
buf940 = empty_strided((396, 1, 9, 9), (81, 81, 9, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_210.run(primals_274, buf940, 32076, grid=grid(32076), stream=stream0)
del primals_274
buf941 = empty_strided((128, 396, 7, 7), (19404, 49, 7, 1), device='cuda', dtype=torch.float16)
triton_poi_fused_split_with_sizes_211.run(buf930, buf941, 2483712, grid=grid(2483712), stream=stream0)
buf942 = extern_kernels.convolution(buf941, buf940, stride=(1, 1), padding=(4, 4), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=396, bias=None)
assert_size_stride(buf942, (128, 396, 7, 7), (19404, 49, 7, 1))
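# The concatenation avoids a separate cat buffer: buf947 reuses buf930's
# storage, and buf943..buf946 are as_strided views into it at channel offsets
# 0, 19404, 38808, 58212 (multiples of 396*7*7), so each branch's cat kernel
# writes its slice directly into the shared output.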
buf947 = buf930; del buf930 # reuse
buf943 = as_strided(buf947, (128, 396, 7, 7), (77616, 49, 7, 1)) # alias
triton_poi_fused_cat_212.run(buf933, buf943, 2483712, grid=grid(2483712), stream=stream0)
buf944 = as_strided(buf947, (128, 396, 7, 7), (77616, 49, 7, 1), 19404) # alias
triton_poi_fused_cat_213.run(buf936, buf944, 2483712, grid=grid(2483712), stream=stream0)
buf945 = as_strided(buf947, (128, 396, 7, 7), (77616, 49, 7, 1), 38808) # alias
triton_poi_fused_cat_213.run(buf939, buf945, 2483712, grid=grid(2483712), stream=stream0)
buf946 = as_strided(buf947, (128, 396, 7, 7), (77616, 49, 7, 1), 58212) # alias
triton_poi_fused_cat_213.run(buf942, buf946, 2483712, grid=grid(2483712), stream=stream0)
buf948 = buf926; del buf926 # reuse
buf949 = buf948; del buf948 # reuse
buf952 = empty_strided((1584, ), (1, ), device='cuda', dtype=torch.float32)
buf950 = empty_strided((1, 1584, 1, 1), (1584, 1, 1584, 1584), device='cuda', dtype=torch.float32)
buf951 = empty_strided((1584, ), (1, ), device='cuda', dtype=torch.float32)
buf953 = empty_strided((1584, ), (1, ), device='cuda', dtype=torch.float32)
triton_red_fused__native_batch_norm_legit_functional_202.run(buf949, buf947, primals_454, primals_455, buf952, buf950, buf951, buf953, 1584, 6272, grid=grid(1584), stream=stream0)
del primals_454
del primals_455
buf954 = empty_strided((128, 1584, 7, 7), (77616, 49, 7, 1), device='cuda', dtype=torch.float16)
buf956 = empty_strided((128, 1584, 1, 1), (1584, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_per_fused__native_batch_norm_legit_functional_mean_silu_214.run(buf947, buf949, buf950, primals_99, primals_100, buf954, buf956, 202752, 49, grid=grid(202752), stream=stream0)
del primals_100
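# Squeeze-and-excitation block: kernel 214 above fused the batch norm, the SiLU
# activation, and a global average pool, so buf956 already holds the per-channel
# mean with shape (128, 1584, 1, 1). It is squeezed 1584 -> 132 by a 1x1 conv
# plus SiLU (kernel 217), expanded back 132 -> 1584 (kernel 220), and applied as
# a sigmoid gate on buf954 (kernel 221). Roughly, in eager terms (hypothetical
# names; assumes `import torch.nn.functional as F`):
#
#     s = x.mean(dim=(2, 3), keepdim=True)          # squeeze
#     s = F.silu(F.conv2d(s, w_reduce, b_reduce))   # 1584 -> 132
#     s = F.conv2d(s, w_expand, b_expand)           # 132 -> 1584
#     y = x * torch.sigmoid(s)                      # excite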
buf957 = empty_strided((132, 1584, 1, 1), (1584, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_215.run(primals_275, buf957, 209088, grid=grid(209088), stream=stream0)
del primals_275
buf958 = empty_strided((132, ), (1, ), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_convolution_216.run(primals_276, buf958, 132, grid=grid(132), stream=stream0)
del primals_276
buf959 = extern_kernels.convolution(buf956, buf957, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None)
assert_size_stride(buf959, (128, 132, 1, 1), (132, 1, 1, 1))
buf960 = buf959; del buf959 # reuse
buf961 = empty_strided((128, 132, 1, 1), (132, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_convolution_silu_217.run(buf960, buf958, buf961, 16896, grid=grid(16896), stream=stream0)
buf962 = empty_strided((1584, 132, 1, 1), (132, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_218.run(primals_277, buf962, 209088, grid=grid(209088), stream=stream0)
del primals_277
buf963 = empty_strided((1584, ), (1, ), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_convolution_219.run(primals_278, buf963, 1584, grid=grid(1584), stream=stream0)
del primals_278
buf964 = extern_kernels.convolution(buf961, buf962, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None)
assert_size_stride(buf964, (128, 1584, 1, 1), (1584, 1, 1, 1))
buf965 = buf964; del buf964 # reuse
triton_poi_fused__to_copy_convolution_220.run(buf965, buf963, 202752, grid=grid(202752), stream=stream0)
buf966 = empty_strided((128, 1584, 7, 7), (77616, 49, 7, 1), device='cuda', dtype=torch.float16)
triton_poi_fused_mul_sigmoid_silu_221.run(buf954, buf965, buf966, 9934848, grid=grid(9934848), stream=stream0)
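# Projection back to 264 channels, implemented as a 2-group pointwise conv:
# buf966 is split into two 792-channel halves, each mapped 792 -> 132 by its
# own 1x1 convolution, and the two 132-channel outputs are concatenated --
# equivalent to a single conv2d with groups=2 from 1584 to 264 channels.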
buf967 = empty_strided((128, 792, 7, 7), (38808, 49, 7, 1), device='cuda', dtype=torch.float16)
triton_poi_fused_split_with_sizes_222.run(buf966, buf967, 4967424, grid=grid(4967424), stream=stream0)
buf968 = empty_strided((128, 792, 7, 7), (38808, 49, 7, 1), device='cuda', dtype=torch.float16)
triton_poi_fused_split_with_sizes_223.run(buf966, buf968, 4967424, grid=grid(4967424), stream=stream0)
buf969 = empty_strided((132, 792, 1, 1), (792, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_224.run(primals_279, buf969, 104544, grid=grid(104544), stream=stream0)
del primals_279
buf970 = extern_kernels.convolution(buf967, buf969, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None)
assert_size_stride(buf970, (128, 132, 7, 7), (6468, 49, 7, 1))
buf971 = empty_strided((132, 792, 1, 1), (792, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_224.run(primals_280, buf971, 104544, grid=grid(104544), stream=stream0)
del primals_280
buf972 = extern_kernels.convolution(buf968, buf971, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None)
assert_size_stride(buf972, (128, 132, 7, 7), (6468, 49, 7, 1))
buf975 = empty_strided((128, 264, 7, 7), (12936, 49, 7, 1), device='cuda', dtype=torch.float16)
buf973 = as_strided(buf975, (128, 132, 7, 7), (12936, 49, 7, 1)) # alias
triton_poi_fused_cat_225.run(buf970, buf973, 827904, grid=grid(827904), stream=stream0)
del buf970
buf974 = as_strided(buf975, (128, 132, 7, 7), (12936, 49, 7, 1), 6468) # alias
triton_poi_fused_cat_226.run(buf972, buf974, 827904, grid=grid(827904), stream=stream0)
del buf972
buf976 = buf917; del buf917 # reuse
buf977 = buf976; del buf976 # reuse
buf980 = empty_strided((264, ), (1, ), device='cuda', dtype=torch.float32)
buf978 = empty_strided((1, 264, 1, 1), (264, 1, 264, 264), device='cuda', dtype=torch.float32)
buf979 = empty_strided((264, ), (1, ), device='cuda', dtype=torch.float32)
buf981 = empty_strided((264, ), (1, ), device='cuda', dtype=torch.float32)
triton_red_fused__native_batch_norm_legit_functional_199.run(buf977, buf975, primals_457, primals_458, buf980, buf978, buf979, buf981, 264, 6272, grid=grid(264), stream=stream0)
del primals_457
del primals_458
buf982 = empty_strided((128, 264, 7, 7), (12936, 49, 7, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__native_batch_norm_legit_functional_add_227.run(buf975, buf977, buf978, primals_101, primals_102, buf921, buf982, 1655808, grid=grid(1655808), stream=stream0)
del primals_102
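# The residual add just above (kernel 227) fused the projection batch norm with
# the skip connection from buf921. The same inverted-residual block now repeats
# twice more (expansion 264 -> 1584, mixed depthwise convs, squeeze-excite,
# grouped projection back to 264, ending at buf1043 and then buf1104), each
# time feeding the previous block's output in as the skip.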
buf983 = empty_strided((1584, 264, 1, 1), (264, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_201.run(primals_281, buf983, 418176, grid=grid(418176), stream=stream0)
del primals_281
buf984 = extern_kernels.convolution(buf982, buf983, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None)
assert_size_stride(buf984, (128, 1584, 7, 7), (77616, 49, 7, 1))
buf985 = buf950; del buf950 # reuse
buf986 = buf985; del buf985 # reuse
buf989 = empty_strided((1584, ), (1, ), device='cuda', dtype=torch.float32)
buf987 = empty_strided((1, 1584, 1, 1), (1584, 1, 1584, 1584), device='cuda', dtype=torch.float32)
buf988 = empty_strided((1584, ), (1, ), device='cuda', dtype=torch.float32)
buf990 = empty_strided((1584, ), (1, ), device='cuda', dtype=torch.float32)
triton_red_fused__native_batch_norm_legit_functional_202.run(buf986, buf984, primals_460, primals_461, buf989, buf987, buf988, buf990, 1584, 6272, grid=grid(1584), stream=stream0)
del primals_460
del primals_461
buf991 = buf966; del buf966 # reuse
buf1124 = empty_strided((128, 1584, 7, 7), (77616, 49, 7, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__native_batch_norm_legit_functional_add_clone_fill_mul_sigmoid_sub_203.run(buf984, buf986, buf987, primals_103, primals_104, buf991, buf1124, 9934848, grid=grid(9934848), stream=stream0)
del primals_104
buf992 = empty_strided((396, 1, 3, 3), (9, 9, 3, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_204.run(primals_282, buf992, 3564, grid=grid(3564), stream=stream0)
del primals_282
buf993 = buf942; del buf942 # reuse
triton_poi_fused_split_with_sizes_205.run(buf991, buf993, 2483712, grid=grid(2483712), stream=stream0)
buf994 = extern_kernels.convolution(buf993, buf992, stride=(1, 1), padding=(1, 1), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=396, bias=None)
assert_size_stride(buf994, (128, 396, 7, 7), (19404, 49, 7, 1))
buf995 = empty_strided((396, 1, 5, 5), (25, 25, 5, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_206.run(primals_283, buf995, 9900, grid=grid(9900), stream=stream0)
del primals_283
buf996 = buf939; del buf939 # reuse
triton_poi_fused_split_with_sizes_207.run(buf991, buf996, 2483712, grid=grid(2483712), stream=stream0)
buf997 = extern_kernels.convolution(buf996, buf995, stride=(1, 1), padding=(2, 2), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=396, bias=None)
assert_size_stride(buf997, (128, 396, 7, 7), (19404, 49, 7, 1))
buf998 = empty_strided((396, 1, 7, 7), (49, 49, 7, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_208.run(primals_284, buf998, 19404, grid=grid(19404), stream=stream0)
del primals_284
buf999 = buf936; del buf936 # reuse
triton_poi_fused_split_with_sizes_209.run(buf991, buf999, 2483712, grid=grid(2483712), stream=stream0)
buf1000 = extern_kernels.convolution(buf999, buf998, stride=(1, 1), padding=(3, 3), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=396, bias=None)
assert_size_stride(buf1000, (128, 396, 7, 7), (19404, 49, 7, 1))
buf1001 = empty_strided((396, 1, 9, 9), (81, 81, 9, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_210.run(primals_285, buf1001, 32076, grid=grid(32076), stream=stream0)
del primals_285
buf1002 = buf933; del buf933 # reuse
triton_poi_fused_split_with_sizes_211.run(buf991, buf1002, 2483712, grid=grid(2483712), stream=stream0)
buf1003 = extern_kernels.convolution(buf1002, buf1001, stride=(1, 1), padding=(4, 4), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=396, bias=None)
assert_size_stride(buf1003, (128, 396, 7, 7), (19404, 49, 7, 1))
buf1008 = buf991; del buf991 # reuse
buf1004 = as_strided(buf1008, (128, 396, 7, 7), (77616, 49, 7, 1)) # alias
triton_poi_fused_cat_212.run(buf994, buf1004, 2483712, grid=grid(2483712), stream=stream0)
buf1005 = as_strided(buf1008, (128, 396, 7, 7), (77616, 49, 7, 1), 19404) # alias
triton_poi_fused_cat_213.run(buf997, buf1005, 2483712, grid=grid(2483712), stream=stream0)
buf1006 = as_strided(buf1008, (128, 396, 7, 7), (77616, 49, 7, 1), 38808) # alias
triton_poi_fused_cat_213.run(buf1000, buf1006, 2483712, grid=grid(2483712), stream=stream0)
buf1007 = as_strided(buf1008, (128, 396, 7, 7), (77616, 49, 7, 1), 58212) # alias
triton_poi_fused_cat_213.run(buf1003, buf1007, 2483712, grid=grid(2483712), stream=stream0)
buf1009 = buf987; del buf987 # reuse
buf1010 = buf1009; del buf1009 # reuse
buf1013 = empty_strided((1584, ), (1, ), device='cuda', dtype=torch.float32)
buf1011 = empty_strided((1, 1584, 1, 1), (1584, 1, 1584, 1584), device='cuda', dtype=torch.float32)
buf1012 = empty_strided((1584, ), (1, ), device='cuda', dtype=torch.float32)
buf1014 = empty_strided((1584, ), (1, ), device='cuda', dtype=torch.float32)
triton_red_fused__native_batch_norm_legit_functional_202.run(buf1010, buf1008, primals_463, primals_464, buf1013, buf1011, buf1012, buf1014, 1584, 6272, grid=grid(1584), stream=stream0)
del primals_463
del primals_464
buf1015 = empty_strided((128, 1584, 7, 7), (77616, 49, 7, 1), device='cuda', dtype=torch.float16)
buf1017 = empty_strided((128, 1584, 1, 1), (1584, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_per_fused__native_batch_norm_legit_functional_mean_silu_214.run(buf1008, buf1010, buf1011, primals_105, primals_106, buf1015, buf1017, 202752, 49, grid=grid(202752), stream=stream0)
del primals_106
buf1018 = empty_strided((132, 1584, 1, 1), (1584, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_215.run(primals_286, buf1018, 209088, grid=grid(209088), stream=stream0)
del primals_286
buf1019 = buf958; del buf958 # reuse
triton_poi_fused__to_copy_convolution_216.run(primals_287, buf1019, 132, grid=grid(132), stream=stream0)
del primals_287
buf1020 = extern_kernels.convolution(buf1017, buf1018, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None)
assert_size_stride(buf1020, (128, 132, 1, 1), (132, 1, 1, 1))
buf1021 = buf1020; del buf1020 # reuse
buf1022 = empty_strided((128, 132, 1, 1), (132, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_convolution_silu_217.run(buf1021, buf1019, buf1022, 16896, grid=grid(16896), stream=stream0)
buf1023 = empty_strided((1584, 132, 1, 1), (132, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_218.run(primals_288, buf1023, 209088, grid=grid(209088), stream=stream0)
del primals_288
buf1024 = buf963; del buf963 # reuse
triton_poi_fused__to_copy_convolution_219.run(primals_289, buf1024, 1584, grid=grid(1584), stream=stream0)
del primals_289
buf1025 = extern_kernels.convolution(buf1022, buf1023, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None)
assert_size_stride(buf1025, (128, 1584, 1, 1), (1584, 1, 1, 1))
buf1026 = buf1025; del buf1025 # reuse
triton_poi_fused__to_copy_convolution_220.run(buf1026, buf1024, 202752, grid=grid(202752), stream=stream0)
buf1027 = empty_strided((128, 1584, 7, 7), (77616, 49, 7, 1), device='cuda', dtype=torch.float16)
triton_poi_fused_mul_sigmoid_silu_221.run(buf1015, buf1026, buf1027, 9934848, grid=grid(9934848), stream=stream0)
buf1028 = empty_strided((128, 792, 7, 7), (38808, 49, 7, 1), device='cuda', dtype=torch.float16)
triton_poi_fused_split_with_sizes_222.run(buf1027, buf1028, 4967424, grid=grid(4967424), stream=stream0)
buf1029 = empty_strided((128, 792, 7, 7), (38808, 49, 7, 1), device='cuda', dtype=torch.float16)
triton_poi_fused_split_with_sizes_223.run(buf1027, buf1029, 4967424, grid=grid(4967424), stream=stream0)
buf1030 = empty_strided((132, 792, 1, 1), (792, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_224.run(primals_290, buf1030, 104544, grid=grid(104544), stream=stream0)
del primals_290
buf1031 = extern_kernels.convolution(buf1028, buf1030, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None)
assert_size_stride(buf1031, (128, 132, 7, 7), (6468, 49, 7, 1))
buf1032 = empty_strided((132, 792, 1, 1), (792, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_224.run(primals_291, buf1032, 104544, grid=grid(104544), stream=stream0)
del primals_291
buf1033 = extern_kernels.convolution(buf1029, buf1032, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None)
assert_size_stride(buf1033, (128, 132, 7, 7), (6468, 49, 7, 1))
buf1036 = empty_strided((128, 264, 7, 7), (12936, 49, 7, 1), device='cuda', dtype=torch.float16)
buf1034 = as_strided(buf1036, (128, 132, 7, 7), (12936, 49, 7, 1)) # alias
triton_poi_fused_cat_225.run(buf1031, buf1034, 827904, grid=grid(827904), stream=stream0)
del buf1031
buf1035 = as_strided(buf1036, (128, 132, 7, 7), (12936, 49, 7, 1), 6468) # alias
triton_poi_fused_cat_226.run(buf1033, buf1035, 827904, grid=grid(827904), stream=stream0)
del buf1033
buf1037 = buf978; del buf978 # reuse
buf1038 = buf1037; del buf1037 # reuse
buf1041 = empty_strided((264, ), (1, ), device='cuda', dtype=torch.float32)
buf1039 = empty_strided((1, 264, 1, 1), (264, 1, 264, 264), device='cuda', dtype=torch.float32)
buf1040 = empty_strided((264, ), (1, ), device='cuda', dtype=torch.float32)
buf1042 = empty_strided((264, ), (1, ), device='cuda', dtype=torch.float32)
triton_red_fused__native_batch_norm_legit_functional_199.run(buf1038, buf1036, primals_466, primals_467, buf1041, buf1039, buf1040, buf1042, 264, 6272, grid=grid(264), stream=stream0)
del primals_466
del primals_467
buf1043 = empty_strided((128, 264, 7, 7), (12936, 49, 7, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__native_batch_norm_legit_functional_add_227.run(buf1036, buf1038, buf1039, primals_107, primals_108, buf982, buf1043, 1655808, grid=grid(1655808), stream=stream0)
del primals_108
buf1044 = empty_strided((1584, 264, 1, 1), (264, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_201.run(primals_292, buf1044, 418176, grid=grid(418176), stream=stream0)
del primals_292
buf1045 = extern_kernels.convolution(buf1043, buf1044, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None)
assert_size_stride(buf1045, (128, 1584, 7, 7), (77616, 49, 7, 1))
buf1046 = buf1011; del buf1011 # reuse
buf1047 = buf1046; del buf1046 # reuse
buf1050 = empty_strided((1584, ), (1, ), device='cuda', dtype=torch.float32)
buf1048 = empty_strided((1, 1584, 1, 1), (1584, 1, 1584, 1584), device='cuda', dtype=torch.float32)
buf1049 = empty_strided((1584, ), (1, ), device='cuda', dtype=torch.float32)
buf1051 = empty_strided((1584, ), (1, ), device='cuda', dtype=torch.float32)
triton_red_fused__native_batch_norm_legit_functional_202.run(buf1047, buf1045, primals_469, primals_470, buf1050, buf1048, buf1049, buf1051, 1584, 6272, grid=grid(1584), stream=stream0)
del primals_469
del primals_470
buf1052 = buf1027; del buf1027 # reuse
buf1122 = empty_strided((128, 1584, 7, 7), (77616, 49, 7, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__native_batch_norm_legit_functional_add_clone_fill_mul_sigmoid_sub_203.run(buf1045, buf1047, buf1048, primals_109, primals_110, buf1052, buf1122, 9934848, grid=grid(9934848), stream=stream0)
del primals_110
buf1053 = empty_strided((396, 1, 3, 3), (9, 9, 3, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_204.run(primals_293, buf1053, 3564, grid=grid(3564), stream=stream0)
del primals_293
buf1054 = buf1003; del buf1003 # reuse
triton_poi_fused_split_with_sizes_205.run(buf1052, buf1054, 2483712, grid=grid(2483712), stream=stream0)
buf1055 = extern_kernels.convolution(buf1054, buf1053, stride=(1, 1), padding=(1, 1), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=396, bias=None)
assert_size_stride(buf1055, (128, 396, 7, 7), (19404, 49, 7, 1))
buf1056 = empty_strided((396, 1, 5, 5), (25, 25, 5, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_206.run(primals_294, buf1056, 9900, grid=grid(9900), stream=stream0)
del primals_294
buf1057 = buf1000; del buf1000 # reuse
triton_poi_fused_split_with_sizes_207.run(buf1052, buf1057, 2483712, grid=grid(2483712), stream=stream0)
buf1058 = extern_kernels.convolution(buf1057, buf1056, stride=(1, 1), padding=(2, 2), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=396, bias=None)
assert_size_stride(buf1058, (128, 396, 7, 7), (19404, 49, 7, 1))
buf1059 = empty_strided((396, 1, 7, 7), (49, 49, 7, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_208.run(primals_295, buf1059, 19404, grid=grid(19404), stream=stream0)
del primals_295
buf1060 = buf997; del buf997 # reuse
triton_poi_fused_split_with_sizes_209.run(buf1052, buf1060, 2483712, grid=grid(2483712), stream=stream0)
buf1061 = extern_kernels.convolution(buf1060, buf1059, stride=(1, 1), padding=(3, 3), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=396, bias=None)
assert_size_stride(buf1061, (128, 396, 7, 7), (19404, 49, 7, 1))
buf1062 = empty_strided((396, 1, 9, 9), (81, 81, 9, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_210.run(primals_296, buf1062, 32076, grid=grid(32076), stream=stream0)
del primals_296
buf1063 = buf994; del buf994 # reuse
triton_poi_fused_split_with_sizes_211.run(buf1052, buf1063, 2483712, grid=grid(2483712), stream=stream0)
buf1064 = extern_kernels.convolution(buf1063, buf1062, stride=(1, 1), padding=(4, 4), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=396, bias=None)
assert_size_stride(buf1064, (128, 396, 7, 7), (19404, 49, 7, 1))
buf1069 = buf1052; del buf1052 # reuse
buf1065 = as_strided(buf1069, (128, 396, 7, 7), (77616, 49, 7, 1)) # alias
triton_poi_fused_cat_212.run(buf1055, buf1065, 2483712, grid=grid(2483712), stream=stream0)
del buf1055
buf1066 = as_strided(buf1069, (128, 396, 7, 7), (77616, 49, 7, 1), 19404) # alias
triton_poi_fused_cat_213.run(buf1058, buf1066, 2483712, grid=grid(2483712), stream=stream0)
del buf1058
buf1067 = as_strided(buf1069, (128, 396, 7, 7), (77616, 49, 7, 1), 38808) # alias
triton_poi_fused_cat_213.run(buf1061, buf1067, 2483712, grid=grid(2483712), stream=stream0)
del buf1061
buf1068 = as_strided(buf1069, (128, 396, 7, 7), (77616, 49, 7, 1), 58212) # alias
triton_poi_fused_cat_213.run(buf1064, buf1068, 2483712, grid=grid(2483712), stream=stream0)
del buf1064
buf1070 = buf1048; del buf1048 # reuse
buf1071 = buf1070; del buf1070 # reuse
buf1074 = empty_strided((1584, ), (1, ), device='cuda', dtype=torch.float32)
buf1072 = empty_strided((1, 1584, 1, 1), (1584, 1, 1584, 1584), device='cuda', dtype=torch.float32)
buf1073 = empty_strided((1584, ), (1, ), device='cuda', dtype=torch.float32)
buf1075 = empty_strided((1584, ), (1, ), device='cuda', dtype=torch.float32)
triton_red_fused__native_batch_norm_legit_functional_202.run(buf1071, buf1069, primals_472, primals_473, buf1074, buf1072, buf1073, buf1075, 1584, 6272, grid=grid(1584), stream=stream0)
del primals_472
del primals_473
buf1076 = empty_strided((128, 1584, 7, 7), (77616, 49, 7, 1), device='cuda', dtype=torch.float16)
buf1078 = empty_strided((128, 1584, 1, 1), (1584, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_per_fused__native_batch_norm_legit_functional_mean_silu_214.run(buf1069, buf1071, buf1072, primals_111, primals_112, buf1076, buf1078, 202752, 49, grid=grid(202752), stream=stream0)
del buf1072
del primals_112
buf1079 = empty_strided((132, 1584, 1, 1), (1584, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_215.run(primals_297, buf1079, 209088, grid=grid(209088), stream=stream0)
del primals_297
buf1080 = buf1019; del buf1019 # reuse
triton_poi_fused__to_copy_convolution_216.run(primals_298, buf1080, 132, grid=grid(132), stream=stream0)
del primals_298
buf1081 = extern_kernels.convolution(buf1078, buf1079, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None)
assert_size_stride(buf1081, (128, 132, 1, 1), (132, 1, 1, 1))
buf1082 = buf1081; del buf1081 # reuse
buf1083 = empty_strided((128, 132, 1, 1), (132, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_convolution_silu_217.run(buf1082, buf1080, buf1083, 16896, grid=grid(16896), stream=stream0)
del buf1080
buf1084 = empty_strided((1584, 132, 1, 1), (132, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_218.run(primals_299, buf1084, 209088, grid=grid(209088), stream=stream0)
del primals_299
buf1085 = buf1024; del buf1024 # reuse
triton_poi_fused__to_copy_convolution_219.run(primals_300, buf1085, 1584, grid=grid(1584), stream=stream0)
del primals_300
buf1086 = extern_kernels.convolution(buf1083, buf1084, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None)
assert_size_stride(buf1086, (128, 1584, 1, 1), (1584, 1, 1, 1))
buf1087 = buf1086; del buf1086 # reuse
triton_poi_fused__to_copy_convolution_220.run(buf1087, buf1085, 202752, grid=grid(202752), stream=stream0)
del buf1085
buf1088 = empty_strided((128, 1584, 7, 7), (77616, 49, 7, 1), device='cuda', dtype=torch.float16)
triton_poi_fused_mul_sigmoid_silu_221.run(buf1076, buf1087, buf1088, 9934848, grid=grid(9934848), stream=stream0)
buf1089 = empty_strided((128, 792, 7, 7), (38808, 49, 7, 1), device='cuda', dtype=torch.float16)
triton_poi_fused_split_with_sizes_222.run(buf1088, buf1089, 4967424, grid=grid(4967424), stream=stream0)
buf1090 = empty_strided((128, 792, 7, 7), (38808, 49, 7, 1), device='cuda', dtype=torch.float16)
triton_poi_fused_split_with_sizes_223.run(buf1088, buf1090, 4967424, grid=grid(4967424), stream=stream0)
del buf1088
buf1091 = empty_strided((132, 792, 1, 1), (792, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_224.run(primals_301, buf1091, 104544, grid=grid(104544), stream=stream0)
del primals_301
buf1092 = extern_kernels.convolution(buf1089, buf1091, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None)
assert_size_stride(buf1092, (128, 132, 7, 7), (6468, 49, 7, 1))
buf1093 = empty_strided((132, 792, 1, 1), (792, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_224.run(primals_302, buf1093, 104544, grid=grid(104544), stream=stream0)
del primals_302
buf1094 = extern_kernels.convolution(buf1090, buf1093, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None)
assert_size_stride(buf1094, (128, 132, 7, 7), (6468, 49, 7, 1))
buf1097 = empty_strided((128, 264, 7, 7), (12936, 49, 7, 1), device='cuda', dtype=torch.float16)
buf1095 = as_strided(buf1097, (128, 132, 7, 7), (12936, 49, 7, 1)) # alias
triton_poi_fused_cat_225.run(buf1092, buf1095, 827904, grid=grid(827904), stream=stream0)
del buf1092
buf1096 = as_strided(buf1097, (128, 132, 7, 7), (12936, 49, 7, 1), 6468) # alias
triton_poi_fused_cat_226.run(buf1094, buf1096, 827904, grid=grid(827904), stream=stream0)
del buf1094
buf1098 = buf1039; del buf1039 # reuse
buf1099 = buf1098; del buf1098 # reuse
buf1102 = empty_strided((264, ), (1, ), device='cuda', dtype=torch.float32)
buf1100 = empty_strided((1, 264, 1, 1), (264, 1, 264, 264), device='cuda', dtype=torch.float32)
buf1101 = empty_strided((264, ), (1, ), device='cuda', dtype=torch.float32)
buf1103 = empty_strided((264, ), (1, ), device='cuda', dtype=torch.float32)
triton_red_fused__native_batch_norm_legit_functional_199.run(buf1099, buf1097, primals_475, primals_476, buf1102, buf1100, buf1101, buf1103, 264, 6272, grid=grid(264), stream=stream0)
del primals_475
del primals_476
buf1104 = empty_strided((128, 264, 7, 7), (12936, 49, 7, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__native_batch_norm_legit_functional_add_227.run(buf1097, buf1099, buf1100, primals_113, primals_114, buf1043, buf1104, 1655808, grid=grid(1655808), stream=stream0)
del buf1100
del primals_114
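# Classifier head: a 1x1 conv lifts 264 -> 1536 channels, kernel 229 computes
# the batch-norm statistics, and kernel 230 fuses the normalization with ReLU,
# a global average pool, and the flatten to (128, 1536) features (buf1115),
# also recording the ReLU mask (buf1120) for the backward pass. The final
# linear layer is an addmm, logits = bias + features @ weight.T, with buf1116
# viewed as (1536, 1000) via as_strided. Roughly (hypothetical names; assumes
# `import torch.nn.functional as F` and a `bn` module for the batch norm):
#
#     feats = F.relu(bn(x)).mean(dim=(2, 3))    # (128, 1536)
#     logits = F.linear(feats, w_fc, b_fc)      # (128, 1000)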
buf1105 = empty_strided((1536, 264, 1, 1), (264, 1, 1, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_228.run(primals_303, buf1105, 405504, grid=grid(405504), stream=stream0)
del primals_303
buf1106 = extern_kernels.convolution(buf1104, buf1105, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None)
assert_size_stride(buf1106, (128, 1536, 7, 7), (75264, 49, 7, 1))
buf1107 = empty_strided((1, 1536, 1, 1), (1536, 1, 1536, 1536), device='cuda', dtype=torch.float32)
buf1108 = buf1107; del buf1107 # reuse
buf1111 = empty_strided((1536, ), (1, ), device='cuda', dtype=torch.float32)
buf1109 = empty_strided((1, 1536, 1, 1), (1536, 1, 1536, 1536), device='cuda', dtype=torch.float32)
buf1110 = empty_strided((1536, ), (1, ), device='cuda', dtype=torch.float32)
buf1112 = empty_strided((1536, ), (1, ), device='cuda', dtype=torch.float32)
triton_red_fused__native_batch_norm_legit_functional_229.run(buf1108, buf1106, primals_478, primals_479, buf1111, buf1109, buf1110, buf1112, 1536, 6272, grid=grid(1536), stream=stream0)
del primals_478
del primals_479
buf1120 = empty_strided((128, 1536, 7, 7), (75264, 49, 7, 1), device='cuda', dtype=torch.bool)
buf1115 = empty_strided((128, 1536), (1536, 1), device='cuda', dtype=torch.float16)
triton_per_fused__native_batch_norm_legit_functional_mean_relu_threshold_backward_view_230.run(buf1106, buf1108, buf1109, primals_115, primals_116, buf1120, buf1115, 196608, 49, grid=grid(196608), stream=stream0)
del buf1109
del primals_116
buf1116 = empty_strided((1000, 1536), (1536, 1), device='cuda', dtype=torch.float16)
buf1119 = empty_strided((1000, 1536), (1536, 1), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_t_231.run(primals_304, buf1116, buf1119, 1536000, grid=grid(1536000), stream=stream0)
del primals_304
buf1117 = empty_strided((1000, ), (1, ), device='cuda', dtype=torch.float16)
triton_poi_fused__to_copy_232.run(primals_305, buf1117, 1000, grid=grid(1000), stream=stream0)
del primals_305
buf1118 = empty_strided((128, 1000), (1000, 1), device='cuda', dtype=torch.float16)
extern_kernels.addmm(buf1117, buf1115, as_strided(buf1116, (1536, 1000), (1, 1536)), alpha=1, beta=1, out=buf1118)
del buf1116
del buf1117
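# The long run of triton_poi_fused_add_233 launches below reads one scalar
# int64 tensor each (primals_306, 309, ..., 477 -- by position these look like
# the batch-norm num_batches_tracked counters) and writes counter + 1 into a
# fresh scalar buffer: the functionalized form of
# `bn.num_batches_tracked += 1` for every batch-norm layer in the model.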
buf1156 = empty_strided((), (), device='cuda', dtype=torch.int64)
triton_poi_fused_add_233.run(primals_306, buf1156, 1, grid=grid(1), stream=stream0)
del primals_306
buf1157 = empty_strided((), (), device='cuda', dtype=torch.int64)
triton_poi_fused_add_233.run(primals_309, buf1157, 1, grid=grid(1), stream=stream0)
del primals_309
buf1158 = empty_strided((), (), device='cuda', dtype=torch.int64)
triton_poi_fused_add_233.run(primals_312, buf1158, 1, grid=grid(1), stream=stream0)
del primals_312
buf1159 = empty_strided((), (), device='cuda', dtype=torch.int64)
triton_poi_fused_add_233.run(primals_315, buf1159, 1, grid=grid(1), stream=stream0)
del primals_315
buf1160 = empty_strided((), (), device='cuda', dtype=torch.int64)
triton_poi_fused_add_233.run(primals_318, buf1160, 1, grid=grid(1), stream=stream0)
del primals_318
buf1161 = empty_strided((), (), device='cuda', dtype=torch.int64)
triton_poi_fused_add_233.run(primals_321, buf1161, 1, grid=grid(1), stream=stream0)
del primals_321
buf1162 = empty_strided((), (), device='cuda', dtype=torch.int64)
triton_poi_fused_add_233.run(primals_324, buf1162, 1, grid=grid(1), stream=stream0)
del primals_324
buf1163 = empty_strided((), (), device='cuda', dtype=torch.int64)
triton_poi_fused_add_233.run(primals_327, buf1163, 1, grid=grid(1), stream=stream0)
del primals_327
buf1164 = empty_strided((), (), device='cuda', dtype=torch.int64)
triton_poi_fused_add_233.run(primals_330, buf1164, 1, grid=grid(1), stream=stream0)
del primals_330
buf1165 = empty_strided((), (), device='cuda', dtype=torch.int64)
triton_poi_fused_add_233.run(primals_333, buf1165, 1, grid=grid(1), stream=stream0)
del primals_333
buf1166 = empty_strided((), (), device='cuda', dtype=torch.int64)
triton_poi_fused_add_233.run(primals_336, buf1166, 1, grid=grid(1), stream=stream0)
del primals_336
buf1167 = empty_strided((), (), device='cuda', dtype=torch.int64)
triton_poi_fused_add_233.run(primals_339, buf1167, 1, grid=grid(1), stream=stream0)
del primals_339
buf1168 = empty_strided((), (), device='cuda', dtype=torch.int64)
triton_poi_fused_add_233.run(primals_342, buf1168, 1, grid=grid(1), stream=stream0)
del primals_342
buf1169 = empty_strided((), (), device='cuda', dtype=torch.int64)
triton_poi_fused_add_233.run(primals_345, buf1169, 1, grid=grid(1), stream=stream0)
del primals_345
buf1170 = empty_strided((), (), device='cuda', dtype=torch.int64)
triton_poi_fused_add_233.run(primals_348, buf1170, 1, grid=grid(1), stream=stream0)
del primals_348
buf1171 = empty_strided((), (), device='cuda', dtype=torch.int64)
triton_poi_fused_add_233.run(primals_351, buf1171, 1, grid=grid(1), stream=stream0)
del primals_351
buf1172 = empty_strided((), (), device='cuda', dtype=torch.int64)
triton_poi_fused_add_233.run(primals_354, buf1172, 1, grid=grid(1), stream=stream0)
del primals_354
buf1173 = empty_strided((), (), device='cuda', dtype=torch.int64)
triton_poi_fused_add_233.run(primals_357, buf1173, 1, grid=grid(1), stream=stream0)
del primals_357
buf1174 = empty_strided((), (), device='cuda', dtype=torch.int64)
triton_poi_fused_add_233.run(primals_360, buf1174, 1, grid=grid(1), stream=stream0)
del primals_360
buf1175 = empty_strided((), (), device='cuda', dtype=torch.int64)
triton_poi_fused_add_233.run(primals_363, buf1175, 1, grid=grid(1), stream=stream0)
del primals_363
buf1176 = empty_strided((), (), device='cuda', dtype=torch.int64)
triton_poi_fused_add_233.run(primals_366, buf1176, 1, grid=grid(1), stream=stream0)
del primals_366
buf1177 = empty_strided((), (), device='cuda', dtype=torch.int64)
triton_poi_fused_add_233.run(primals_369, buf1177, 1, grid=grid(1), stream=stream0)
del primals_369
buf1178 = empty_strided((), (), device='cuda', dtype=torch.int64)
triton_poi_fused_add_233.run(primals_372, buf1178, 1, grid=grid(1), stream=stream0)
del primals_372
buf1179 = empty_strided((), (), device='cuda', dtype=torch.int64)
triton_poi_fused_add_233.run(primals_375, buf1179, 1, grid=grid(1), stream=stream0)
del primals_375
buf1180 = empty_strided((), (), device='cuda', dtype=torch.int64)
triton_poi_fused_add_233.run(primals_378, buf1180, 1, grid=grid(1), stream=stream0)
del primals_378
buf1181 = empty_strided((), (), device='cuda', dtype=torch.int64)
triton_poi_fused_add_233.run(primals_381, buf1181, 1, grid=grid(1), stream=stream0)
del primals_381
buf1182 = empty_strided((), (), device='cuda', dtype=torch.int64)
triton_poi_fused_add_233.run(primals_384, buf1182, 1, grid=grid(1), stream=stream0)
del primals_384
buf1183 = empty_strided((), (), device='cuda', dtype=torch.int64)
triton_poi_fused_add_233.run(primals_387, buf1183, 1, grid=grid(1), stream=stream0)
del primals_387
buf1184 = empty_strided((), (), device='cuda', dtype=torch.int64)
triton_poi_fused_add_233.run(primals_390, buf1184, 1, grid=grid(1), stream=stream0)
del primals_390
buf1185 = empty_strided((), (), device='cuda', dtype=torch.int64)
triton_poi_fused_add_233.run(primals_393, buf1185, 1, grid=grid(1), stream=stream0)
del primals_393
buf1186 = empty_strided((), (), device='cuda', dtype=torch.int64)
triton_poi_fused_add_233.run(primals_396, buf1186, 1, grid=grid(1), stream=stream0)
del primals_396
buf1187 = empty_strided((), (), device='cuda', dtype=torch.int64)
triton_poi_fused_add_233.run(primals_399, buf1187, 1, grid=grid(1), stream=stream0)
del primals_399
buf1188 = empty_strided((), (), device='cuda', dtype=torch.int64)
triton_poi_fused_add_233.run(primals_402, buf1188, 1, grid=grid(1), stream=stream0)
del primals_402
buf1189 = empty_strided((), (), device='cuda', dtype=torch.int64)
triton_poi_fused_add_233.run(primals_405, buf1189, 1, grid=grid(1), stream=stream0)
del primals_405
buf1190 = empty_strided((), (), device='cuda', dtype=torch.int64)
triton_poi_fused_add_233.run(primals_408, buf1190, 1, grid=grid(1), stream=stream0)
del primals_408
buf1191 = empty_strided((), (), device='cuda', dtype=torch.int64)
triton_poi_fused_add_233.run(primals_411, buf1191, 1, grid=grid(1), stream=stream0)
del primals_411
buf1192 = empty_strided((), (), device='cuda', dtype=torch.int64)
triton_poi_fused_add_233.run(primals_414, buf1192, 1, grid=grid(1), stream=stream0)
del primals_414
buf1193 = empty_strided((), (), device='cuda', dtype=torch.int64)
triton_poi_fused_add_233.run(primals_417, buf1193, 1, grid=grid(1), stream=stream0)
del primals_417
buf1194 = empty_strided((), (), device='cuda', dtype=torch.int64)
triton_poi_fused_add_233.run(primals_420, buf1194, 1, grid=grid(1), stream=stream0)
del primals_420
buf1195 = empty_strided((), (), device='cuda', dtype=torch.int64)
triton_poi_fused_add_233.run(primals_423, buf1195, 1, grid=grid(1), stream=stream0)
del primals_423
buf1196 = empty_strided((), (), device='cuda', dtype=torch.int64)
triton_poi_fused_add_233.run(primals_426, buf1196, 1, grid=grid(1), stream=stream0)
del primals_426
buf1197 = empty_strided((), (), device='cuda', dtype=torch.int64)
triton_poi_fused_add_233.run(primals_429, buf1197, 1, grid=grid(1), stream=stream0)
del primals_429
buf1198 = empty_strided((), (), device='cuda', dtype=torch.int64)
triton_poi_fused_add_233.run(primals_432, buf1198, 1, grid=grid(1), stream=stream0)
del primals_432
buf1199 = empty_strided((), (), device='cuda', dtype=torch.int64)
triton_poi_fused_add_233.run(primals_435, buf1199, 1, grid=grid(1), stream=stream0)
del primals_435
buf1200 = empty_strided((), (), device='cuda', dtype=torch.int64)
triton_poi_fused_add_233.run(primals_438, buf1200, 1, grid=grid(1), stream=stream0)
del primals_438
buf1201 = empty_strided((), (), device='cuda', dtype=torch.int64)
triton_poi_fused_add_233.run(primals_441, buf1201, 1, grid=grid(1), stream=stream0)
del primals_441
buf1202 = empty_strided((), (), device='cuda', dtype=torch.int64)
triton_poi_fused_add_233.run(primals_444, buf1202, 1, grid=grid(1), stream=stream0)
del primals_444
buf1203 = empty_strided((), (), device='cuda', dtype=torch.int64)
triton_poi_fused_add_233.run(primals_447, buf1203, 1, grid=grid(1), stream=stream0)
del primals_447
buf1204 = empty_strided((), (), device='cuda', dtype=torch.int64)
triton_poi_fused_add_233.run(primals_450, buf1204, 1, grid=grid(1), stream=stream0)
del primals_450
buf1205 = empty_strided((), (), device='cuda', dtype=torch.int64)
triton_poi_fused_add_233.run(primals_453, buf1205, 1, grid=grid(1), stream=stream0)
del primals_453
buf1206 = empty_strided((), (), device='cuda', dtype=torch.int64)
triton_poi_fused_add_233.run(primals_456, buf1206, 1, grid=grid(1), stream=stream0)
del primals_456
buf1207 = empty_strided((), (), device='cuda', dtype=torch.int64)
triton_poi_fused_add_233.run(primals_459, buf1207, 1, grid=grid(1), stream=stream0)
del primals_459
buf1208 = empty_strided((), (), device='cuda', dtype=torch.int64)
triton_poi_fused_add_233.run(primals_462, buf1208, 1, grid=grid(1), stream=stream0)
del primals_462
buf1209 = empty_strided((), (), device='cuda', dtype=torch.int64)
triton_poi_fused_add_233.run(primals_465, buf1209, 1, grid=grid(1), stream=stream0)
del primals_465
buf1210 = empty_strided((), (), device='cuda', dtype=torch.int64)
triton_poi_fused_add_233.run(primals_468, buf1210, 1, grid=grid(1), stream=stream0)
del primals_468
buf1211 = empty_strided((), (), device='cuda', dtype=torch.int64)
triton_poi_fused_add_233.run(primals_471, buf1211, 1, grid=grid(1), stream=stream0)
del primals_471
buf1212 = empty_strided((), (), device='cuda', dtype=torch.int64)
triton_poi_fused_add_233.run(primals_474, buf1212, 1, grid=grid(1), stream=stream0)
del primals_474
buf1213 = empty_strided((), (), device='cuda', dtype=torch.int64)
triton_poi_fused_add_233.run(primals_477, buf1213, 1, grid=grid(1), stream=stream0)
del primals_477
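# The return tuple appears to interleave, for each batch-norm layer, the
# incremented num_batches_tracked scalar with the updated running mean/var
# buffers, then yields the logits (buf1118), followed by the parameters and
# intermediate activations that the autograd backward graph will need.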
return (buf1156, buf9, buf10, buf1157, buf20, buf21, buf1158, buf31, buf32, buf1159, buf49, buf50, buf1160, buf68, buf69, buf1161, buf84, buf85, buf1162, buf100, buf101, buf1163, buf111, buf112, buf1164, buf127, buf128, buf1165, buf136, buf137, buf1166, buf160, buf161, buf1167, buf183, buf184, buf1168, buf197, buf198, buf1169, buf213, buf214, buf1170, buf243, buf244, buf1171, buf257, buf258, buf1172, buf273, buf274, buf1173, buf303, buf304, buf1174, buf317, buf318, buf1175, buf333, buf334, buf1176, buf363, buf364, buf1177, buf372, buf373, buf1178, buf392, buf393, buf1179, buf415, buf416, buf1180, buf429, buf430, buf1181, buf453, buf454, buf1182, buf483, buf484, buf1183, buf497, buf498, buf1184, buf521, buf522, buf1185, buf551, buf552, buf1186, buf565, buf566, buf1187, buf589, buf590, buf1188, buf619, buf620, buf1189, buf628, buf629, buf1190, buf638, buf639, buf1191, buf661, buf662, buf1192, buf675, buf676, buf1193, buf699, buf700, buf1194, buf729, buf730, buf1195, buf743, buf744, buf1196, buf767, buf768, buf1197, buf797, buf798, buf1198, buf811, buf812, buf1199, buf835, buf836, buf1200, buf865, buf866, buf1201, buf874, buf875, buf1202, buf898, buf899, buf1203, buf919, buf920, buf1204, buf928, buf929, buf1205, buf952, buf953, buf1206, buf980, buf981, buf1207, buf989, buf990, buf1208, buf1013, buf1014, buf1209, buf1041, buf1042, buf1210, buf1050, buf1051, buf1211, buf1074, buf1075, buf1212, buf1102, buf1103, buf1213, buf1111, buf1112, buf1118, primals_1, primals_3, primals_5, primals_7, primals_9, primals_11, primals_13, primals_15, primals_17, primals_19, primals_21, primals_23, primals_25, primals_27, primals_29, primals_31, primals_33, primals_35, primals_37, primals_39, primals_41, primals_43, primals_45, primals_47, primals_49, primals_51, primals_53, primals_55, primals_57, primals_59, primals_61, primals_63, primals_65, primals_67, primals_69, primals_71, primals_73, primals_75, primals_77, primals_79, primals_81, primals_83, primals_85, primals_87, primals_89, primals_91, primals_93, primals_95, primals_97, primals_99, primals_101, primals_103, primals_105, primals_107, primals_109, primals_111, primals_113, primals_115, buf0, buf1, buf2, buf8, buf11, buf12, buf13, buf19, buf22, buf23, buf24, buf30, buf34, buf35, buf36, buf38, buf42, buf48, buf52, as_strided(buf51, (128, 64, 112, 112), (2408448, 12544, 112, 1)), buf54, as_strided(buf51, (128, 64, 112, 112), (2408448, 12544, 112, 1), 802816), buf56, as_strided(buf51, (128, 64, 112, 112), (2408448, 12544, 112, 1), 1605632), buf61, buf67, buf71, as_strided(buf70, (128, 96, 56, 56), (602112, 3136, 56, 1)), buf73, as_strided(buf70, (128, 96, 56, 56), (602112, 3136, 56, 1), 301056), buf77, buf83, as_strided(buf86, (128, 20, 56, 56), (125440, 3136, 56, 1)), as_strided(buf86, (128, 20, 56, 56), (125440, 3136, 56, 1), 62720), buf87, buf89, buf93, buf99, buf102, buf103, buf104, buf110, buf114, as_strided(buf113, (128, 60, 56, 56), (376320, 3136, 56, 1)), buf116, as_strided(buf113, (128, 60, 56, 56), (376320, 3136, 56, 1), 188160), buf120, buf126, buf129, buf130, buf131, buf135, buf139, buf140, buf142, buf143, buf145, buf146, buf148, buf149, buf155, buf159, buf162, buf164, buf165, buf168, buf169, buf170, buf173, buf174, buf175, buf176, buf182, as_strided(buf185, (128, 28, 28, 28), (43904, 784, 28, 1)), as_strided(buf185, (128, 28, 28, 28), (43904, 784, 28, 1), 21952), buf186, buf188, buf192, buf196, buf200, buf201, buf203, buf204, buf208, buf212, buf215, buf217, buf218, buf221, buf222, buf223, buf226, buf228, buf229, buf230, buf232, buf236, 
buf242, as_strided(buf245, (128, 28, 28, 28), (43904, 784, 28, 1)), as_strided(buf245, (128, 28, 28, 28), (43904, 784, 28, 1), 21952), buf246, buf248, buf252, buf256, buf260, buf261, buf263, buf264, buf268, buf272, buf275, buf277, buf278, buf281, buf282, buf283, buf286, buf288, buf289, buf290, buf292, buf296, buf302, as_strided(buf305, (128, 28, 28, 28), (43904, 784, 28, 1)), as_strided(buf305, (128, 28, 28, 28), (43904, 784, 28, 1), 21952), buf306, buf308, buf312, buf316, buf320, buf321, buf323, buf324, buf328, buf332, buf335, buf337, buf338, buf341, buf342, buf343, buf346, buf348, buf349, buf350, buf352, buf356, buf362, buf365, buf366, buf367, buf371, buf375, buf376, buf378, buf379, buf381, buf382, buf387, buf391, buf394, buf396, buf397, buf400, buf401, buf402, buf405, buf406, buf407, buf408, buf414, as_strided(buf417, (128, 52, 14, 14), (20384, 196, 14, 1)), as_strided(buf417, (128, 52, 14, 14), (20384, 196, 14, 1), 10192), buf418, buf420, buf424, buf428, buf432, buf433, buf435, buf436, buf438, buf439, buf441, buf442, buf448, buf452, buf455, buf457, buf458, buf461, buf462, buf463, buf466, buf468, buf469, buf470, buf472, buf476, buf482, as_strided(buf485, (128, 52, 14, 14), (20384, 196, 14, 1)), as_strided(buf485, (128, 52, 14, 14), (20384, 196, 14, 1), 10192), buf486, buf488, buf492, buf496, buf500, buf501, buf503, buf504, buf506, buf507, buf509, buf510, buf516, buf520, buf523, buf525, buf526, buf529, buf530, buf531, buf534, buf536, buf537, buf538, buf540, buf544, buf550, as_strided(buf553, (128, 52, 14, 14), (20384, 196, 14, 1)), as_strided(buf553, (128, 52, 14, 14), (20384, 196, 14, 1), 10192), buf554, buf556, buf560, buf564, buf568, buf569, buf571, buf572, buf574, buf575, buf577, buf578, buf584, buf588, buf591, buf593, buf594, buf597, buf598, buf599, buf602, buf604, buf605, buf606, buf608, buf612, buf618, buf621, buf622, buf623, buf627, buf631, buf632, buf633, buf637, buf640, buf642, buf643, buf646, buf647, buf648, buf651, buf652, buf653, buf654, buf660, as_strided(buf663, (128, 80, 14, 14), (31360, 196, 14, 1)), as_strided(buf663, (128, 80, 14, 14), (31360, 196, 14, 1), 15680), buf664, buf666, buf670, buf674, buf678, buf679, buf681, buf682, buf684, buf685, buf687, buf688, buf694, buf698, buf701, buf703, buf704, buf707, buf708, buf709, buf712, buf714, buf715, buf716, buf718, buf722, buf728, as_strided(buf731, (128, 80, 14, 14), (31360, 196, 14, 1)), as_strided(buf731, (128, 80, 14, 14), (31360, 196, 14, 1), 15680), buf732, buf734, buf738, buf742, buf746, buf747, buf749, buf750, buf752, buf753, buf755, buf756, buf762, buf766, buf769, buf771, buf772, buf775, buf776, buf777, buf780, buf782, buf783, buf784, buf786, buf790, buf796, as_strided(buf799, (128, 80, 14, 14), (31360, 196, 14, 1)), as_strided(buf799, (128, 80, 14, 14), (31360, 196, 14, 1), 15680), buf800, buf802, buf806, buf810, buf814, buf815, buf817, buf818, buf820, buf821, buf823, buf824, buf830, buf834, buf837, buf839, buf840, buf843, buf844, buf845, buf848, buf850, buf851, buf852, buf854, buf858, buf864, buf867, buf868, buf869, buf873, buf877, buf878, buf880, buf881, buf883, buf884, buf886, buf887, buf893, buf897, buf900, buf902, buf903, buf906, buf907, buf908, buf911, buf912, buf913, buf914, buf918, buf921, buf922, buf923, buf927, buf931, buf932, buf934, buf935, buf937, buf938, buf940, buf941, buf947, buf951, buf954, buf956, buf957, buf960, buf961, buf962, buf965, buf967, buf968, buf969, buf971, buf975, buf979, buf982, buf983, buf984, buf988, buf992, buf993, buf995, buf996, buf998, buf999, buf1001, buf1002, buf1008, 
buf1012, buf1015, buf1017, buf1018, buf1021, buf1022, buf1023, buf1026, buf1028, buf1029, buf1030, buf1032, buf1036, buf1040, buf1043, buf1044, buf1045, buf1049, buf1053, buf1054, buf1056, buf1057, buf1059, buf1060, buf1062, buf1063, buf1069, buf1073, buf1076, buf1078, buf1079, buf1082, buf1083, buf1084, buf1087, buf1089, buf1090, buf1091, buf1093, buf1097, buf1101, buf1104, buf1105, buf1106, buf1110, buf1115, buf1119, buf1120, as_strided(buf1108, (1, 1536, 1, 1), (1536, 1, 1, 1)), as_strided(buf1099, (1, 264, 1, 1), (264, 1, 1, 1)), as_strided(buf1071, (1, 1584, 1, 1), (1584, 1, 1, 1)), buf1122, as_strided(buf1047, (1, 1584, 1, 1), (1584, 1, 1, 1)), as_strided(buf1038, (1, 264, 1, 1), (264, 1, 1, 1)), as_strided(buf1010, (1, 1584, 1, 1), (1584, 1, 1, 1)), buf1124, as_strided(buf986, (1, 1584, 1, 1), (1584, 1, 1, 1)), as_strided(buf977, (1, 264, 1, 1), (264, 1, 1, 1)), as_strided(buf949, (1, 1584, 1, 1), (1584, 1, 1, 1)), buf1126, as_strided(buf925, (1, 1584, 1, 1), (1584, 1, 1, 1)), as_strided(buf916, (1, 264, 1, 1), (264, 1, 1, 1)), as_strided(buf895, (1, 960, 1, 1), (960, 1, 1, 1)), buf1128, as_strided(buf871, (1, 960, 1, 1), (960, 1, 1, 1)), as_strided(buf861, (1, 160, 1, 1), (160, 1, 1, 1)), as_strided(buf832, (1, 480, 1, 1), (480, 1, 1, 1)), buf1130, as_strided(buf808, (1, 480, 1, 1), (480, 1, 1, 1)), as_strided(buf793, (1, 160, 1, 1), (160, 1, 1, 1)), as_strided(buf764, (1, 480, 1, 1), (480, 1, 1, 1)), buf1132, as_strided(buf740, (1, 480, 1, 1), (480, 1, 1, 1)), as_strided(buf725, (1, 160, 1, 1), (160, 1, 1, 1)), as_strided(buf696, (1, 480, 1, 1), (480, 1, 1, 1)), buf1134, as_strided(buf672, (1, 480, 1, 1), (480, 1, 1, 1)), as_strided(buf657, (1, 160, 1, 1), (160, 1, 1, 1)), as_strided(buf635, (1, 624, 1, 1), (624, 1, 1, 1)), buf1136, as_strided(buf625, (1, 624, 1, 1), (624, 1, 1, 1)), as_strided(buf615, (1, 104, 1, 1), (104, 1, 1, 1)), as_strided(buf586, (1, 624, 1, 1), (624, 1, 1, 1)), buf1138, as_strided(buf562, (1, 624, 1, 1), (624, 1, 1, 1)), as_strided(buf547, (1, 104, 1, 1), (104, 1, 1, 1)), as_strided(buf518, (1, 624, 1, 1), (624, 1, 1, 1)), buf1140, as_strided(buf494, (1, 624, 1, 1), (624, 1, 1, 1)), as_strided(buf479, (1, 104, 1, 1), (104, 1, 1, 1)), as_strided(buf450, (1, 624, 1, 1), (624, 1, 1, 1)), buf1142, as_strided(buf426, (1, 624, 1, 1), (624, 1, 1, 1)), as_strided(buf411, (1, 104, 1, 1), (104, 1, 1, 1)), as_strided(buf389, (1, 336, 1, 1), (336, 1, 1, 1)), buf1144, as_strided(buf369, (1, 336, 1, 1), (336, 1, 1, 1)), as_strided(buf359, (1, 56, 1, 1), (56, 1, 1, 1)), as_strided(buf330, (1, 336, 1, 1), (336, 1, 1, 1)), buf1146, as_strided(buf314, (1, 336, 1, 1), (336, 1, 1, 1)), as_strided(buf299, (1, 56, 1, 1), (56, 1, 1, 1)), as_strided(buf270, (1, 336, 1, 1), (336, 1, 1, 1)), buf1148, as_strided(buf254, (1, 336, 1, 1), (336, 1, 1, 1)), as_strided(buf239, (1, 56, 1, 1), (56, 1, 1, 1)), as_strided(buf210, (1, 336, 1, 1), (336, 1, 1, 1)), buf1150, as_strided(buf194, (1, 336, 1, 1), (336, 1, 1, 1)), as_strided(buf179, (1, 56, 1, 1), (56, 1, 1, 1)), as_strided(buf157, (1, 240, 1, 1), (240, 1, 1, 1)), buf1152, as_strided(buf133, (1, 240, 1, 1), (240, 1, 1, 1)), as_strided(buf123, (1, 40, 1, 1), (40, 1, 1, 1)), buf1153, as_strided(buf107, (1, 120, 1, 1), (120, 1, 1, 1)), as_strided(buf96, (1, 120, 1, 1), (120, 1, 1, 1)), as_strided(buf80, (1, 40, 1, 1), (40, 1, 1, 1)), buf1154, as_strided(buf64, (1, 192, 1, 1), (192, 1, 1, 1)), buf1155, as_strided(buf45, (1, 192, 1, 1), (192, 1, 1, 1)), as_strided(buf27, (1, 32, 1, 1), (32, 1, 1, 1)), as_strided(buf16, (1, 32, 1, 1), (32, 
1, 1, 1)), as_strided(buf5, (1, 32, 1, 1), (32, 1, 1, 1)), )
def benchmark_compiled_module(times=10, repeat=10):
from torch._dynamo.testing import rand_strided
from torch._inductor.utils import print_performance
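# rand_strided allocates a tensor with the given shape/stride/dtype and fills
# it with random data; the benchmark rebuilds every parameter and input this
# way so the compiled module can be timed without real checkpoint data.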
primals_1 = rand_strided((32, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_2 = rand_strided((32, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_3 = rand_strided((32, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_4 = rand_strided((32, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_5 = rand_strided((32, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_6 = rand_strided((32, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_7 = rand_strided((192, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_8 = rand_strided((192, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_9 = rand_strided((192, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_10 = rand_strided((192, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_11 = rand_strided((40, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_12 = rand_strided((40, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_13 = rand_strided((120, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_14 = rand_strided((120, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_15 = rand_strided((120, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_16 = rand_strided((120, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_17 = rand_strided((40, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_18 = rand_strided((40, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_19 = rand_strided((240, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_20 = rand_strided((240, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_21 = rand_strided((240, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_22 = rand_strided((240, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_23 = rand_strided((56, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_24 = rand_strided((56, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_25 = rand_strided((336, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_26 = rand_strided((336, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_27 = rand_strided((336, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_28 = rand_strided((336, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_29 = rand_strided((56, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_30 = rand_strided((56, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_31 = rand_strided((336, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_32 = rand_strided((336, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_33 = rand_strided((336, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_34 = rand_strided((336, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_35 = rand_strided((56, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_36 = rand_strided((56, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_37 = rand_strided((336, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_38 = rand_strided((336, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_39 = rand_strided((336, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_40 = rand_strided((336, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_41 = rand_strided((56, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_42 = rand_strided((56, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_43 = rand_strided((336, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_44 = rand_strided((336, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_45 = rand_strided((336, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_46 = rand_strided((336, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_47 = rand_strided((104, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_48 = rand_strided((104, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_49 = rand_strided((624, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_50 = rand_strided((624, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_51 = rand_strided((624, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_52 = rand_strided((624, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_53 = rand_strided((104, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_54 = rand_strided((104, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_55 = rand_strided((624, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_56 = rand_strided((624, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_57 = rand_strided((624, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_58 = rand_strided((624, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_59 = rand_strided((104, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_60 = rand_strided((104, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_61 = rand_strided((624, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_62 = rand_strided((624, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_63 = rand_strided((624, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_64 = rand_strided((624, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_65 = rand_strided((104, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_66 = rand_strided((104, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_67 = rand_strided((624, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_68 = rand_strided((624, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_69 = rand_strided((624, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_70 = rand_strided((624, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_71 = rand_strided((160, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_72 = rand_strided((160, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_73 = rand_strided((480, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_74 = rand_strided((480, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_75 = rand_strided((480, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_76 = rand_strided((480, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_77 = rand_strided((160, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_78 = rand_strided((160, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_79 = rand_strided((480, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_80 = rand_strided((480, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_81 = rand_strided((480, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_82 = rand_strided((480, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_83 = rand_strided((160, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_84 = rand_strided((160, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_85 = rand_strided((480, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_86 = rand_strided((480, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_87 = rand_strided((480, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_88 = rand_strided((480, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_89 = rand_strided((160, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_90 = rand_strided((160, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_91 = rand_strided((960, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_92 = rand_strided((960, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_93 = rand_strided((960, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_94 = rand_strided((960, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_95 = rand_strided((264, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_96 = rand_strided((264, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_97 = rand_strided((1584, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_98 = rand_strided((1584, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_99 = rand_strided((1584, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_100 = rand_strided((1584, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_101 = rand_strided((264, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_102 = rand_strided((264, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_103 = rand_strided((1584, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_104 = rand_strided((1584, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_105 = rand_strided((1584, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_106 = rand_strided((1584, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_107 = rand_strided((264, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_108 = rand_strided((264, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_109 = rand_strided((1584, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_110 = rand_strided((1584, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_111 = rand_strided((1584, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_112 = rand_strided((1584, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_113 = rand_strided((264, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_114 = rand_strided((264, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_115 = rand_strided((1536, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_116 = rand_strided((1536, ), (1, ), device='cuda:0', dtype=torch.float32)
primals_117 = rand_strided((32, 3, 3, 3), (27, 9, 3, 1), device='cuda:0', dtype=torch.float32)
primals_118 = rand_strided((32, 1, 3, 3), (9, 9, 3, 1), device='cuda:0', dtype=torch.float32)
primals_119 = rand_strided((32, 32, 1, 1), (32, 1, 1, 1), device='cuda:0', dtype=torch.float32)
primals_120 = rand_strided((96, 16, 1, 1), (16, 1, 1, 1), device='cuda:0', dtype=torch.float32)
primals_121 = rand_strided((96, 16, 1, 1), (16, 1, 1, 1), device='cuda:0', dtype=torch.float32)
primals_122 = rand_strided((64, 1, 3, 3), (9, 9, 3, 1), device='cuda:0', dtype=torch.float32)
primals_123 = rand_strided((64, 1, 5, 5), (25, 25, 5, 1), device='cuda:0', dtype=torch.float32)