Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Select an option

  • Save shunting314/c2a4d8a28b00fcb5586d0e9d9bf77f9f to your computer and use it in GitHub Desktop.

Select an option

Save shunting314/c2a4d8a28b00fcb5586d0e9d9bf77f9f to your computer and use it in GitHub Desktop.
This file has been truncated, but you can view the full file.
from ctypes import c_void_p, c_long
import torch
import math
import random
import os
import tempfile
from torch._inductor.utils import maybe_profile
from torch import empty_strided, as_strided, device
from torch._inductor.codecache import AsyncCompile
from torch._inductor.select_algorithm import extern_kernels
# Shorthand for the ATen operator namespace.
aten = torch.ops.aten
# Size/stride assertion helper taken from the dynamo guards C module.
assert_size_stride = torch._C._dynamo.guards.assert_size_stride
# AsyncCompile instance that the `async_compile.triton(...)` kernel definitions in this file go through.
async_compile = AsyncCompile()
import triton
import triton.language as tl
from torch._inductor.triton_heuristics import grid, start_graph, end_graph
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
# kernel path: /tmp/torchinductor_shunting/cd/ccd7accawcy3jndrehd4geosuzoh6rxkzjlqnojxb5msgrcv7udi.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type
#
# Pointwise copy over xnumel = 864 elements (benchmark shape (32, 3, 3, 3)).
# Each program loads a masked XBLOCK-wide slice from in_ptr0 (declared '*fp32')
# and stores it to out_ptr0 (declared '*fp16').
# NOTE(review): the explicit cast targets tl.float32, so the narrowing to fp16
# presumably happens implicitly inside tl.store -- confirm against Triton docs.
# The embedded module also defines a standalone benchmark harness
# (get_args / call / benchmark_all_configs / __main__).
triton_poi_fused__to_copy_0 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[1024], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_0(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 864
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
    arg_0 = rand_strided((32, 3, 3, 3), (27, 9, 3, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((32, 3, 3, 3), (27, 9, 3, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_0.run(*args, 864, grid=grid(864), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_0.benchmark_all_configs(*args, 864, grid=grid(864))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/4z/c4zfgpaaduweifmtujob2ufcjjkrkqa7mb5j2qaaysmstief35xb.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_1
#
# Pointwise copy over xnumel = 19267584 elements (benchmark shape
# (128, 3, 224, 224)) from in_ptr0 (declared '*fp32') to out_ptr0 (declared
# '*fp16'). The load and store pass None as the mask; xmask is computed but
# not used.
# NOTE(review): the explicit cast targets tl.float32, so the narrowing to fp16
# presumably happens implicitly inside tl.store -- confirm against Triton docs.
# The embedded module also defines a standalone benchmark harness
# (get_args / call / benchmark_all_configs / __main__).
triton_poi_fused__to_copy_1 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[33554432], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_1(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 19267584
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), None)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, None)
def get_args():
    arg_0 = rand_strided((128, 3, 224, 224), (150528, 50176, 224, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((128, 3, 224, 224), (150528, 50176, 224, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_1.run(*args, 19267584, grid=grid(19267584), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_1.benchmark_all_configs(*args, 19267584, grid=grid(19267584))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/4v/c4vmg7ssybcn2qvzgu3tqbkt62uopduyelvkcwxaqv2fc4glkb57.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => convert_element_type_2, var_mean
#
# Looped sum reduction: each of the xnumel = 448 x indices accumulates
# rnumel = 114688 inputs from in_ptr0 (declared '*fp16', upcast to fp32) in
# RBLOCK-wide steps and writes one fp32 sum to out_ptr0.
# x0 = xindex % 32 picks the 12544-element plane offset and x1 = xindex // 32
# picks which 114688-element slice is reduced; with the benchmark shapes this
# is 14 partial sums for each of 32 channels of a (128, 32, 112, 112) tensor.
# The embedded module also defines a standalone benchmark harness
# (get_args / call / benchmark_all_configs / __main__).
triton_red_fused__native_batch_norm_legit_functional_2 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@reduction(
    size_hints=[512, 131072],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32', 3: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_2(in_ptr0, out_ptr0, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
    xnumel = 448
    rnumel = 114688
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rbase = tl.arange(0, RBLOCK)[None, :]
    x0 = xindex % 32
    x1 = (xindex // 32)
    _tmp2 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
    x3 = xindex
    for roffset in range(0, rnumel, RBLOCK):
        rindex = roffset + rbase
        rmask = rindex < rnumel
        r2 = rindex
        tmp0 = tl.load(in_ptr0 + ((112*(((r2 + (114688*x1)) // 112) % 112)) + (12544*x0) + (401408*((r2 + (114688*x1)) // 12544)) + (r2 % 112)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp1 = tmp0.to(tl.float32)
        _tmp2 = tl.where(rmask & xmask, _tmp2 + tmp1, _tmp2)
    tmp2 = tl.sum(_tmp2, 1)[:, None]
    tl.store(out_ptr0 + x3, tmp2, xmask)
def get_args():
    arg_0 = rand_strided((128, 32, 112, 112), (401408, 12544, 112, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((1, 32, 1, 1, 14), (448, 1, 448, 448, 32), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_red_fused__native_batch_norm_legit_functional_2.run(*args, 448, 114688, grid=grid(448), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_red_fused__native_batch_norm_legit_functional_2.benchmark_all_configs(*args, 448, 114688, grid=grid(448))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/ax/caxxuenolffrdcs4fes5eiayfgrbnuyzkml277uyriuhejqsq7k4.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => add_2, convert_element_type_2, mul_1, mul_2, var_mean
#
# Persistent reduction over xnumel = 32 rows: sums rnumel = 14 values per row
# from in_ptr0 (RBLOCK = 16, masked), divides the sum by 1605632.0 and stores
# the quotient to in_out_ptr0. It also stores
# 0.1 * quotient + 0.9 * in_ptr1[x0] to out_ptr0.
# NOTE(review): 1605632 == 128 * 112 * 112, so this is presumably the per-channel
# mean plus a running-mean update with momentum 0.1 -- confirm against the graph.
# The embedded module also defines a standalone benchmark harness
# (get_args / call / benchmark_all_configs / __main__).
triton_per_fused__native_batch_norm_legit_functional_3 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import persistent_reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@persistent_reduction(
    size_hints=[32, 16],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*fp32', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: 'i32', 5: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4), equal_to_1=())]}
)
@triton.jit
def triton_per_fused__native_batch_norm_legit_functional_3(in_out_ptr0, in_ptr0, in_ptr1, out_ptr0, xnumel, rnumel, XBLOCK : tl.constexpr):
    xnumel = 32
    rnumel = 14
    RBLOCK: tl.constexpr = 16
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rindex = tl.arange(0, RBLOCK)[None, :]
    rmask = rindex < rnumel
    r1 = rindex
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0 + (32*r1)), rmask & xmask, other=0)
    tmp8 = tl.load(in_ptr1 + (x0), xmask)
    tmp2 = tl.where(rmask & xmask, tmp0, 0)
    tmp3 = tl.sum(tmp2, 1)[:, None]
    tmp4 = 1605632.0
    tmp5 = tmp3 / tmp4
    tmp6 = 0.1
    tmp7 = tmp5 * tmp6
    tmp9 = 0.9
    tmp10 = tmp8 * tmp9
    tmp11 = tmp7 + tmp10
    tl.store(in_out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp5, xmask)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp11, xmask)
def get_args():
    arg_0 = rand_strided((1, 32, 1, 1), (32, 1, 32, 32), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((1, 32, 1, 1, 14), (448, 1, 448, 448, 32), device='cuda:0', dtype=torch.float32)
    arg_2 = rand_strided((32,), (1,), device='cuda:0', dtype=torch.float32)
    arg_3 = rand_strided((32,), (1,), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1, arg_2, arg_3,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_per_fused__native_batch_norm_legit_functional_3.run(*args, 32, 14, grid=grid(32), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_per_fused__native_batch_norm_legit_functional_3.benchmark_all_configs(*args, 32, 14, grid=grid(32))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/4s/c4sugoluux54xogsh3dgakggor26tpyllwcenrcu3hdxyyol5rhg.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => convert_element_type_2, var_mean
#
# Looped reduction of squared deviations: each of the xnumel = 448 x indices
# accumulates (in_ptr0[...] - in_ptr1[x0]) ** 2 over rnumel = 114688 inputs
# (in_ptr0 declared '*fp16', upcast to fp32) in RBLOCK-wide steps and writes
# one fp32 sum to out_ptr0. in_ptr1[x0] is loaded once, before the loop.
# x0 = xindex % 32 and x1 = xindex // 32 index the input exactly as in the
# plain-sum kernel; with the benchmark shapes this is 14 partials per channel.
# The embedded module also defines a standalone benchmark harness
# (get_args / call / benchmark_all_configs / __main__).
triton_red_fused__native_batch_norm_legit_functional_4 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@reduction(
    size_hints=[512, 131072],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*fp16', 1: '*fp32', 2: '*fp32', 3: 'i32', 4: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_4(in_ptr0, in_ptr1, out_ptr0, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
    xnumel = 448
    rnumel = 114688
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rbase = tl.arange(0, RBLOCK)[None, :]
    x0 = xindex % 32
    x1 = (xindex // 32)
    tmp2 = tl.load(in_ptr1 + (x0), xmask)
    _tmp5 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
    x3 = xindex
    for roffset in range(0, rnumel, RBLOCK):
        rindex = roffset + rbase
        rmask = rindex < rnumel
        r2 = rindex
        tmp0 = tl.load(in_ptr0 + ((112*(((r2 + (114688*x1)) // 112) % 112)) + (12544*x0) + (401408*((r2 + (114688*x1)) // 12544)) + (r2 % 112)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp1 = tmp0.to(tl.float32)
        tmp3 = tmp1 - tmp2
        tmp4 = tmp3 * tmp3
        _tmp5 = tl.where(rmask & xmask, _tmp5 + tmp4, _tmp5)
    tmp5 = tl.sum(_tmp5, 1)[:, None]
    tl.store(out_ptr0 + x3, tmp5, xmask)
def get_args():
    arg_0 = rand_strided((128, 32, 112, 112), (401408, 12544, 112, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((1, 32, 1, 1), (32, 1, 32, 32), device='cuda:0', dtype=torch.float32)
    arg_2 = rand_strided((1, 32, 1, 1, 14), (448, 1, 448, 448, 32), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1, arg_2,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_red_fused__native_batch_norm_legit_functional_4.run(*args, 448, 114688, grid=grid(448), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_red_fused__native_batch_norm_legit_functional_4.benchmark_all_configs(*args, 448, 114688, grid=grid(448))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/yz/cyzfzxq5m7mzh26evqlnxbs2f4624mxxzlbzx5v335w7yujg6hp5.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => add_1, add_3, convert_element_type_2, mul_3, mul_4, mul_5, rsqrt, squeeze_1, var_mean
#
# Persistent reduction over xnumel = 32 rows: sums rnumel = 14 values per row
# from in_ptr0 (RBLOCK = 16, masked) and stores three results per row:
#   out_ptr0 <- the raw sum
#   out_ptr1 <- rsqrt(sum / 1605632.0 + 1e-05)
#   out_ptr2 <- (sum / 1605632.0) * 1.0000006228081046 * 0.1 + in_ptr1[x0] * 0.9
# NOTE(review): 1.0000006228081046 looks like 1605632 / 1605631, i.e. an
# unbiased-variance correction feeding a running-variance update with momentum
# 0.1 -- presumably; confirm against the graph.
# The embedded module also defines a standalone benchmark harness
# (get_args / call / benchmark_all_configs / __main__).
triton_per_fused__native_batch_norm_legit_functional_5 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import persistent_reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@persistent_reduction(
    size_hints=[32, 16],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*fp32', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: 'i32', 6: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5), equal_to_1=())]}
)
@triton.jit
def triton_per_fused__native_batch_norm_legit_functional_5(in_ptr0, in_ptr1, out_ptr0, out_ptr1, out_ptr2, xnumel, rnumel, XBLOCK : tl.constexpr):
    xnumel = 32
    rnumel = 14
    RBLOCK: tl.constexpr = 16
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rindex = tl.arange(0, RBLOCK)[None, :]
    rmask = rindex < rnumel
    r1 = rindex
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0 + (32*r1)), rmask & xmask, other=0)
    tmp13 = tl.load(in_ptr1 + (x0), xmask)
    tmp2 = tl.where(rmask & xmask, tmp0, 0)
    tmp3 = tl.sum(tmp2, 1)[:, None]
    tmp4 = 1605632.0
    tmp5 = tmp3 / tmp4
    tmp6 = 1e-05
    tmp7 = tmp5 + tmp6
    tmp8 = tl.math.rsqrt(tmp7)
    tmp9 = 1.0000006228081046
    tmp10 = tmp5 * tmp9
    tmp11 = 0.1
    tmp12 = tmp10 * tmp11
    tmp14 = 0.9
    tmp15 = tmp13 * tmp14
    tmp16 = tmp12 + tmp15
    tl.store(out_ptr1 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp8, xmask)
    tl.store(out_ptr2 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp16, xmask)
    tl.store(out_ptr0 + x0, tmp3, xmask)
def get_args():
    arg_0 = rand_strided((1, 32, 1, 1, 14), (448, 1, 448, 448, 32), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((32,), (1,), device='cuda:0', dtype=torch.float32)
    arg_2 = rand_strided((1, 32, 1, 1), (32, 1, 32, 32), device='cuda:0', dtype=torch.float32)
    arg_3 = rand_strided((32,), (1,), device='cuda:0', dtype=torch.float32)
    arg_4 = rand_strided((32,), (1,), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1, arg_2, arg_3, arg_4,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_per_fused__native_batch_norm_legit_functional_5.run(*args, 32, 14, grid=grid(32), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_per_fused__native_batch_norm_legit_functional_5.benchmark_all_configs(*args, 32, 14, grid=grid(32))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/tp/ctp3rh47jmesldsykk2bkq62s2n65nizu7jrip6s664llvntz6jk.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.relu
# aten._native_batch_norm_legit_functional => add_1, add_4, convert_element_type_2, convert_element_type_3, mul, mul_6, rsqrt, sub, var_mean
# aten.relu => relu
#
# Pointwise kernel over xnumel = 51380224 elements. With
# x1 = (xindex // 12544) % 32 selecting the per-channel operands, it computes
#   (in_ptr0[x3] - in_ptr1[x1]) * rsqrt(in_ptr2[x1] / 1605632.0 + 1e-05)
#       * in_ptr3[x1] + in_ptr4[x1]
# then replaces negative values with 0 (the nested tl.where) and stores the
# result to out_ptr0 (declared '*fp16'). All loads and the store pass None as
# the mask; xmask is computed but not used.
# The embedded module also defines a standalone benchmark harness
# (get_args / call / benchmark_all_configs / __main__).
triton_poi_fused__native_batch_norm_legit_functional_relu_6 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[67108864], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp16', 6: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6), equal_to_1=())]})
@triton.jit
def triton_poi_fused__native_batch_norm_legit_functional_relu_6(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 51380224
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x3 = xindex
    x1 = (xindex // 12544) % 32
    tmp0 = tl.load(in_ptr0 + (x3), None).to(tl.float32)
    tmp2 = tl.load(in_ptr1 + (x1), None)
    tmp4 = tl.load(in_ptr2 + (x1), None)
    tmp11 = tl.load(in_ptr3 + (x1), None)
    tmp13 = tl.load(in_ptr4 + (x1), None)
    tmp1 = tmp0.to(tl.float32)
    tmp3 = tmp1 - tmp2
    tmp5 = 1605632.0
    tmp6 = tmp4 / tmp5
    tmp7 = 1e-05
    tmp8 = tmp6 + tmp7
    tmp9 = tl.math.rsqrt(tmp8)
    tmp10 = tmp3 * tmp9
    tmp12 = tmp10 * tmp11
    tmp14 = tmp12 + tmp13
    tmp15 = tmp14.to(tl.float32)
    tmp16 = tl.where(0 != 0, 0, tl.where(0 > tmp15, 0, tmp15))
    tl.store(out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp16, None)
def get_args():
    arg_0 = rand_strided((128, 32, 112, 112), (401408, 12544, 112, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((1, 32, 1, 1), (32, 1, 32, 32), device='cuda:0', dtype=torch.float32)
    arg_2 = rand_strided((1, 32, 1, 1), (32, 1, 32, 32), device='cuda:0', dtype=torch.float32)
    arg_3 = rand_strided((32,), (1,), device='cuda:0', dtype=torch.float32)
    arg_4 = rand_strided((32,), (1,), device='cuda:0', dtype=torch.float32)
    arg_5 = rand_strided((128, 32, 112, 112), (401408, 12544, 112, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__native_batch_norm_legit_functional_relu_6.run(*args, 51380224, grid=grid(51380224), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__native_batch_norm_legit_functional_relu_6.benchmark_all_configs(*args, 51380224, grid=grid(51380224))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/bt/cbt6gouhluogn74n5bu4s6rx7uqiiw6lwhnqaggedwz4abkhco2b.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_4
#
# Pointwise copy over xnumel = 288 elements (benchmark shape (32, 1, 3, 3)).
# Each program loads a masked XBLOCK-wide slice from in_ptr0 (declared '*fp32')
# and stores it to out_ptr0 (declared '*fp16').
# NOTE(review): the explicit cast targets tl.float32, so the narrowing to fp16
# presumably happens implicitly inside tl.store -- confirm against Triton docs.
# The embedded module also defines a standalone benchmark harness
# (get_args / call / benchmark_all_configs / __main__).
triton_poi_fused__to_copy_7 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[512], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_7(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 288
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
    arg_0 = rand_strided((32, 1, 3, 3), (9, 9, 3, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((32, 1, 3, 3), (9, 9, 3, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_7.run(*args, 288, grid=grid(288), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_7.benchmark_all_configs(*args, 288, grid=grid(288))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/tq/ctqzpnsuriwiat75pazkds4p5xe54pkhlftlkhx4o3jngunwgzub.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_7
#
# Pointwise copy over xnumel = 1024 elements (benchmark shape (32, 32, 1, 1)).
# Each program loads a masked XBLOCK-wide slice from in_ptr0 (declared '*fp32')
# and stores it to out_ptr0 (declared '*fp16').
# NOTE(review): the explicit cast targets tl.float32, so the narrowing to fp16
# presumably happens implicitly inside tl.store -- confirm against Triton docs.
# The embedded module also defines a standalone benchmark harness
# (get_args / call / benchmark_all_configs / __main__).
triton_poi_fused__to_copy_8 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[1024], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_8(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 1024
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
    arg_0 = rand_strided((32, 32, 1, 1), (32, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((32, 32, 1, 1), (32, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_8.run(*args, 1024, grid=grid(1024), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_8.benchmark_all_configs(*args, 1024, grid=grid(1024))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/ma/cma4u56mzuidifjz56xk54ie6sgj6png7qekm2tx4cs7sle553zr.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.add
# aten._native_batch_norm_legit_functional => add_11, add_14, convert_element_type_8, convert_element_type_9, mul_14, mul_20, rsqrt_2, sub_2, var_mean_2
# aten.add => add_15
#
# Pointwise kernel over xnumel = 51380224 elements. With
# x1 = (xindex // 12544) % 32 selecting the per-channel operands, it computes
#   (in_ptr0[x3] - in_ptr1[x1]) * rsqrt(in_ptr2[x1] / 1605632.0 + 1e-05)
#       * in_ptr3[x1] + in_ptr4[x1]
# then adds in_ptr5[x3] element-wise and stores the sum to out_ptr0 (declared
# '*fp16'). All loads and the store pass None as the mask; xmask is computed
# but not used.
# The embedded module also defines a standalone benchmark harness
# (get_args / call / benchmark_all_configs / __main__).
triton_poi_fused__native_batch_norm_legit_functional_add_9 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[67108864], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp16', 6: '*fp16', 7: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7), equal_to_1=())]})
@triton.jit
def triton_poi_fused__native_batch_norm_legit_functional_add_9(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 51380224
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x3 = xindex
    x1 = (xindex // 12544) % 32
    tmp0 = tl.load(in_ptr0 + (x3), None).to(tl.float32)
    tmp2 = tl.load(in_ptr1 + (x1), None)
    tmp4 = tl.load(in_ptr2 + (x1), None)
    tmp11 = tl.load(in_ptr3 + (x1), None)
    tmp13 = tl.load(in_ptr4 + (x1), None)
    tmp16 = tl.load(in_ptr5 + (x3), None).to(tl.float32)
    tmp1 = tmp0.to(tl.float32)
    tmp3 = tmp1 - tmp2
    tmp5 = 1605632.0
    tmp6 = tmp4 / tmp5
    tmp7 = 1e-05
    tmp8 = tmp6 + tmp7
    tmp9 = tl.math.rsqrt(tmp8)
    tmp10 = tmp3 * tmp9
    tmp12 = tmp10 * tmp11
    tmp14 = tmp12 + tmp13
    tmp15 = tmp14.to(tl.float32)
    tmp17 = tmp15 + tmp16
    tl.store(out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp17, None)
def get_args():
    arg_0 = rand_strided((128, 32, 112, 112), (401408, 12544, 112, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((1, 32, 1, 1), (32, 1, 32, 32), device='cuda:0', dtype=torch.float32)
    arg_2 = rand_strided((1, 32, 1, 1), (32, 1, 32, 32), device='cuda:0', dtype=torch.float32)
    arg_3 = rand_strided((32,), (1,), device='cuda:0', dtype=torch.float32)
    arg_4 = rand_strided((32,), (1,), device='cuda:0', dtype=torch.float32)
    arg_5 = rand_strided((128, 32, 112, 112), (401408, 12544, 112, 1), device='cuda:0', dtype=torch.float16)
    arg_6 = rand_strided((128, 32, 112, 112), (401408, 12544, 112, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__native_batch_norm_legit_functional_add_9.run(*args, 51380224, grid=grid(51380224), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__native_batch_norm_legit_functional_add_9.benchmark_all_configs(*args, 51380224, grid=grid(51380224))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/mf/cmfq2hg7iduys7x4z6irkzuujylw3e7gj4ysdrcbo5cemukrg3t5.py
# Original ATen: aten.split_with_sizes
# aten.split_with_sizes => getitem_6
#
# Pointwise gather over xnumel = 25690112 output elements: output index
# x2 = x0 + 200704 * x1 reads in_ptr0[x0 + 401408 * x1], i.e. the first 200704
# elements of every 401408-element input row, packed contiguously into out_ptr0.
# With the benchmark shapes that is the first 16 of 32 channels of a
# (128, 32, 112, 112) fp16 tensor. The load and store pass None as the mask;
# xmask is computed but not used.
# The embedded module also defines a standalone benchmark harness
# (get_args / call / benchmark_all_configs / __main__).
triton_poi_fused_split_with_sizes_10 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[33554432], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_split_with_sizes_10(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 25690112
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex % 200704
    x1 = (xindex // 200704)
    x2 = xindex
    tmp0 = tl.load(in_ptr0 + (x0 + (401408*x1)), None).to(tl.float32)
    tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp0, None)
def get_args():
    arg_0 = rand_strided((128, 32, 112, 112), (401408, 12544, 112, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 16, 112, 112), (200704, 12544, 112, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused_split_with_sizes_10.run(*args, 25690112, grid=grid(25690112), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused_split_with_sizes_10.benchmark_all_configs(*args, 25690112, grid=grid(25690112))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/lr/clrdqz4vueql2no5zrrsw4bgmmsgoov4oautycbtlc4q5ypfrpxe.py
# Original ATen: aten.split_with_sizes
# aten.split_with_sizes => getitem_7
#
# Pointwise gather over xnumel = 25690112 output elements: output index
# x2 = x0 + 200704 * x1 reads in_ptr0[200704 + x0 + 401408 * x1], i.e. the
# second 200704 elements of every 401408-element input row, packed contiguously
# into out_ptr0. With the benchmark shapes that is the last 16 of 32 channels
# of a (128, 32, 112, 112) fp16 tensor. The load and store pass None as the
# mask; xmask is computed but not used.
# The embedded module also defines a standalone benchmark harness
# (get_args / call / benchmark_all_configs / __main__).
triton_poi_fused_split_with_sizes_11 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[33554432], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_split_with_sizes_11(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 25690112
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex % 200704
    x1 = (xindex // 200704)
    x2 = xindex
    tmp0 = tl.load(in_ptr0 + (200704 + x0 + (401408*x1)), None).to(tl.float32)
    tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp0, None)
def get_args():
    arg_0 = rand_strided((128, 32, 112, 112), (401408, 12544, 112, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 16, 112, 112), (200704, 12544, 112, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused_split_with_sizes_11.run(*args, 25690112, grid=grid(25690112), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused_split_with_sizes_11.benchmark_all_configs(*args, 25690112, grid=grid(25690112))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/wk/cwkoecpvylkd2zd4p2qzrsxvdjpmlhjy2vqltsena3lm7tbpxnqv.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_10
#
# Elementwise copy of xnumel = 1536 values from an fp32 pointer (signature
# arg 0: '*fp32') to an fp16 pointer (signature arg 1: '*fp16'). Both the load
# and the store are guarded by xmask = xindex < xnumel.
# NOTE(review): the in-kernel cast targets tl.float32, so the narrowing to fp16
# presumably happens when tl.store writes through the fp16 pointer - confirm
# against Triton's store semantics.
#
# The string is a complete module: get_args() builds contiguous
# (96, 16, 1, 1) benchmark tensors on cuda:0, call() launches the kernel on
# device 0, benchmark_all_configs() forwards to the kernel object's method of
# the same name, and the __main__ section prints milliseconds, GB and GB/s.
triton_poi_fused__to_copy_12 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[2048], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_12(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 1536
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
    arg_0 = rand_strided((96, 16, 1, 1), (16, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((96, 16, 1, 1), (16, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_12.run(*args, 1536, grid=grid(1536), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_12.benchmark_all_configs(*args, 1536, grid=grid(1536))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/yj/cyjbfwxbzmq4snbyqnu5ehgb3xenlxtbixoczmyjqmdjbuu22kqa.py
# Original ATen: aten.cat
# aten.cat => cat
#
# Pointwise copy over xnumel = 154140672 elements. The input is read linearly
# and scattered into a destination with a larger outer stride:
#     out_ptr0[(i % 1204224) + 2408448 * (i // 1204224)] = in_ptr0[i]
# With the get_args() shapes both tensors are (128, 96, 112, 112) fp16; the
# input is contiguous (batch stride 1204224) while the output has batch stride
# 2408448, i.e. only the first half of each output batch row is written.
# NOTE(review): presumably out_ptr0 is a view into the concatenated result
# buffer - confirm against the caller.
# xmask is computed, but the load and the store both pass None as their mask.
#
# The string is a complete module: it also defines get_args(), call(),
# benchmark_all_configs() and a __main__ benchmark that prints milliseconds,
# GB and GB/s.
triton_poi_fused_cat_13 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[268435456], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_cat_13(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 154140672
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x2 = xindex
    x0 = xindex % 1204224
    x1 = (xindex // 1204224)
    tmp0 = tl.load(in_ptr0 + (x2), None).to(tl.float32)
    tl.store(out_ptr0 + (x0 + (2408448*x1) + tl.zeros([XBLOCK], tl.int32)), tmp0, None)
def get_args():
    arg_0 = rand_strided((128, 96, 112, 112), (1204224, 12544, 112, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 96, 112, 112), (2408448, 12544, 112, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused_cat_13.run(*args, 154140672, grid=grid(154140672), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused_cat_13.benchmark_all_configs(*args, 154140672, grid=grid(154140672))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/nh/cnh3nbcj2c5nraypyukh7gkg7ocvsbjrgj7ok7wc472nt34wy65s.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => convert_element_type_12, var_mean_3
#
# Looped reduction producing xnumel = 2496 = 192 * 13 partial sums.
# For output index x: x0 = x % 192 and x1 = x // 192. Each output sums
# rnumel = 123511 input values whose flattened position is
# p = r + 123511 * x1, loaded from
#     in_ptr0[12544 * x0 + 2408448 * ((p // 12544) % 128) + (p % 12544)]
# Positions with p >= 1605632 (= 128 * 12544) are masked out of the load and
# contribute 0; this guard is needed because 13 * 123511 = 1605643 overshoots
# 1605632 by 11.
# The accumulator _tmp6 is an [XBLOCK, RBLOCK] fp32 tile updated per RBLOCK
# chunk; after the loop it is summed along axis 1 and stored to out_ptr0[x].
# With the get_args() shapes the input is (128, 192, 112, 112) fp16 and the
# output is (1, 192, 1, 1, 13) fp32 with strides that put element
# (channel x0, chunk x1) at x0 + 192 * x1.
#
# The string is a complete module: it also defines get_args(), call(),
# benchmark_all_configs() and a __main__ benchmark that prints milliseconds,
# GB and GB/s.
triton_red_fused__native_batch_norm_legit_functional_14 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@reduction(
    size_hints=[4096, 131072],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32', 3: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_14(in_ptr0, out_ptr0, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
    xnumel = 2496
    rnumel = 123511
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rbase = tl.arange(0, RBLOCK)[None, :]
    x1 = (xindex // 192)
    x0 = xindex % 192
    _tmp6 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
    x3 = xindex
    for roffset in range(0, rnumel, RBLOCK):
        rindex = roffset + rbase
        rmask = rindex < rnumel
        r2 = rindex
        tmp0 = r2 + (123511*x1)
        tmp1 = 1605632
        tmp2 = tmp0 < tmp1
        tmp3 = tl.load(in_ptr0 + ((12544*x0) + (2408448*(((r2 + (123511*x1)) // 12544) % 128)) + ((r2 + (123511*x1)) % 12544) + tl.zeros([XBLOCK, RBLOCK], tl.int32)), rmask & tmp2 & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp4 = tmp3.to(tl.float32)
        tmp5 = tl.where(tmp2, tmp4, 0)
        _tmp6 = tl.where(rmask & xmask, _tmp6 + tmp5, _tmp6)
    tmp6 = tl.sum(_tmp6, 1)[:, None]
    tl.store(out_ptr0 + x3, tmp6, xmask)
def get_args():
    arg_0 = rand_strided((128, 192, 112, 112), (2408448, 12544, 112, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((1, 192, 1, 1, 13), (2496, 1, 2496, 2496, 192), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_red_fused__native_batch_norm_legit_functional_14.run(*args, 2496, 123511, grid=grid(2496), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_red_fused__native_batch_norm_legit_functional_14.benchmark_all_configs(*args, 2496, 123511, grid=grid(2496))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/ed/cedbp3dhcc6xvctcve537pwhmiwi2ebf5oi5f5bppm5whslbp34w.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => add_18, convert_element_type_12, mul_22, mul_23, var_mean_3
#
# Persistent reduction over xnumel = 192 outputs with rnumel = 13 inputs each
# (RBLOCK is fixed at 16; lanes with rindex >= 13 are masked to 0).
# For each x0:
#     total = sum over r of in_ptr0[x0 + 192 * r]
#     mean  = total / 1605632.0                 -> stored to in_out_ptr0[x0]
#     mixed = 0.1 * mean + 0.9 * in_ptr1[x0]    -> stored to out_ptr0[x0]
# in_out_ptr0 is only written here, never read; it is listed under
# 'mutated_arg_names' and the benchmark passes num_in_out_args=1.
# NOTE(review): given the op name, `mixed` is presumably the running-mean
# update with momentum 0.1 - confirm against the calling graph.
#
# The string is a complete module: it also defines get_args() (arguments in
# the order in_out_ptr0, in_ptr0, in_ptr1, out_ptr0), call(),
# benchmark_all_configs() and a __main__ benchmark that prints milliseconds,
# GB and GB/s.
triton_per_fused__native_batch_norm_legit_functional_15 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import persistent_reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@persistent_reduction(
    size_hints=[256, 16],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*fp32', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: 'i32', 5: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4), equal_to_1=())]}
)
@triton.jit
def triton_per_fused__native_batch_norm_legit_functional_15(in_out_ptr0, in_ptr0, in_ptr1, out_ptr0, xnumel, rnumel, XBLOCK : tl.constexpr):
    xnumel = 192
    rnumel = 13
    RBLOCK: tl.constexpr = 16
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rindex = tl.arange(0, RBLOCK)[None, :]
    rmask = rindex < rnumel
    r1 = rindex
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0 + (192*r1)), rmask & xmask, other=0)
    tmp8 = tl.load(in_ptr1 + (x0), xmask)
    tmp2 = tl.where(rmask & xmask, tmp0, 0)
    tmp3 = tl.sum(tmp2, 1)[:, None]
    tmp4 = 1605632.0
    tmp5 = tmp3 / tmp4
    tmp6 = 0.1
    tmp7 = tmp5 * tmp6
    tmp9 = 0.9
    tmp10 = tmp8 * tmp9
    tmp11 = tmp7 + tmp10
    tl.store(in_out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp5, xmask)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp11, xmask)
def get_args():
    arg_0 = rand_strided((1, 192, 1, 1), (192, 1, 192, 192), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((1, 192, 1, 1, 13), (2496, 1, 2496, 2496, 192), device='cuda:0', dtype=torch.float32)
    arg_2 = rand_strided((192,), (1,), device='cuda:0', dtype=torch.float32)
    arg_3 = rand_strided((192,), (1,), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1, arg_2, arg_3,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_per_fused__native_batch_norm_legit_functional_15.run(*args, 192, 13, grid=grid(192), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_per_fused__native_batch_norm_legit_functional_15.benchmark_all_configs(*args, 192, 13, grid=grid(192))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/jk/cjk2vm3446xrk7rth7hr6pun7xxo3dnzubwcn6ydrpifal4eykrz.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => convert_element_type_12, var_mean_3
#
# Looped reduction producing xnumel = 2496 = 192 * 13 partial sums of squared
# differences. For output index x: x0 = x % 192 and x1 = x // 192. Each output
# accumulates, over rnumel = 123511 values of r with p = r + 123511 * x1,
#     (in_ptr0[12544 * x0 + 2408448 * ((p // 12544) % 128) + (p % 12544)]
#      - in_ptr1[x0]) ** 2
# Positions with p >= 1605632 (= 128 * 12544) are masked out of both loads and
# contribute 0; 13 * 123511 = 1605643 overshoots 1605632 by 11.
# The accumulator _tmp9 is an [XBLOCK, RBLOCK] fp32 tile; after the loop it is
# summed along axis 1 and stored to out_ptr0[x].
# NOTE(review): in_ptr1 is a per-x0 fp32 value of shape (1, 192, 1, 1) in
# get_args(); presumably the per-channel mean - confirm against the caller.
#
# The string is a complete module: it also defines get_args(), call(),
# benchmark_all_configs() and a __main__ benchmark that prints milliseconds,
# GB and GB/s.
triton_red_fused__native_batch_norm_legit_functional_16 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@reduction(
    size_hints=[4096, 131072],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*fp16', 1: '*fp32', 2: '*fp32', 3: 'i32', 4: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_16(in_ptr0, in_ptr1, out_ptr0, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
    xnumel = 2496
    rnumel = 123511
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rbase = tl.arange(0, RBLOCK)[None, :]
    x1 = (xindex // 192)
    x0 = xindex % 192
    _tmp9 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
    x3 = xindex
    for roffset in range(0, rnumel, RBLOCK):
        rindex = roffset + rbase
        rmask = rindex < rnumel
        r2 = rindex
        tmp0 = r2 + (123511*x1)
        tmp1 = 1605632
        tmp2 = tmp0 < tmp1
        tmp3 = tl.load(in_ptr0 + ((12544*x0) + (2408448*(((r2 + (123511*x1)) // 12544) % 128)) + ((r2 + (123511*x1)) % 12544) + tl.zeros([XBLOCK, RBLOCK], tl.int32)), rmask & tmp2 & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp4 = tmp3.to(tl.float32)
        tmp5 = tl.load(in_ptr1 + (x0 + tl.zeros([XBLOCK, RBLOCK], tl.int32)), rmask & tmp2 & xmask, eviction_policy='evict_last', other=0)
        tmp6 = tmp4 - tmp5
        tmp7 = tmp6 * tmp6
        tmp8 = tl.where(tmp2, tmp7, 0)
        _tmp9 = tl.where(rmask & xmask, _tmp9 + tmp8, _tmp9)
    tmp9 = tl.sum(_tmp9, 1)[:, None]
    tl.store(out_ptr0 + x3, tmp9, xmask)
def get_args():
    arg_0 = rand_strided((128, 192, 112, 112), (2408448, 12544, 112, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((1, 192, 1, 1), (192, 1, 192, 192), device='cuda:0', dtype=torch.float32)
    arg_2 = rand_strided((1, 192, 1, 1, 13), (2496, 1, 2496, 2496, 192), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1, arg_2,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_red_fused__native_batch_norm_legit_functional_16.run(*args, 2496, 123511, grid=grid(2496), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_red_fused__native_batch_norm_legit_functional_16.benchmark_all_configs(*args, 2496, 123511, grid=grid(2496))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/r2/cr2l4iahxzbm4xda53lhc7nels7ttppa4wfso67ohv6lwu4ut2ei.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => add_17, add_19, convert_element_type_12, mul_24, mul_25, mul_26, rsqrt_3, squeeze_10, var_mean_3
#
# Persistent reduction over xnumel = 192 outputs with rnumel = 13 inputs each
# (RBLOCK is fixed at 16; lanes with rindex >= 13 are masked to 0).
# For each x0:
#     total = sum over r of in_ptr0[x0 + 192 * r]       -> out_ptr0[x0]
#     avg   = total / 1605632.0
#     rsqrt(avg + 1e-05)                                -> out_ptr1[x0]
#     0.1 * (avg * 1.0000006228081046)
#         + 0.9 * in_ptr1[x0]                           -> out_ptr2[x0]
# The factor 1.0000006228081046 numerically equals 1605632 / 1605631.
# NOTE(review): given the op name, in_ptr0 presumably holds partial sums of
# squared deviations, making `avg` the biased variance, the scaled value the
# unbiased variance, and out_ptr2 the running-variance update with momentum
# 0.1 - confirm against the calling graph.
#
# The string is a complete module: it also defines get_args() (arguments in
# the order in_ptr0, in_ptr1, out_ptr0, out_ptr1, out_ptr2), call(),
# benchmark_all_configs() and a __main__ benchmark that prints milliseconds,
# GB and GB/s.
triton_per_fused__native_batch_norm_legit_functional_17 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import persistent_reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@persistent_reduction(
    size_hints=[256, 16],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*fp32', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: 'i32', 6: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5), equal_to_1=())]}
)
@triton.jit
def triton_per_fused__native_batch_norm_legit_functional_17(in_ptr0, in_ptr1, out_ptr0, out_ptr1, out_ptr2, xnumel, rnumel, XBLOCK : tl.constexpr):
    xnumel = 192
    rnumel = 13
    RBLOCK: tl.constexpr = 16
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rindex = tl.arange(0, RBLOCK)[None, :]
    rmask = rindex < rnumel
    r1 = rindex
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0 + (192*r1)), rmask & xmask, other=0)
    tmp13 = tl.load(in_ptr1 + (x0), xmask)
    tmp2 = tl.where(rmask & xmask, tmp0, 0)
    tmp3 = tl.sum(tmp2, 1)[:, None]
    tmp4 = 1605632.0
    tmp5 = tmp3 / tmp4
    tmp6 = 1e-05
    tmp7 = tmp5 + tmp6
    tmp8 = tl.math.rsqrt(tmp7)
    tmp9 = 1.0000006228081046
    tmp10 = tmp5 * tmp9
    tmp11 = 0.1
    tmp12 = tmp10 * tmp11
    tmp14 = 0.9
    tmp15 = tmp13 * tmp14
    tmp16 = tmp12 + tmp15
    tl.store(out_ptr1 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp8, xmask)
    tl.store(out_ptr2 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp16, xmask)
    tl.store(out_ptr0 + x0, tmp3, xmask)
def get_args():
    arg_0 = rand_strided((1, 192, 1, 1, 13), (2496, 1, 2496, 2496, 192), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((192,), (1,), device='cuda:0', dtype=torch.float32)
    arg_2 = rand_strided((1, 192, 1, 1), (192, 1, 192, 192), device='cuda:0', dtype=torch.float32)
    arg_3 = rand_strided((192,), (1,), device='cuda:0', dtype=torch.float32)
    arg_4 = rand_strided((192,), (1,), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1, arg_2, arg_3, arg_4,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_per_fused__native_batch_norm_legit_functional_17.run(*args, 192, 13, grid=grid(192), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_per_fused__native_batch_norm_legit_functional_17.benchmark_all_configs(*args, 192, 13, grid=grid(192))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/7k/c7k35ubu25j3ryiaxnuo2fjojvhqvy67tox3wtaa4knc5l4vzqby.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.relu, aten.threshold_backward
# aten._native_batch_norm_legit_functional => add_17, add_20, convert_element_type_12, convert_element_type_13, mul_21, mul_27, rsqrt_3, sub_3, var_mean_3
# aten.relu => relu_2
# aten.threshold_backward => le_4
#
# Pointwise kernel over xnumel = 308281344 elements. For linear index i with
# c = (i // 12544) % 192:
#     y = (in_ptr0[i] - in_ptr1[c])
#         * rsqrt(in_ptr2[c] / 1605632.0 + 1e-05)
#         * in_ptr3[c] + in_ptr4[c]
#     r = 0 if y < 0 else y          (the nested tl.where; the outer
#                                     `0 != 0` condition is constant False)
#     out_ptr0[i] = r                (fp16 pointer per the signature)
#     out_ptr1[i] = r <= 0.0         (i1 pointer per the signature)
# xmask is computed, but every load and store passes None as its mask.
# NOTE(review): given the op names, in_ptr1..in_ptr4 are presumably the
# per-channel mean, summed squared deviation, weight and bias of a batch norm,
# and out_ptr1 the mask later consumed by the ReLU backward - confirm against
# the calling graph.
#
# The string is a complete module: it also defines get_args(), call(),
# benchmark_all_configs() and a __main__ benchmark that prints milliseconds,
# GB and GB/s.
triton_poi_fused__native_batch_norm_legit_functional_relu_threshold_backward_18 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[536870912], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp16', 6: '*i1', 7: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7), equal_to_1=())]})
@triton.jit
def triton_poi_fused__native_batch_norm_legit_functional_relu_threshold_backward_18(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, out_ptr0, out_ptr1, xnumel, XBLOCK : tl.constexpr):
    xnumel = 308281344
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x3 = xindex
    x1 = (xindex // 12544) % 192
    tmp0 = tl.load(in_ptr0 + (x3), None).to(tl.float32)
    tmp2 = tl.load(in_ptr1 + (x1), None)
    tmp4 = tl.load(in_ptr2 + (x1), None)
    tmp11 = tl.load(in_ptr3 + (x1), None)
    tmp13 = tl.load(in_ptr4 + (x1), None)
    tmp1 = tmp0.to(tl.float32)
    tmp3 = tmp1 - tmp2
    tmp5 = 1605632.0
    tmp6 = tmp4 / tmp5
    tmp7 = 1e-05
    tmp8 = tmp6 + tmp7
    tmp9 = tl.math.rsqrt(tmp8)
    tmp10 = tmp3 * tmp9
    tmp12 = tmp10 * tmp11
    tmp14 = tmp12 + tmp13
    tmp15 = tmp14.to(tl.float32)
    tmp16 = tl.where(0 != 0, 0, tl.where(0 > tmp15, 0, tmp15))
    tmp17 = 0.0
    tmp18 = tmp16 <= tmp17
    tl.store(out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp16, None)
    tl.store(out_ptr1 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp18, None)
def get_args():
    arg_0 = rand_strided((128, 192, 112, 112), (2408448, 12544, 112, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((1, 192, 1, 1), (192, 1, 192, 192), device='cuda:0', dtype=torch.float32)
    arg_2 = rand_strided((1, 192, 1, 1), (192, 1, 192, 192), device='cuda:0', dtype=torch.float32)
    arg_3 = rand_strided((192,), (1,), device='cuda:0', dtype=torch.float32)
    arg_4 = rand_strided((192,), (1,), device='cuda:0', dtype=torch.float32)
    arg_5 = rand_strided((128, 192, 112, 112), (2408448, 12544, 112, 1), device='cuda:0', dtype=torch.float16)
    arg_6 = rand_strided((128, 192, 112, 112), (2408448, 12544, 112, 1), device='cuda:0', dtype=torch.bool)
    return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__native_batch_norm_legit_functional_relu_threshold_backward_18.run(*args, 308281344, grid=grid(308281344), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__native_batch_norm_legit_functional_relu_threshold_backward_18.benchmark_all_configs(*args, 308281344, grid=grid(308281344))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/db/cdbk64vwqs4pzrtrb762fxgfklb7q573zm3neisxuhpcneczshvk.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_14
#
# Elementwise copy of xnumel = 576 values from an fp32 pointer (signature
# arg 0: '*fp32') to an fp16 pointer (signature arg 1: '*fp16'). Both the load
# and the store are guarded by xmask = xindex < xnumel.
# NOTE(review): the in-kernel cast targets tl.float32, so the narrowing to fp16
# presumably happens when tl.store writes through the fp16 pointer - confirm
# against Triton's store semantics.
#
# The string is a complete module: get_args() builds contiguous (64, 1, 3, 3)
# benchmark tensors on cuda:0, call() launches the kernel on device 0,
# benchmark_all_configs() forwards to the kernel object's method of the same
# name, and the __main__ section prints milliseconds, GB and GB/s.
triton_poi_fused__to_copy_19 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[1024], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_19(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 576
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
    arg_0 = rand_strided((64, 1, 3, 3), (9, 9, 3, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((64, 1, 3, 3), (9, 9, 3, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_19.run(*args, 576, grid=grid(576), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_19.benchmark_all_configs(*args, 576, grid=grid(576))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/fq/cfq5oxsb4jr7dbzocvxxr77nzkg2jk4k3rsmhwdpzkoisxiavav5.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_15
#
# Elementwise copy of xnumel = 1600 values from an fp32 pointer (signature
# arg 0: '*fp32') to an fp16 pointer (signature arg 1: '*fp16'). Both the load
# and the store are guarded by xmask = xindex < xnumel.
# NOTE(review): the in-kernel cast targets tl.float32, so the narrowing to fp16
# presumably happens when tl.store writes through the fp16 pointer - confirm
# against Triton's store semantics.
#
# The string is a complete module: get_args() builds contiguous (64, 1, 5, 5)
# benchmark tensors on cuda:0, call() launches the kernel on device 0,
# benchmark_all_configs() forwards to the kernel object's method of the same
# name, and the __main__ section prints milliseconds, GB and GB/s.
triton_poi_fused__to_copy_20 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[2048], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_20(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 1600
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
    arg_0 = rand_strided((64, 1, 5, 5), (25, 25, 5, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((64, 1, 5, 5), (25, 25, 5, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_20.run(*args, 1600, grid=grid(1600), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_20.benchmark_all_configs(*args, 1600, grid=grid(1600))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/fo/cfoj5fwz53k7xkfcu6z676snb5mum4wtda3diyiuxkkhrnlissnf.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_16
#
# Elementwise copy of xnumel = 3136 values from an fp32 pointer (signature
# arg 0: '*fp32') to an fp16 pointer (signature arg 1: '*fp16'). Both the load
# and the store are guarded by xmask = xindex < xnumel.
# NOTE(review): the in-kernel cast targets tl.float32, so the narrowing to fp16
# presumably happens when tl.store writes through the fp16 pointer - confirm
# against Triton's store semantics.
#
# The string is a complete module: get_args() builds contiguous (64, 1, 7, 7)
# benchmark tensors on cuda:0, call() launches the kernel on device 0,
# benchmark_all_configs() forwards to the kernel object's method of the same
# name, and the __main__ section prints milliseconds, GB and GB/s.
triton_poi_fused__to_copy_21 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4096], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_21(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 3136
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
    arg_0 = rand_strided((64, 1, 7, 7), (49, 49, 7, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((64, 1, 7, 7), (49, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_21.run(*args, 3136, grid=grid(3136), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_21.benchmark_all_configs(*args, 3136, grid=grid(3136))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/ye/cyesscl4aqq7rzyjhll5ppohr4h424gcitducugsmauai5wnvgy5.py
# Original ATen: aten.cat
# aten.cat => cat_1
#
# Pointwise copy over xnumel = 25690112 elements. The input is read linearly
# and scattered into a destination with a larger outer stride:
#     out_ptr0[(i % 200704) + 602112 * (i // 200704)] = in_ptr0[i]
# With the get_args() shapes both tensors are (128, 64, 56, 56) fp16; the
# input is contiguous (batch stride 200704) while the output has batch stride
# 602112, i.e. only the first third of each output batch row is written.
# NOTE(review): presumably out_ptr0 is a view into the concatenated result
# buffer - confirm against the caller.
# xmask is computed, but the load and the store both pass None as their mask.
#
# The string is a complete module: it also defines get_args(), call(),
# benchmark_all_configs() and a __main__ benchmark that prints milliseconds,
# GB and GB/s.
triton_poi_fused_cat_22 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[33554432], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_cat_22(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 25690112
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x2 = xindex
    x0 = xindex % 200704
    x1 = (xindex // 200704)
    tmp0 = tl.load(in_ptr0 + (x2), None).to(tl.float32)
    tl.store(out_ptr0 + (x0 + (602112*x1) + tl.zeros([XBLOCK], tl.int32)), tmp0, None)
def get_args():
    arg_0 = rand_strided((128, 64, 56, 56), (200704, 3136, 56, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 64, 56, 56), (602112, 3136, 56, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused_cat_22.run(*args, 25690112, grid=grid(25690112), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused_cat_22.benchmark_all_configs(*args, 25690112, grid=grid(25690112))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/zm/czmjlzsknic276jj2ulhzekdx7suavkcesp75xqpnfbntqprzsrx.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => convert_element_type_17, var_mean_4
#
# Looped reduction kernel: partial sums of an fp16 input, accumulated in fp32.
#   in_ptr0  : fp16 input ('*fp16' in the signature); get_args() models it as
#              (128, 192, 56, 56) with strides (602112, 3136, 56, 1).
#   out_ptr0 : fp32 output holding 768 partial sums; get_args() models it as
#              (1, 192, 1, 1, 4) with strides (768, 1, 768, 768, 192).
#   xnumel / rnumel : runtime arguments that the kernel immediately overwrites
#              with the constants 768 and 100352.
# Output element x = x0 + 192*x1 (x0 = x % 192, x1 = x // 192) is the sum of the
# input elements at offset 3136*x0 + 19267584*x1 + 602112*(r // 3136) + r % 3136
# for r in [0, 100352).  With the get_args() layout this is one channel (x0)
# over a group of 32 batch entries (19267584 == 32 * 602112 and
# 100352 == 32 * 3136).
# The partial sums are presumably combined by a later kernel; the call site is
# not part of this block.
# The trailing `__main__` section is a standalone benchmark: it times call()
# with triton.testing.do_bench and prints milliseconds, GB moved and GB/s.
triton_red_fused__native_batch_norm_legit_functional_23 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@reduction(
    size_hints=[1024, 131072],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32', 3: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_23(in_ptr0, out_ptr0, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
    xnumel = 768
    rnumel = 100352
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rbase = tl.arange(0, RBLOCK)[None, :]
    x0 = xindex % 192
    x1 = (xindex // 192)
    _tmp2 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
    x3 = xindex
    for roffset in range(0, rnumel, RBLOCK):
        rindex = roffset + rbase
        rmask = rindex < rnumel
        r2 = rindex
        tmp0 = tl.load(in_ptr0 + ((3136*x0) + (602112*(r2 // 3136)) + (19267584*x1) + (r2 % 3136)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp1 = tmp0.to(tl.float32)
        _tmp2 = tl.where(rmask & xmask, _tmp2 + tmp1, _tmp2)
    tmp2 = tl.sum(_tmp2, 1)[:, None]
    tl.store(out_ptr0 + x3, tmp2, xmask)


def get_args():
    arg_0 = rand_strided((128, 192, 56, 56), (602112, 3136, 56, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((1, 192, 1, 1, 4), (768, 1, 768, 768, 192), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_red_fused__native_batch_norm_legit_functional_23.run(*args, 768, 100352, grid=grid(768), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_red_fused__native_batch_norm_legit_functional_23.benchmark_all_configs(*args, 768, 100352, grid=grid(768))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/zp/czpkjhwvqvlhuxsldhiwsf77gmyfxqbtysyyhp7lberdnzehm42l.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => add_23, convert_element_type_17, mul_29, mul_30, var_mean_4
#
# Persistent reduction kernel (RBLOCK fixed at 4): for each of 192 positions x0
# it sums the 4 fp32 values in_ptr0[x0 + 192*r] and divides by 401408.0.
#   in_out_ptr0 : fp32, 192 elements; receives sum / 401408.0.  It is listed in
#                 'mutated_arg_names' and is only written here, never read.
#   in_ptr0     : fp32 partial sums; get_args() models it as (1, 192, 1, 1, 4)
#                 with strides (768, 1, 768, 768, 192).
#   in_ptr1     : fp32, 192 elements; the previous value blended into out_ptr0.
#   out_ptr0    : fp32, 192 elements; receives 0.1 * mean + 0.9 * in_ptr1[x0].
#   xnumel / rnumel : overwritten inside the kernel with 192 and 4.
# 401408 equals 128 * 56 * 56, i.e. the per-channel element count of a
# (128, C, 56, 56) tensor.  The 0.1 / 0.9 blend looks like the batch-norm
# running-mean update with momentum 0.1 -- TODO confirm against the caller.
# The trailing `__main__` section is a standalone benchmark of call().
triton_per_fused__native_batch_norm_legit_functional_24 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import persistent_reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@persistent_reduction(
    size_hints=[256, 4],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*fp32', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: 'i32', 5: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4), equal_to_1=())]}
)
@triton.jit
def triton_per_fused__native_batch_norm_legit_functional_24(in_out_ptr0, in_ptr0, in_ptr1, out_ptr0, xnumel, rnumel, XBLOCK : tl.constexpr):
    xnumel = 192
    rnumel = 4
    RBLOCK: tl.constexpr = 4
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rindex = tl.arange(0, RBLOCK)[None, :]
    rmask = rindex < rnumel
    r1 = rindex
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0 + (192*r1)), rmask & xmask, other=0)
    tmp8 = tl.load(in_ptr1 + (x0), xmask)
    tmp2 = tl.where(rmask & xmask, tmp0, 0)
    tmp3 = tl.sum(tmp2, 1)[:, None]
    tmp4 = 401408.0
    tmp5 = tmp3 / tmp4
    tmp6 = 0.1
    tmp7 = tmp5 * tmp6
    tmp9 = 0.9
    tmp10 = tmp8 * tmp9
    tmp11 = tmp7 + tmp10
    tl.store(in_out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp5, xmask)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp11, xmask)


def get_args():
    arg_0 = rand_strided((1, 192, 1, 1), (192, 1, 192, 192), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((1, 192, 1, 1, 4), (768, 1, 768, 768, 192), device='cuda:0', dtype=torch.float32)
    arg_2 = rand_strided((192,), (1,), device='cuda:0', dtype=torch.float32)
    arg_3 = rand_strided((192,), (1,), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1, arg_2, arg_3,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_per_fused__native_batch_norm_legit_functional_24.run(*args, 192, 4, grid=grid(192), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_per_fused__native_batch_norm_legit_functional_24.benchmark_all_configs(*args, 192, 4, grid=grid(192))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/4o/c4ohubk3q7bkigrpxxijrzzlhs5bqi6qcn3fxomltgipjmowqgd2.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => convert_element_type_17, var_mean_4
#
# Looped reduction kernel: partial sums of squared deviations, accumulated in
# fp32.
#   in_ptr0  : fp16 input; get_args() models it as (128, 192, 56, 56) with
#              strides (602112, 3136, 56, 1).
#   in_ptr1  : fp32 value subtracted from every input element, indexed by
#              x0 = x % 192; get_args() models it as (1, 192, 1, 1).
#              Presumably the per-channel mean -- TODO confirm at the call site.
#   out_ptr0 : fp32 output holding 768 partial sums; get_args() models it as
#              (1, 192, 1, 1, 4) with strides (768, 1, 768, 768, 192).
#   xnumel / rnumel : overwritten inside the kernel with 768 and 100352.
# Output element x = x0 + 192*x1 is sum((in_ptr0[off(r)] - in_ptr1[x0]) ** 2)
# over r in [0, 100352), with
# off(r) = 3136*x0 + 19267584*x1 + 602112*(r // 3136) + r % 3136.
# The trailing `__main__` section is a standalone benchmark of call().
triton_red_fused__native_batch_norm_legit_functional_25 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@reduction(
    size_hints=[1024, 131072],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*fp16', 1: '*fp32', 2: '*fp32', 3: 'i32', 4: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_25(in_ptr0, in_ptr1, out_ptr0, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
    xnumel = 768
    rnumel = 100352
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rbase = tl.arange(0, RBLOCK)[None, :]
    x0 = xindex % 192
    x1 = (xindex // 192)
    tmp2 = tl.load(in_ptr1 + (x0), xmask)
    _tmp5 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
    x3 = xindex
    for roffset in range(0, rnumel, RBLOCK):
        rindex = roffset + rbase
        rmask = rindex < rnumel
        r2 = rindex
        tmp0 = tl.load(in_ptr0 + ((3136*x0) + (602112*(r2 // 3136)) + (19267584*x1) + (r2 % 3136)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp1 = tmp0.to(tl.float32)
        tmp3 = tmp1 - tmp2
        tmp4 = tmp3 * tmp3
        _tmp5 = tl.where(rmask & xmask, _tmp5 + tmp4, _tmp5)
    tmp5 = tl.sum(_tmp5, 1)[:, None]
    tl.store(out_ptr0 + x3, tmp5, xmask)


def get_args():
    arg_0 = rand_strided((128, 192, 56, 56), (602112, 3136, 56, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((1, 192, 1, 1), (192, 1, 192, 192), device='cuda:0', dtype=torch.float32)
    arg_2 = rand_strided((1, 192, 1, 1, 4), (768, 1, 768, 768, 192), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1, arg_2,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_red_fused__native_batch_norm_legit_functional_25.run(*args, 768, 100352, grid=grid(768), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_red_fused__native_batch_norm_legit_functional_25.benchmark_all_configs(*args, 768, 100352, grid=grid(768))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/kw/ckworjll2g6m5sbalty5hh27jhwuiyj3iqglt67oatdxz54d3igy.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => add_22, add_24, convert_element_type_17, mul_31, mul_32, mul_33, rsqrt_4, squeeze_13, var_mean_4
#
# Persistent reduction kernel (RBLOCK fixed at 4): for each of 192 positions x0
# it sums the 4 fp32 values in_ptr0[x0 + 192*r] into `total` and derives three
# outputs from it.
#   in_ptr0  : fp32 partial sums; get_args() models it as (1, 192, 1, 1, 4)
#              with strides (768, 1, 768, 768, 192).
#   in_ptr1  : fp32, 192 elements; the previous value blended into out_ptr2.
#   out_ptr0 : receives `total` itself (the undivided sum).
#   out_ptr1 : receives rsqrt(total / 401408.0 + 1e-05).
#   out_ptr2 : receives
#              0.1 * (total / 401408.0 * 1.0000024912370735) + 0.9 * in_ptr1[x0].
#   xnumel / rnumel : overwritten inside the kernel with 192 and 4.
# 1.0000024912370735 is numerically 401408 / 401407, so it looks like the
# N / (N - 1) correction applied before a running-variance update with
# momentum 0.1 -- TODO confirm against the caller.
# The trailing `__main__` section is a standalone benchmark of call().
triton_per_fused__native_batch_norm_legit_functional_26 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import persistent_reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@persistent_reduction(
    size_hints=[256, 4],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*fp32', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: 'i32', 6: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5), equal_to_1=())]}
)
@triton.jit
def triton_per_fused__native_batch_norm_legit_functional_26(in_ptr0, in_ptr1, out_ptr0, out_ptr1, out_ptr2, xnumel, rnumel, XBLOCK : tl.constexpr):
    xnumel = 192
    rnumel = 4
    RBLOCK: tl.constexpr = 4
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rindex = tl.arange(0, RBLOCK)[None, :]
    rmask = rindex < rnumel
    r1 = rindex
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0 + (192*r1)), rmask & xmask, other=0)
    tmp13 = tl.load(in_ptr1 + (x0), xmask)
    tmp2 = tl.where(rmask & xmask, tmp0, 0)
    tmp3 = tl.sum(tmp2, 1)[:, None]
    tmp4 = 401408.0
    tmp5 = tmp3 / tmp4
    tmp6 = 1e-05
    tmp7 = tmp5 + tmp6
    tmp8 = tl.math.rsqrt(tmp7)
    tmp9 = 1.0000024912370735
    tmp10 = tmp5 * tmp9
    tmp11 = 0.1
    tmp12 = tmp10 * tmp11
    tmp14 = 0.9
    tmp15 = tmp13 * tmp14
    tmp16 = tmp12 + tmp15
    tl.store(out_ptr1 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp8, xmask)
    tl.store(out_ptr2 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp16, xmask)
    tl.store(out_ptr0 + x0, tmp3, xmask)


def get_args():
    arg_0 = rand_strided((1, 192, 1, 1, 4), (768, 1, 768, 768, 192), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((192,), (1,), device='cuda:0', dtype=torch.float32)
    arg_2 = rand_strided((1, 192, 1, 1), (192, 1, 192, 192), device='cuda:0', dtype=torch.float32)
    arg_3 = rand_strided((192,), (1,), device='cuda:0', dtype=torch.float32)
    arg_4 = rand_strided((192,), (1,), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1, arg_2, arg_3, arg_4,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_per_fused__native_batch_norm_legit_functional_26.run(*args, 192, 4, grid=grid(192), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_per_fused__native_batch_norm_legit_functional_26.benchmark_all_configs(*args, 192, 4, grid=grid(192))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/rb/crbx6irdjcatfhmyhdhxdmxoun65x4lirw3usotlxxknzb3rnzo3.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.relu, aten.threshold_backward
# aten._native_batch_norm_legit_functional => add_22, add_25, convert_element_type_17, convert_element_type_18, mul_28, mul_34, rsqrt_4, sub_4, var_mean_4
# aten.relu => relu_3
# aten.threshold_backward => le_3
#
# Pointwise kernel over 77070336 elements (== 128 * 192 * 56 * 56).  For the
# flat index x3, with x1 = (x3 // 3136) % 192 selecting the per-channel
# operands, it computes
#     y = (in_ptr0[x3] - in_ptr1[x1]) * rsqrt(in_ptr2[x1] / 401408.0 + 1e-05)
#         * in_ptr3[x1] + in_ptr4[x1]
#     out_ptr0[x3] = 0 if 0 > y else y
#     out_ptr1[x3] = out_ptr0 value <= 0.0
#   in_ptr0  : fp16 input, modelled by get_args() as (128, 192, 56, 56).
#   in_ptr1  : fp32, value subtracted per channel (presumably the mean).
#   in_ptr2  : fp32, per-channel quantity divided by 401408.0 before adding
#              eps (presumably the undivided sum of squared deviations).
#   in_ptr3 / in_ptr4 : fp32 per-channel scale and shift (presumably the
#              batch-norm weight and bias -- TODO confirm at the call site).
#   out_ptr0 : fp16 result ('*fp16' in the signature).
#   out_ptr1 : boolean mask ('*i1' in the signature).
#   xnumel   : overwritten inside the kernel with 77070336.
# The outer `tl.where(0 != 0, ...)` condition is constant-false, so only the
# inner select runs; a NaN in y passes through because `0 > NaN` is false.
# `xmask` is computed but unused: every load and store passes `None` as mask.
# The trailing `__main__` section is a standalone benchmark of call().
triton_poi_fused__native_batch_norm_legit_functional_relu_threshold_backward_27 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[134217728], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp16', 6: '*i1', 7: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7), equal_to_1=())]})
@triton.jit
def triton_poi_fused__native_batch_norm_legit_functional_relu_threshold_backward_27(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, out_ptr0, out_ptr1, xnumel, XBLOCK : tl.constexpr):
    xnumel = 77070336
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x3 = xindex
    x1 = (xindex // 3136) % 192
    tmp0 = tl.load(in_ptr0 + (x3), None).to(tl.float32)
    tmp2 = tl.load(in_ptr1 + (x1), None)
    tmp4 = tl.load(in_ptr2 + (x1), None)
    tmp11 = tl.load(in_ptr3 + (x1), None)
    tmp13 = tl.load(in_ptr4 + (x1), None)
    tmp1 = tmp0.to(tl.float32)
    tmp3 = tmp1 - tmp2
    tmp5 = 401408.0
    tmp6 = tmp4 / tmp5
    tmp7 = 1e-05
    tmp8 = tmp6 + tmp7
    tmp9 = tl.math.rsqrt(tmp8)
    tmp10 = tmp3 * tmp9
    tmp12 = tmp10 * tmp11
    tmp14 = tmp12 + tmp13
    tmp15 = tmp14.to(tl.float32)
    tmp16 = tl.where(0 != 0, 0, tl.where(0 > tmp15, 0, tmp15))
    tmp17 = 0.0
    tmp18 = tmp16 <= tmp17
    tl.store(out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp16, None)
    tl.store(out_ptr1 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp18, None)


def get_args():
    arg_0 = rand_strided((128, 192, 56, 56), (602112, 3136, 56, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((1, 192, 1, 1), (192, 1, 192, 192), device='cuda:0', dtype=torch.float32)
    arg_2 = rand_strided((1, 192, 1, 1), (192, 1, 192, 192), device='cuda:0', dtype=torch.float32)
    arg_3 = rand_strided((192,), (1,), device='cuda:0', dtype=torch.float32)
    arg_4 = rand_strided((192,), (1,), device='cuda:0', dtype=torch.float32)
    arg_5 = rand_strided((128, 192, 56, 56), (602112, 3136, 56, 1), device='cuda:0', dtype=torch.float16)
    arg_6 = rand_strided((128, 192, 56, 56), (602112, 3136, 56, 1), device='cuda:0', dtype=torch.bool)
    return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__native_batch_norm_legit_functional_relu_threshold_backward_27.run(*args, 77070336, grid=grid(77070336), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__native_batch_norm_legit_functional_relu_threshold_backward_27.benchmark_all_configs(*args, 77070336, grid=grid(77070336))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/yr/cyrrsvkikbwrkovp7p3kp54xrx3kv5k3cuej452rtgigaetgnkzf.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_19
#
# Pointwise copy kernel over 1920 elements (== 20 * 96).
#   in_ptr0  : source, '*fp32' in the signature; get_args() models it as
#              (20, 96, 1, 1) with strides (96, 1, 1, 1).
#   out_ptr0 : destination, '*fp16' in the signature, same shape and strides.
#   xnumel   : overwritten inside the kernel with 1920.
# Loads and stores are guarded by `xmask` because 1920 need not be a multiple
# of XBLOCK.  The `.to(tl.float32)` on the loaded value is a same-type cast;
# the narrowing to fp16 presumably happens when tl.store writes through the
# fp16 pointer -- TODO confirm against the Triton store semantics.
# The trailing `__main__` section is a standalone benchmark of call().
triton_poi_fused__to_copy_28 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[2048], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_28(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 1920
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)


def get_args():
    arg_0 = rand_strided((20, 96, 1, 1), (96, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((20, 96, 1, 1), (96, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_28.run(*args, 1920, grid=grid(1920), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_28.benchmark_all_configs(*args, 1920, grid=grid(1920))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/lx/clxgqhjnos6ucboracnh3kmrmn5mwuc3sgkffihrftfhuwl64wg2.py
# Original ATen: aten.cat
# aten.cat => cat_2
#
# Pointwise strided-copy kernel over 8028160 elements (== 128 * 62720).
#   in_ptr0  : fp16 source read contiguously at flat index x2; get_args()
#              models it as (128, 20, 56, 56) with strides
#              (62720, 3136, 56, 1).
#   out_ptr0 : fp16 destination written at x0 + 125440*x1, where
#              x0 = x2 % 62720 and x1 = x2 // 62720; get_args() models it as
#              (128, 20, 56, 56) with strides (125440, 3136, 56, 1).
#   xnumel   : overwritten inside the kernel with 8028160.
# The destination's outer stride is twice the source's (125440 == 2 * 62720),
# so it is presumably a 20-channel slice of a 40-channel concatenation buffer
# -- TODO confirm at the call site.
# `xmask` is computed but unused: the load and the store pass `None` as mask.
# The trailing `__main__` section is a standalone benchmark of call().
triton_poi_fused_cat_29 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[8388608], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_cat_29(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 8028160
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x2 = xindex
    x0 = xindex % 62720
    x1 = (xindex // 62720)
    tmp0 = tl.load(in_ptr0 + (x2), None).to(tl.float32)
    tl.store(out_ptr0 + (x0 + (125440*x1) + tl.zeros([XBLOCK], tl.int32)), tmp0, None)


def get_args():
    arg_0 = rand_strided((128, 20, 56, 56), (62720, 3136, 56, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 20, 56, 56), (125440, 3136, 56, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused_cat_29.run(*args, 8028160, grid=grid(8028160), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused_cat_29.benchmark_all_configs(*args, 8028160, grid=grid(8028160))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/e3/ce3edr4giy7ygyubb445dsh6wavenlh7yhdxr5odchwzb5blm45m.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => convert_element_type_21, var_mean_5
#
# Looped reduction kernel: partial sums of an fp16 input, accumulated in fp32,
# with the reduced range split into 13 chunks of 30878 positions.
#   in_ptr0  : fp16 input; get_args() models it as (128, 40, 56, 56) with
#              strides (125440, 3136, 56, 1).
#   out_ptr0 : fp32 output holding 520 (== 40 * 13) partial sums; get_args()
#              models it as (1, 40, 1, 1, 13) with strides
#              (520, 1, 520, 520, 40).
#   xnumel / rnumel : overwritten inside the kernel with 520 and 30878.
# For output x = x0 + 40*x1 (x0 = x % 40, x1 = x // 40) the flat position
# p = r + 30878*x1 is mapped to the offset
# 3136*x0 + 125440*((p // 3136) % 128) + p % 3136.
# Positions with p >= 401408 are masked out and contribute 0: 13 * 30878 is
# 401414, six more than the 401408 (== 128 * 3136) positions that exist, so
# the last chunk is short.
# The trailing `__main__` section is a standalone benchmark of call().
triton_red_fused__native_batch_norm_legit_functional_30 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@reduction(
    size_hints=[1024, 32768],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32', 3: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_30(in_ptr0, out_ptr0, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
    xnumel = 520
    rnumel = 30878
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rbase = tl.arange(0, RBLOCK)[None, :]
    x1 = (xindex // 40)
    x0 = xindex % 40
    _tmp6 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
    x3 = xindex
    for roffset in range(0, rnumel, RBLOCK):
        rindex = roffset + rbase
        rmask = rindex < rnumel
        r2 = rindex
        tmp0 = r2 + (30878*x1)
        tmp1 = 401408
        tmp2 = tmp0 < tmp1
        tmp3 = tl.load(in_ptr0 + ((3136*x0) + (125440*(((r2 + (30878*x1)) // 3136) % 128)) + ((r2 + (30878*x1)) % 3136) + tl.zeros([XBLOCK, RBLOCK], tl.int32)), rmask & tmp2 & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp4 = tmp3.to(tl.float32)
        tmp5 = tl.where(tmp2, tmp4, 0)
        _tmp6 = tl.where(rmask & xmask, _tmp6 + tmp5, _tmp6)
    tmp6 = tl.sum(_tmp6, 1)[:, None]
    tl.store(out_ptr0 + x3, tmp6, xmask)


def get_args():
    arg_0 = rand_strided((128, 40, 56, 56), (125440, 3136, 56, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((1, 40, 1, 1, 13), (520, 1, 520, 520, 40), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_red_fused__native_batch_norm_legit_functional_30.run(*args, 520, 30878, grid=grid(520), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_red_fused__native_batch_norm_legit_functional_30.benchmark_all_configs(*args, 520, 30878, grid=grid(520))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/5l/c5ldagjb25qlt4w6hnm2s5f7x7ddcw2evdosdg3pvhtpoduxelhs.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => add_28, convert_element_type_21, mul_36, mul_37, var_mean_5
#
# Persistent reduction kernel (RBLOCK fixed at 16, of which rnumel = 13 lanes
# are live): for each of 40 positions x0 it sums the 13 fp32 values
# in_ptr0[x0 + 40*r] and divides by 401408.0.
#   in_out_ptr0 : fp32, 40 elements; receives sum / 401408.0.  It is listed in
#                 'mutated_arg_names' and is only written here, never read.
#   in_ptr0     : fp32 partial sums; get_args() models it as (1, 40, 1, 1, 13)
#                 with strides (520, 1, 520, 520, 40).
#   in_ptr1     : fp32, 40 elements; the previous value blended into out_ptr0.
#   out_ptr0    : fp32, 40 elements; receives 0.1 * mean + 0.9 * in_ptr1[x0].
#   xnumel / rnumel : overwritten inside the kernel with 40 and 13.
# `rmask` zeroes the three padding lanes (r = 13..15) in both the load and the
# tl.where before the sum.  401408 equals 128 * 56 * 56.  The 0.1 / 0.9 blend
# looks like the batch-norm running-mean update with momentum 0.1 -- TODO
# confirm against the caller.
# The trailing `__main__` section is a standalone benchmark of call().
triton_per_fused__native_batch_norm_legit_functional_31 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import persistent_reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@persistent_reduction(
    size_hints=[64, 16],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*fp32', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: 'i32', 5: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3), equal_to_1=())]}
)
@triton.jit
def triton_per_fused__native_batch_norm_legit_functional_31(in_out_ptr0, in_ptr0, in_ptr1, out_ptr0, xnumel, rnumel, XBLOCK : tl.constexpr):
    xnumel = 40
    rnumel = 13
    RBLOCK: tl.constexpr = 16
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rindex = tl.arange(0, RBLOCK)[None, :]
    rmask = rindex < rnumel
    r1 = rindex
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0 + (40*r1)), rmask & xmask, other=0)
    tmp8 = tl.load(in_ptr1 + (x0), xmask)
    tmp2 = tl.where(rmask & xmask, tmp0, 0)
    tmp3 = tl.sum(tmp2, 1)[:, None]
    tmp4 = 401408.0
    tmp5 = tmp3 / tmp4
    tmp6 = 0.1
    tmp7 = tmp5 * tmp6
    tmp9 = 0.9
    tmp10 = tmp8 * tmp9
    tmp11 = tmp7 + tmp10
    tl.store(in_out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp5, xmask)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp11, xmask)


def get_args():
    arg_0 = rand_strided((1, 40, 1, 1), (40, 1, 40, 40), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((1, 40, 1, 1, 13), (520, 1, 520, 520, 40), device='cuda:0', dtype=torch.float32)
    arg_2 = rand_strided((40,), (1,), device='cuda:0', dtype=torch.float32)
    arg_3 = rand_strided((40,), (1,), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1, arg_2, arg_3,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_per_fused__native_batch_norm_legit_functional_31.run(*args, 40, 13, grid=grid(40), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_per_fused__native_batch_norm_legit_functional_31.benchmark_all_configs(*args, 40, 13, grid=grid(40))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/ko/ckomyuegfrpkii72qwrquhfy6eh53az7jeqam3ghl4ajcmcw2h3l.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => convert_element_type_21, var_mean_5
#
# Looped reduction kernel: partial sums of squared deviations, accumulated in
# fp32, with the reduced range split into 13 chunks of 30878 positions.
#   in_ptr0  : fp16 input; get_args() models it as (128, 40, 56, 56) with
#              strides (125440, 3136, 56, 1).
#   in_ptr1  : fp32 value subtracted from every input element, indexed by
#              x0 = x % 40; get_args() models it as (1, 40, 1, 1).
#              Presumably the per-channel mean -- TODO confirm at the call site.
#   out_ptr0 : fp32 output holding 520 (== 40 * 13) partial sums; get_args()
#              models it as (1, 40, 1, 1, 13) with strides
#              (520, 1, 520, 520, 40).
#   xnumel / rnumel : overwritten inside the kernel with 520 and 30878.
# For output x = x0 + 40*x1 the flat position p = r + 30878*x1 is mapped to the
# offset 3136*x0 + 125440*((p // 3136) % 128) + p % 3136, and the kernel
# accumulates (in_ptr0[offset] - in_ptr1[x0]) ** 2.
# Positions with p >= 401408 are masked out and contribute 0 (13 * 30878 is
# 401414, six more than 401408).  Unlike the sibling sum kernel, in_ptr1 is
# re-loaded inside the loop under the same mask as in_ptr0.
# The trailing `__main__` section is a standalone benchmark of call().
triton_red_fused__native_batch_norm_legit_functional_32 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@reduction(
    size_hints=[1024, 32768],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*fp16', 1: '*fp32', 2: '*fp32', 3: 'i32', 4: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_32(in_ptr0, in_ptr1, out_ptr0, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
    xnumel = 520
    rnumel = 30878
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rbase = tl.arange(0, RBLOCK)[None, :]
    x1 = (xindex // 40)
    x0 = xindex % 40
    _tmp9 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
    x3 = xindex
    for roffset in range(0, rnumel, RBLOCK):
        rindex = roffset + rbase
        rmask = rindex < rnumel
        r2 = rindex
        tmp0 = r2 + (30878*x1)
        tmp1 = 401408
        tmp2 = tmp0 < tmp1
        tmp3 = tl.load(in_ptr0 + ((3136*x0) + (125440*(((r2 + (30878*x1)) // 3136) % 128)) + ((r2 + (30878*x1)) % 3136) + tl.zeros([XBLOCK, RBLOCK], tl.int32)), rmask & tmp2 & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp4 = tmp3.to(tl.float32)
        tmp5 = tl.load(in_ptr1 + (x0 + tl.zeros([XBLOCK, RBLOCK], tl.int32)), rmask & tmp2 & xmask, eviction_policy='evict_last', other=0)
        tmp6 = tmp4 - tmp5
        tmp7 = tmp6 * tmp6
        tmp8 = tl.where(tmp2, tmp7, 0)
        _tmp9 = tl.where(rmask & xmask, _tmp9 + tmp8, _tmp9)
    tmp9 = tl.sum(_tmp9, 1)[:, None]
    tl.store(out_ptr0 + x3, tmp9, xmask)


def get_args():
    arg_0 = rand_strided((128, 40, 56, 56), (125440, 3136, 56, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((1, 40, 1, 1), (40, 1, 40, 40), device='cuda:0', dtype=torch.float32)
    arg_2 = rand_strided((1, 40, 1, 1, 13), (520, 1, 520, 520, 40), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1, arg_2,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_red_fused__native_batch_norm_legit_functional_32.run(*args, 520, 30878, grid=grid(520), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_red_fused__native_batch_norm_legit_functional_32.benchmark_all_configs(*args, 520, 30878, grid=grid(520))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/5p/c5pcmnlklk6a6ee2vm3ta2iounjyiftuynwfzbts7sayeuto4jvn.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => add_27, add_29, convert_element_type_21, mul_38, mul_39, mul_40, rsqrt_5, squeeze_16, var_mean_5
#
# Persistent reduction kernel (RBLOCK fixed at 16, of which rnumel = 13 lanes
# are live): for each of 40 positions x0 it sums the 13 fp32 values
# in_ptr0[x0 + 40*r] into `total` and derives three outputs from it.
#   in_ptr0  : fp32 partial sums; get_args() models it as (1, 40, 1, 1, 13)
#              with strides (520, 1, 520, 520, 40).
#   in_ptr1  : fp32, 40 elements; the previous value blended into out_ptr2.
#   out_ptr0 : receives `total` itself (the undivided sum).
#   out_ptr1 : receives rsqrt(total / 401408.0 + 1e-05).
#   out_ptr2 : receives
#              0.1 * (total / 401408.0 * 1.0000024912370735) + 0.9 * in_ptr1[x0].
#   xnumel / rnumel : overwritten inside the kernel with 40 and 13.
# `rmask` zeroes the three padding lanes (r = 13..15).  1.0000024912370735 is
# numerically 401408 / 401407, so it looks like the N / (N - 1) correction
# applied before a running-variance update with momentum 0.1 -- TODO confirm
# against the caller.
# The trailing `__main__` section is a standalone benchmark of call().
triton_per_fused__native_batch_norm_legit_functional_33 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import persistent_reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@persistent_reduction(
    size_hints=[64, 16],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*fp32', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: 'i32', 6: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4), equal_to_1=())]}
)
@triton.jit
def triton_per_fused__native_batch_norm_legit_functional_33(in_ptr0, in_ptr1, out_ptr0, out_ptr1, out_ptr2, xnumel, rnumel, XBLOCK : tl.constexpr):
    xnumel = 40
    rnumel = 13
    RBLOCK: tl.constexpr = 16
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rindex = tl.arange(0, RBLOCK)[None, :]
    rmask = rindex < rnumel
    r1 = rindex
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0 + (40*r1)), rmask & xmask, other=0)
    tmp13 = tl.load(in_ptr1 + (x0), xmask)
    tmp2 = tl.where(rmask & xmask, tmp0, 0)
    tmp3 = tl.sum(tmp2, 1)[:, None]
    tmp4 = 401408.0
    tmp5 = tmp3 / tmp4
    tmp6 = 1e-05
    tmp7 = tmp5 + tmp6
    tmp8 = tl.math.rsqrt(tmp7)
    tmp9 = 1.0000024912370735
    tmp10 = tmp5 * tmp9
    tmp11 = 0.1
    tmp12 = tmp10 * tmp11
    tmp14 = 0.9
    tmp15 = tmp13 * tmp14
    tmp16 = tmp12 + tmp15
    tl.store(out_ptr1 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp8, xmask)
    tl.store(out_ptr2 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp16, xmask)
    tl.store(out_ptr0 + x0, tmp3, xmask)


def get_args():
    arg_0 = rand_strided((1, 40, 1, 1, 13), (520, 1, 520, 520, 40), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((40,), (1,), device='cuda:0', dtype=torch.float32)
    arg_2 = rand_strided((1, 40, 1, 1), (40, 1, 40, 40), device='cuda:0', dtype=torch.float32)
    arg_3 = rand_strided((40,), (1,), device='cuda:0', dtype=torch.float32)
    arg_4 = rand_strided((40,), (1,), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1, arg_2, arg_3, arg_4,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_per_fused__native_batch_norm_legit_functional_33.run(*args, 40, 13, grid=grid(40), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_per_fused__native_batch_norm_legit_functional_33.benchmark_all_configs(*args, 40, 13, grid=grid(40))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/pf/cpfbdly4ax5zztymk42pnezqf7ys2im4sb437kjrqjpf4x2jbotn.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => add_27, add_30, convert_element_type_21, convert_element_type_22, mul_35, mul_41, rsqrt_5, sub_5, var_mean_5
# Pointwise batch-norm normalisation kernel (Inductor-generated Triton module).
# For every flat element index x3 it stores
#     out_ptr0[x3] = (in_ptr0[x3] - in_ptr1[c]) * rsqrt(in_ptr2[c] / 401408 + 1e-05)
#                    * in_ptr3[c] + in_ptr4[c]
# with c = (x3 // 3136) % 40.  3136 = 56 * 56 and 401408 = 128 * 56 * 56 match the
# (128, 40, 56, 56) benchmark tensors created by get_args().
# NOTE(review): in_ptr1..in_ptr4 look like per-channel mean, sum of squared
# deviations, weight and bias -- confirm against the call site.
# The embedded source is indented here because an un-indented `def` body is not
# valid Python and cannot be loaded by async_compile.triton.
triton_poi_fused__native_batch_norm_legit_functional_34 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[16777216], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp16', 6: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6), equal_to_1=())]})
@triton.jit
def triton_poi_fused__native_batch_norm_legit_functional_34(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    # The element count is hard-coded; it overrides the runtime argument.
    xnumel = 16056320
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    # xmask is computed but unused: every load/store below passes mask=None.
    xmask = xindex < xnumel
    x3 = xindex
    # Index into the 40-entry per-channel vectors.
    x1 = (xindex // 3136) % 40
    tmp0 = tl.load(in_ptr0 + (x3), None).to(tl.float32)
    tmp2 = tl.load(in_ptr1 + (x1), None)
    tmp4 = tl.load(in_ptr2 + (x1), None)
    tmp11 = tl.load(in_ptr3 + (x1), None)
    tmp13 = tl.load(in_ptr4 + (x1), None)
    tmp1 = tmp0.to(tl.float32)
    tmp3 = tmp1 - tmp2
    tmp5 = 401408.0
    tmp6 = tmp4 / tmp5
    tmp7 = 1e-05
    tmp8 = tmp6 + tmp7
    tmp9 = tl.math.rsqrt(tmp8)
    tmp10 = tmp3 * tmp9
    tmp12 = tmp10 * tmp11
    tmp14 = tmp12 + tmp13
    tmp15 = tmp14.to(tl.float32)
    tl.store(out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp15, None)


def get_args():
    arg_0 = rand_strided((128, 40, 56, 56), (125440, 3136, 56, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((1, 40, 1, 1), (40, 1, 40, 40), device='cuda:0', dtype=torch.float32)
    arg_2 = rand_strided((1, 40, 1, 1), (40, 1, 40, 40), device='cuda:0', dtype=torch.float32)
    arg_3 = rand_strided((40,), (1,), device='cuda:0', dtype=torch.float32)
    arg_4 = rand_strided((40,), (1,), device='cuda:0', dtype=torch.float32)
    arg_5 = rand_strided((128, 40, 56, 56), (125440, 3136, 56, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__native_batch_norm_legit_functional_34.run(*args, 16056320, grid=grid(16056320), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__native_batch_norm_legit_functional_34.benchmark_all_configs(*args, 16056320, grid=grid(16056320))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/jt/cjtdu5nq2b37l5fyi5c7vdk3m4aqhqeqbsw3e5ifo4u74lo5bgaq.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_23
# Element-wise copy kernel generated for aten._to_copy: reads 1200 values from
# in_ptr0 (declared '*fp32' in the signature) and stores them through out_ptr0
# (declared '*fp16').  The explicit `.to(tl.float32)` is a no-op on fp32 data;
# the narrowing relies on tl.store casting the value to the pointer's element
# type.  get_args() benchmarks it on (60, 20, 1, 1) tensors.
# The embedded source is indented here because an un-indented `def` body is not
# valid Python and cannot be loaded by async_compile.triton.
triton_poi_fused__to_copy_35 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[2048], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_35(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    # The element count is hard-coded; it overrides the runtime argument.
    xnumel = 1200
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    # Lanes at or beyond xnumel are masked out of the load and the store.
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)


def get_args():
    arg_0 = rand_strided((60, 20, 1, 1), (20, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((60, 20, 1, 1), (20, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_35.run(*args, 1200, grid=grid(1200), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_35.benchmark_all_configs(*args, 1200, grid=grid(1200))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/wj/cwjg3r37ehsowju7qerrsqlothtrltxyyuq5mhnulbp7zor3ocw5.py
# Original ATen: aten.cat
# aten.cat => cat_3
# Copy kernel generated for aten.cat: reads 24084480 values at consecutive
# offsets of in_ptr0 and writes element `i` to out_ptr0[x0 + 376320 * x1], where
# x0 = i % 188160 and x1 = i // 188160.  In get_args() the source is a
# contiguous (128, 60, 56, 56) tensor (188160 = 60 * 56 * 56 elements per
# sample) and the destination uses a batch stride of 376320, twice the source's.
# NOTE(review): the destination is presumably a 60-channel slice of a
# 120-channel concatenation buffer -- confirm against the call site.
# The embedded source is indented here because an un-indented `def` body is not
# valid Python and cannot be loaded by async_compile.triton.
triton_poi_fused_cat_36 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[33554432], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_cat_36(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    # The element count is hard-coded; it overrides the runtime argument.
    xnumel = 24084480
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    # xmask is computed but unused: the load/store below pass mask=None.
    xmask = xindex < xnumel
    x2 = xindex
    # Offset inside one sample and the sample index, respectively.
    x0 = xindex % 188160
    x1 = (xindex // 188160)
    tmp0 = tl.load(in_ptr0 + (x2), None).to(tl.float32)
    tl.store(out_ptr0 + (x0 + (376320*x1) + tl.zeros([XBLOCK], tl.int32)), tmp0, None)


def get_args():
    arg_0 = rand_strided((128, 60, 56, 56), (188160, 3136, 56, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 60, 56, 56), (376320, 3136, 56, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused_cat_36.run(*args, 24084480, grid=grid(24084480), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused_cat_36.benchmark_all_configs(*args, 24084480, grid=grid(24084480))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/2c/c2cmgfhoarotsw2klaplncsqdxdxrbc3je4bm4idg4672cm2hbzh.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => convert_element_type_25, var_mean_6
# Looped reduction kernel: each of the 480 output rows accumulates 100352 input
# values (loaded as fp16 per the signature, summed in fp32) and stores the sum
# to out_ptr0[row].  The row index splits into x0 = row % 120 and
# x1 = row // 120; the reduction index r2 addresses
#     in_ptr0[3136*x0 + 376320*(r2 // 3136) + 12042240*x1 + r2 % 3136].
# With the (128, 120, 56, 56) benchmark input (strides 376320, 3136, 56, 1) this
# means x0 is the channel, r2 // 3136 walks 32 batch entries (100352 / 3136) and
# x1 picks one of 4 batch chunks (12042240 = 32 * 376320), i.e. the kernel
# produces 4 partial sums per channel.
# The embedded source is indented here because un-indented `def`/`for` bodies
# are not valid Python and cannot be loaded by async_compile.triton.
triton_red_fused__native_batch_norm_legit_functional_37 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@reduction(
    size_hints=[512, 131072],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32', 3: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_37(in_ptr0, out_ptr0, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
    # Both extents are hard-coded; they override the runtime arguments.
    xnumel = 480
    rnumel = 100352
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rbase = tl.arange(0, RBLOCK)[None, :]
    x0 = xindex % 120
    x1 = (xindex // 120)
    # Per-lane accumulator of shape [XBLOCK, RBLOCK]; collapsed after the loop.
    _tmp2 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
    x3 = xindex
    for roffset in range(0, rnumel, RBLOCK):
        rindex = roffset + rbase
        rmask = rindex < rnumel
        r2 = rindex
        tmp0 = tl.load(in_ptr0 + ((3136*x0) + (376320*(r2 // 3136)) + (12042240*x1) + (r2 % 3136)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp1 = tmp0.to(tl.float32)
        # Only in-range lanes contribute to the running sum.
        _tmp2 = tl.where(rmask & xmask, _tmp2 + tmp1, _tmp2)
    tmp2 = tl.sum(_tmp2, 1)[:, None]
    tl.store(out_ptr0 + x3, tmp2, xmask)


def get_args():
    arg_0 = rand_strided((128, 120, 56, 56), (376320, 3136, 56, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((1, 120, 1, 1, 4), (480, 1, 480, 480, 120), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_red_fused__native_batch_norm_legit_functional_37.run(*args, 480, 100352, grid=grid(480), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_red_fused__native_batch_norm_legit_functional_37.benchmark_all_configs(*args, 480, 100352, grid=grid(480))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/xq/cxq23fscob3jbjdcsao7sicyvado5nsvykscbw7syq3dh73swjgu.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => add_33, convert_element_type_25, mul_43, mul_44, var_mean_6
# Persistent reduction kernel: for each of 120 rows x0 it sums the 4 values
# in_ptr0[x0 + 120*r] (r = 0..3), divides by 401408 (= 128 * 56 * 56) and
# stores the result to in_out_ptr0[x0].  It also stores
#     out_ptr0[x0] = 0.1 * (sum / 401408) + 0.9 * in_ptr1[x0].
# in_out_ptr0 is only written by this kernel, never read.
# NOTE(review): the second output looks like a running-mean update with
# momentum 0.1 -- confirm against the call site.
# The embedded source is indented here because an un-indented `def` body is not
# valid Python and cannot be loaded by async_compile.triton.
triton_per_fused__native_batch_norm_legit_functional_38 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import persistent_reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@persistent_reduction(
    size_hints=[128, 4],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*fp32', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: 'i32', 5: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3), equal_to_1=())]}
)
@triton.jit
def triton_per_fused__native_batch_norm_legit_functional_38(in_out_ptr0, in_ptr0, in_ptr1, out_ptr0, xnumel, rnumel, XBLOCK : tl.constexpr):
    # Both extents are hard-coded; they override the runtime arguments.
    xnumel = 120
    rnumel = 4
    # The whole reduction axis fits in one block, so no loop is needed.
    RBLOCK: tl.constexpr = 4
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rindex = tl.arange(0, RBLOCK)[None, :]
    rmask = rindex < rnumel
    r1 = rindex
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0 + (120*r1)), rmask & xmask, other=0)
    tmp8 = tl.load(in_ptr1 + (x0), xmask)
    tmp2 = tl.where(rmask & xmask, tmp0, 0)
    tmp3 = tl.sum(tmp2, 1)[:, None]
    tmp4 = 401408.0
    tmp5 = tmp3 / tmp4
    tmp6 = 0.1
    tmp7 = tmp5 * tmp6
    tmp9 = 0.9
    tmp10 = tmp8 * tmp9
    tmp11 = tmp7 + tmp10
    tl.store(in_out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp5, xmask)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp11, xmask)


def get_args():
    arg_0 = rand_strided((1, 120, 1, 1), (120, 1, 120, 120), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((1, 120, 1, 1, 4), (480, 1, 480, 480, 120), device='cuda:0', dtype=torch.float32)
    arg_2 = rand_strided((120,), (1,), device='cuda:0', dtype=torch.float32)
    arg_3 = rand_strided((120,), (1,), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1, arg_2, arg_3,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_per_fused__native_batch_norm_legit_functional_38.run(*args, 120, 4, grid=grid(120), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_per_fused__native_batch_norm_legit_functional_38.benchmark_all_configs(*args, 120, 4, grid=grid(120))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/wg/cwgir4s67j2t35f66rtqguw4skzrnzaazh3rvukzvjln6bwfquwz.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => convert_element_type_25, var_mean_6
# Looped reduction kernel: each of the 480 output rows accumulates, over 100352
# input values, the squared difference (in_ptr0[...] - in_ptr1[x0]) ** 2 and
# stores the sum to out_ptr0[row].  The row index splits into x0 = row % 120
# and x1 = row // 120; the reduction index r2 addresses
#     in_ptr0[3136*x0 + 376320*(r2 // 3136) + 12042240*x1 + r2 % 3136].
# With the (128, 120, 56, 56) benchmark input this yields 4 partial sums per
# channel (x1 selects a chunk of 32 batch entries; 12042240 = 32 * 376320).
# NOTE(review): in_ptr1 is presumably the per-channel mean -- confirm against
# the call site.
# The embedded source is indented here because un-indented `def`/`for` bodies
# are not valid Python and cannot be loaded by async_compile.triton.
triton_red_fused__native_batch_norm_legit_functional_39 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@reduction(
    size_hints=[512, 131072],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*fp16', 1: '*fp32', 2: '*fp32', 3: 'i32', 4: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_39(in_ptr0, in_ptr1, out_ptr0, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
    # Both extents are hard-coded; they override the runtime arguments.
    xnumel = 480
    rnumel = 100352
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rbase = tl.arange(0, RBLOCK)[None, :]
    x0 = xindex % 120
    x1 = (xindex // 120)
    # Loaded once per row, outside the reduction loop.
    tmp2 = tl.load(in_ptr1 + (x0), xmask)
    # Per-lane accumulator of shape [XBLOCK, RBLOCK]; collapsed after the loop.
    _tmp5 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
    x3 = xindex
    for roffset in range(0, rnumel, RBLOCK):
        rindex = roffset + rbase
        rmask = rindex < rnumel
        r2 = rindex
        tmp0 = tl.load(in_ptr0 + ((3136*x0) + (376320*(r2 // 3136)) + (12042240*x1) + (r2 % 3136)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp1 = tmp0.to(tl.float32)
        tmp3 = tmp1 - tmp2
        tmp4 = tmp3 * tmp3
        # Only in-range lanes contribute to the running sum.
        _tmp5 = tl.where(rmask & xmask, _tmp5 + tmp4, _tmp5)
    tmp5 = tl.sum(_tmp5, 1)[:, None]
    tl.store(out_ptr0 + x3, tmp5, xmask)


def get_args():
    arg_0 = rand_strided((128, 120, 56, 56), (376320, 3136, 56, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((1, 120, 1, 1), (120, 1, 120, 120), device='cuda:0', dtype=torch.float32)
    arg_2 = rand_strided((1, 120, 1, 1, 4), (480, 1, 480, 480, 120), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1, arg_2,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_red_fused__native_batch_norm_legit_functional_39.run(*args, 480, 100352, grid=grid(480), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_red_fused__native_batch_norm_legit_functional_39.benchmark_all_configs(*args, 480, 100352, grid=grid(480))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/bd/cbdjjrt2eltvde4ay5q7ftrif7dzdrcgxsonoef3wv6a6n446fg5.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => add_32, add_34, convert_element_type_25, mul_45, mul_46, mul_47, rsqrt_6, squeeze_19, var_mean_6
# Persistent reduction kernel: for each of 120 rows x0 it sums the 4 values
# in_ptr0[x0 + 120*r] (r = 0..3) into S and stores three results:
#     out_ptr0[x0] = S
#     out_ptr1[x0] = rsqrt(S / 401408 + 1e-05)
#     out_ptr2[x0] = 0.1 * (S / 401408) * 1.0000024912370735 + 0.9 * in_ptr1[x0]
# 401408 = 128 * 56 * 56, and 1.0000024912370735 is approximately
# 401408 / 401407.
# NOTE(review): this reads as variance -> inverse std plus a running-variance
# update (unbiased correction, momentum 0.1) -- confirm against the call site.
# The embedded source is indented here because an un-indented `def` body is not
# valid Python and cannot be loaded by async_compile.triton.
triton_per_fused__native_batch_norm_legit_functional_40 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import persistent_reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@persistent_reduction(
    size_hints=[128, 4],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*fp32', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: 'i32', 6: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4), equal_to_1=())]}
)
@triton.jit
def triton_per_fused__native_batch_norm_legit_functional_40(in_ptr0, in_ptr1, out_ptr0, out_ptr1, out_ptr2, xnumel, rnumel, XBLOCK : tl.constexpr):
    # Both extents are hard-coded; they override the runtime arguments.
    xnumel = 120
    rnumel = 4
    # The whole reduction axis fits in one block, so no loop is needed.
    RBLOCK: tl.constexpr = 4
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rindex = tl.arange(0, RBLOCK)[None, :]
    rmask = rindex < rnumel
    r1 = rindex
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0 + (120*r1)), rmask & xmask, other=0)
    tmp13 = tl.load(in_ptr1 + (x0), xmask)
    tmp2 = tl.where(rmask & xmask, tmp0, 0)
    tmp3 = tl.sum(tmp2, 1)[:, None]
    tmp4 = 401408.0
    tmp5 = tmp3 / tmp4
    tmp6 = 1e-05
    tmp7 = tmp5 + tmp6
    tmp8 = tl.math.rsqrt(tmp7)
    tmp9 = 1.0000024912370735
    tmp10 = tmp5 * tmp9
    tmp11 = 0.1
    tmp12 = tmp10 * tmp11
    tmp14 = 0.9
    tmp15 = tmp13 * tmp14
    tmp16 = tmp12 + tmp15
    tl.store(out_ptr1 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp8, xmask)
    tl.store(out_ptr2 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp16, xmask)
    tl.store(out_ptr0 + x0, tmp3, xmask)


def get_args():
    arg_0 = rand_strided((1, 120, 1, 1, 4), (480, 1, 480, 480, 120), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((120,), (1,), device='cuda:0', dtype=torch.float32)
    arg_2 = rand_strided((1, 120, 1, 1), (120, 1, 120, 120), device='cuda:0', dtype=torch.float32)
    arg_3 = rand_strided((120,), (1,), device='cuda:0', dtype=torch.float32)
    arg_4 = rand_strided((120,), (1,), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1, arg_2, arg_3, arg_4,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_per_fused__native_batch_norm_legit_functional_40.run(*args, 120, 4, grid=grid(120), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_per_fused__native_batch_norm_legit_functional_40.benchmark_all_configs(*args, 120, 4, grid=grid(120))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/ff/cffbiktpj5b36houxciovjfe3g7ji5w5ibbvgjpjdr6m6dxnqiod.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.relu
# aten._native_batch_norm_legit_functional => add_32, add_35, convert_element_type_25, convert_element_type_26, mul_42, mul_48, rsqrt_6, sub_6, var_mean_6
# aten.relu => relu_4
# Pointwise batch-norm normalisation fused with ReLU.  For every flat element
# index x3 it computes
#     y = (in_ptr0[x3] - in_ptr1[c]) * rsqrt(in_ptr2[c] / 401408 + 1e-05)
#         * in_ptr3[c] + in_ptr4[c]
# with c = (x3 // 3136) % 120, then stores `0 if 0 > y else y` to out_ptr0[x3].
# 3136 = 56 * 56 and 401408 = 128 * 56 * 56 match the (128, 120, 56, 56)
# benchmark tensors created by get_args().
# NOTE(review): in_ptr1..in_ptr4 look like per-channel mean, sum of squared
# deviations, weight and bias -- confirm against the call site.
# The embedded source is indented here because an un-indented `def` body is not
# valid Python and cannot be loaded by async_compile.triton.
triton_poi_fused__native_batch_norm_legit_functional_relu_41 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[67108864], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp16', 6: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6), equal_to_1=())]})
@triton.jit
def triton_poi_fused__native_batch_norm_legit_functional_relu_41(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    # The element count is hard-coded; it overrides the runtime argument.
    xnumel = 48168960
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    # xmask is computed but unused: every load/store below passes mask=None.
    xmask = xindex < xnumel
    x3 = xindex
    # Index into the 120-entry per-channel vectors.
    x1 = (xindex // 3136) % 120
    tmp0 = tl.load(in_ptr0 + (x3), None).to(tl.float32)
    tmp2 = tl.load(in_ptr1 + (x1), None)
    tmp4 = tl.load(in_ptr2 + (x1), None)
    tmp11 = tl.load(in_ptr3 + (x1), None)
    tmp13 = tl.load(in_ptr4 + (x1), None)
    tmp1 = tmp0.to(tl.float32)
    tmp3 = tmp1 - tmp2
    tmp5 = 401408.0
    tmp6 = tmp4 / tmp5
    tmp7 = 1e-05
    tmp8 = tmp6 + tmp7
    tmp9 = tl.math.rsqrt(tmp8)
    tmp10 = tmp3 * tmp9
    tmp12 = tmp10 * tmp11
    tmp14 = tmp12 + tmp13
    tmp15 = tmp14.to(tl.float32)
    # ReLU: the outer condition `0 != 0` is constant-false, so only the inner
    # select (0 where 0 > tmp15, else tmp15) takes effect.
    tmp16 = tl.where(0 != 0, 0, tl.where(0 > tmp15, 0, tmp15))
    tl.store(out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp16, None)


def get_args():
    arg_0 = rand_strided((128, 120, 56, 56), (376320, 3136, 56, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((1, 120, 1, 1), (120, 1, 120, 120), device='cuda:0', dtype=torch.float32)
    arg_2 = rand_strided((1, 120, 1, 1), (120, 1, 120, 120), device='cuda:0', dtype=torch.float32)
    arg_3 = rand_strided((120,), (1,), device='cuda:0', dtype=torch.float32)
    arg_4 = rand_strided((120,), (1,), device='cuda:0', dtype=torch.float32)
    arg_5 = rand_strided((128, 120, 56, 56), (376320, 3136, 56, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__native_batch_norm_legit_functional_relu_41.run(*args, 48168960, grid=grid(48168960), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__native_batch_norm_legit_functional_relu_41.benchmark_all_configs(*args, 48168960, grid=grid(48168960))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/6y/c6ynio5pu6llppeohanybeuevixt5rgpdgxw24ylgggmbgzjacrt.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_27
# Element-wise copy kernel generated for aten._to_copy: reads 1080 values from
# in_ptr0 (declared '*fp32' in the signature) and stores them through out_ptr0
# (declared '*fp16').  The explicit `.to(tl.float32)` is a no-op on fp32 data;
# the narrowing relies on tl.store casting the value to the pointer's element
# type.  get_args() benchmarks it on (120, 1, 3, 3) tensors.  Unlike the other
# _to_copy kernels, divisible_by_16 omits argument 2 here (1080 is not a
# multiple of 16).
# The embedded source is indented here because an un-indented `def` body is not
# valid Python and cannot be loaded by async_compile.triton.
triton_poi_fused__to_copy_42 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[2048], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_42(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    # The element count is hard-coded; it overrides the runtime argument.
    xnumel = 1080
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    # Lanes at or beyond xnumel are masked out of the load and the store.
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)


def get_args():
    arg_0 = rand_strided((120, 1, 3, 3), (9, 9, 3, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((120, 1, 3, 3), (9, 9, 3, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_42.run(*args, 1080, grid=grid(1080), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_42.benchmark_all_configs(*args, 1080, grid=grid(1080))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/p3/cp3ysrijswutvs72wkv4ypuaradvydgclkx6swvsdiolmqtydjck.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.relu, aten.threshold_backward
# aten._native_batch_norm_legit_functional => add_37, add_40, convert_element_type_28, convert_element_type_29, mul_49, mul_55, rsqrt_7, sub_7, var_mean_7
# aten.relu => relu_5
# aten.threshold_backward => le_1
# Pointwise batch-norm normalisation fused with ReLU and a "<= 0" mask.  For
# every flat element index x3 it computes
#     y = (in_ptr0[x3] - in_ptr1[c]) * rsqrt(in_ptr2[c] / 401408 + 1e-05)
#         * in_ptr3[c] + in_ptr4[c]
# with c = (x3 // 3136) % 120, then stores r = `0 if 0 > y else y` to
# out_ptr0[x3] and the boolean `r <= 0.0` to out_ptr1[x3] (declared '*i1').
# 3136 = 56 * 56 and 401408 = 128 * 56 * 56 match the (128, 120, 56, 56)
# benchmark tensors created by get_args().
# NOTE(review): the mask is presumably saved for the ReLU backward pass, and
# in_ptr1..in_ptr4 look like per-channel mean, sum of squared deviations,
# weight and bias -- confirm against the call site.
# The embedded source is indented here because an un-indented `def` body is not
# valid Python and cannot be loaded by async_compile.triton.
triton_poi_fused__native_batch_norm_legit_functional_relu_threshold_backward_43 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[67108864], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp16', 6: '*i1', 7: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7), equal_to_1=())]})
@triton.jit
def triton_poi_fused__native_batch_norm_legit_functional_relu_threshold_backward_43(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, out_ptr0, out_ptr1, xnumel, XBLOCK : tl.constexpr):
    # The element count is hard-coded; it overrides the runtime argument.
    xnumel = 48168960
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    # xmask is computed but unused: every load/store below passes mask=None.
    xmask = xindex < xnumel
    x3 = xindex
    # Index into the 120-entry per-channel vectors.
    x1 = (xindex // 3136) % 120
    tmp0 = tl.load(in_ptr0 + (x3), None).to(tl.float32)
    tmp2 = tl.load(in_ptr1 + (x1), None)
    tmp4 = tl.load(in_ptr2 + (x1), None)
    tmp11 = tl.load(in_ptr3 + (x1), None)
    tmp13 = tl.load(in_ptr4 + (x1), None)
    tmp1 = tmp0.to(tl.float32)
    tmp3 = tmp1 - tmp2
    tmp5 = 401408.0
    tmp6 = tmp4 / tmp5
    tmp7 = 1e-05
    tmp8 = tmp6 + tmp7
    tmp9 = tl.math.rsqrt(tmp8)
    tmp10 = tmp3 * tmp9
    tmp12 = tmp10 * tmp11
    tmp14 = tmp12 + tmp13
    tmp15 = tmp14.to(tl.float32)
    # ReLU: the outer condition `0 != 0` is constant-false, so only the inner
    # select (0 where 0 > tmp15, else tmp15) takes effect.
    tmp16 = tl.where(0 != 0, 0, tl.where(0 > tmp15, 0, tmp15))
    tmp17 = 0.0
    tmp18 = tmp16 <= tmp17
    tl.store(out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp16, None)
    tl.store(out_ptr1 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp18, None)


def get_args():
    arg_0 = rand_strided((128, 120, 56, 56), (376320, 3136, 56, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((1, 120, 1, 1), (120, 1, 120, 120), device='cuda:0', dtype=torch.float32)
    arg_2 = rand_strided((1, 120, 1, 1), (120, 1, 120, 120), device='cuda:0', dtype=torch.float32)
    arg_3 = rand_strided((120,), (1,), device='cuda:0', dtype=torch.float32)
    arg_4 = rand_strided((120,), (1,), device='cuda:0', dtype=torch.float32)
    arg_5 = rand_strided((128, 120, 56, 56), (376320, 3136, 56, 1), device='cuda:0', dtype=torch.float16)
    arg_6 = rand_strided((128, 120, 56, 56), (376320, 3136, 56, 1), device='cuda:0', dtype=torch.bool)
    return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__native_batch_norm_legit_functional_relu_threshold_backward_43.run(*args, 48168960, grid=grid(48168960), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__native_batch_norm_legit_functional_relu_threshold_backward_43.benchmark_all_configs(*args, 48168960, grid=grid(48168960))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/pe/cpea2gtzlc2kbhqh3fj4j3lqcnuxq36vpr46nhhxpvtecppidlkn.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_30
# Element-wise copy kernel generated for aten._to_copy: reads 1200 values from
# in_ptr0 (declared '*fp32' in the signature) and stores them through out_ptr0
# (declared '*fp16').  The explicit `.to(tl.float32)` is a no-op on fp32 data;
# the narrowing relies on tl.store casting the value to the pointer's element
# type.  get_args() benchmarks it on (20, 60, 1, 1) tensors.
# The embedded source is indented here because an un-indented `def` body is not
# valid Python and cannot be loaded by async_compile.triton.
triton_poi_fused__to_copy_44 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[2048], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_44(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    # The element count is hard-coded; it overrides the runtime argument.
    xnumel = 1200
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    # Lanes at or beyond xnumel are masked out of the load and the store.
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)


def get_args():
    arg_0 = rand_strided((20, 60, 1, 1), (60, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((20, 60, 1, 1), (60, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_44.run(*args, 1200, grid=grid(1200), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_44.benchmark_all_configs(*args, 1200, grid=grid(1200))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/nm/cnmhyqb22hbicwkw553bytawf6yat3hnc5u573hgyyuw5vi2ohc2.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.add
# aten._native_batch_norm_legit_functional => add_42, add_45, convert_element_type_32, convert_element_type_33, mul_56, mul_62, rsqrt_8, sub_8, var_mean_8
# aten.add => add_46
#
# Pointwise kernel over 16056320 elements. For flat index x with
# c = (x // 3136) % 40 it computes
#   (in_ptr0[x] - in_ptr1[c]) * rsqrt(in_ptr2[c] / 401408 + 1e-05) * in_ptr3[c] + in_ptr4[c] + in_ptr5[x]
# and stores the result to out_ptr0[x]. The benchmark harness uses
# (128, 40, 56, 56) fp16 activations and 40-element fp32 per-channel buffers.
triton_poi_fused__native_batch_norm_legit_functional_add_45 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[16777216], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp16', 6: '*fp16', 7: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7), equal_to_1=())]})
@triton.jit
def triton_poi_fused__native_batch_norm_legit_functional_add_45(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 16056320
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    # xmask is computed but unused: every load/store below passes mask=None.
    xmask = xindex < xnumel
    x3 = xindex
    # Index into the 40-entry per-channel buffers (3136 elements per channel).
    x1 = (xindex // 3136) % 40
    tmp0 = tl.load(in_ptr0 + (x3), None).to(tl.float32)
    tmp2 = tl.load(in_ptr1 + (x1), None)
    tmp4 = tl.load(in_ptr2 + (x1), None)
    tmp11 = tl.load(in_ptr3 + (x1), None)
    tmp13 = tl.load(in_ptr4 + (x1), None)
    tmp16 = tl.load(in_ptr5 + (x3), None).to(tl.float32)
    tmp1 = tmp0.to(tl.float32)
    # NOTE(review): in_ptr1 / in_ptr2 look like the per-channel mean and the
    # per-channel sum of squared deviations (divided by 401408 below) - confirm.
    tmp3 = tmp1 - tmp2
    tmp5 = 401408.0
    tmp6 = tmp4 / tmp5
    tmp7 = 1e-05
    tmp8 = tmp6 + tmp7
    tmp9 = tl.math.rsqrt(tmp8)
    tmp10 = tmp3 * tmp9
    tmp12 = tmp10 * tmp11
    tmp14 = tmp12 + tmp13
    tmp15 = tmp14.to(tl.float32)
    # Add the second full-size input element-wise.
    tmp17 = tmp15 + tmp16
    tl.store(out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp17, None)


def get_args():
    arg_0 = rand_strided((128, 40, 56, 56), (125440, 3136, 56, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((1, 40, 1, 1), (40, 1, 40, 40), device='cuda:0', dtype=torch.float32)
    arg_2 = rand_strided((1, 40, 1, 1), (40, 1, 40, 40), device='cuda:0', dtype=torch.float32)
    arg_3 = rand_strided((40,), (1,), device='cuda:0', dtype=torch.float32)
    arg_4 = rand_strided((40,), (1,), device='cuda:0', dtype=torch.float32)
    arg_5 = rand_strided((128, 40, 56, 56), (125440, 3136, 56, 1), device='cuda:0', dtype=torch.float16)
    arg_6 = rand_strided((128, 40, 56, 56), (125440, 3136, 56, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__native_batch_norm_legit_functional_add_45.run(*args, 16056320, grid=grid(16056320), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__native_batch_norm_legit_functional_add_45.benchmark_all_configs(*args, 16056320, grid=grid(16056320))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/d6/cd6uk6iyi5u7h7amewzycxkyxriny4cknptmukvtlpxi4jwlerzf.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_34
#
# Pointwise copy of 9600 contiguous elements from an fp32 buffer (in_ptr0)
# into an fp16 buffer (out_ptr0). The embedded benchmark harness exercises
# the kernel on (240, 40, 1, 1) tensors.
triton_poi_fused__to_copy_46 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[16384], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_46(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    # The element count is baked in; the runtime argument is overwritten.
    xnumel = 9600
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    # Lanes at or beyond xnumel are masked off on both the load and the store.
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask)
    # NOTE(review): the value is still float32 here; the narrowing to fp16
    # presumably happens in tl.store against the *fp16 pointer - confirm.
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)


def get_args():
    arg_0 = rand_strided((240, 40, 1, 1), (40, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((240, 40, 1, 1), (40, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_46.run(*args, 9600, grid=grid(9600), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_46.benchmark_all_configs(*args, 9600, grid=grid(9600))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/p3/cp3dwfqdvlpfdvfvxrfm2lik34mpytuijuqsswm7efzikdczro7d.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => add_48, add_49, add_50, convert_element_type_35, mul_64, mul_65, mul_66, mul_67, mul_68, rsqrt_9, squeeze_28, var_mean_9
#
# Two-pass reduction kernel: 240 output rows (x), 401408 reduced elements per
# row (r). For each x it writes
#   in_out_ptr0[x] = mean of in_ptr0 over r
#   out_ptr0[x]    = mean * 0.1 + in_ptr1[x] * 0.9
#   out_ptr1[x]    = sum over r of (value - mean)^2
#   out_ptr2[x]    = rsqrt(out_ptr1[x] / 401408 + 1e-05)
#   out_ptr3[x]    = (out_ptr1[x] / 401408) * 1.0000024912370735 * 0.1 + in_ptr2[x] * 0.9
# The benchmark harness reads a (128, 240, 56, 56) fp16 tensor as in_ptr0.
triton_red_fused__native_batch_norm_legit_functional_47 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@reduction(
    size_hints=[256, 524288],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*fp32', 1: '*fp16', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: 'i32', 9: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_47(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, out_ptr0, out_ptr1, out_ptr2, out_ptr3, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
    # Both extents are baked in; the runtime arguments are overwritten.
    xnumel = 240
    rnumel = 401408
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rbase = tl.arange(0, RBLOCK)[None, :]
    x0 = xindex
    # Pass 1: accumulate the per-row sum in an [XBLOCK, RBLOCK] tile.
    _tmp2 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
    for roffset in range(0, rnumel, RBLOCK):
        rindex = roffset + rbase
        rmask = rindex < rnumel
        # Split the flat reduction index: r1 steps by 1 within a run of 3136
        # elements, r2 steps by 752640 between runs.
        r1 = rindex % 3136
        r2 = (rindex // 3136)
        tmp0 = tl.load(in_ptr0 + (r1 + (3136*x0) + (752640*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp1 = tmp0.to(tl.float32)
        _tmp2 = tl.where(rmask & xmask, _tmp2 + tmp1, _tmp2)
    tmp2 = tl.sum(_tmp2, 1)[:, None]
    tmp7 = tl.load(in_ptr1 + (x0), xmask)
    tmp3 = 401408.0
    # tmp4 is the per-row mean.
    tmp4 = tmp2 / tmp3
    tmp5 = 0.1
    tmp6 = tmp4 * tmp5
    tmp8 = 0.9
    tmp9 = tmp7 * tmp8
    # NOTE(review): looks like a running-mean update with momentum 0.1 - confirm.
    tmp10 = tmp6 + tmp9
    tl.store(in_out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp4, xmask)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp10, xmask)
    # Pass 2: re-read the input and accumulate squared deviations from tmp4.
    _tmp15 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
    for roffset in range(0, rnumel, RBLOCK):
        rindex = roffset + rbase
        rmask = rindex < rnumel
        r1 = rindex % 3136
        r2 = (rindex // 3136)
        tmp11 = tl.load(in_ptr0 + (r1 + (3136*x0) + (752640*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp12 = tmp11.to(tl.float32)
        tmp13 = tmp12 - tmp4
        tmp14 = tmp13 * tmp13
        _tmp15 = tl.where(rmask & xmask, _tmp15 + tmp14, _tmp15)
    tmp15 = tl.sum(_tmp15, 1)[:, None]
    # The raw (undivided) sum of squared deviations is what gets stored here.
    tl.store(out_ptr1 + x0, tmp15, xmask)
    tmp25 = tl.load(in_ptr2 + (x0), xmask)
    tmp16 = 401408.0
    tmp17 = tmp15 / tmp16
    tmp18 = 1e-05
    tmp19 = tmp17 + tmp18
    tmp20 = tl.math.rsqrt(tmp19)
    # NOTE(review): this constant is approximately 401408 / 401407, which
    # looks like an unbiased-variance correction - confirm.
    tmp21 = 1.0000024912370735
    tmp22 = tmp17 * tmp21
    tmp23 = 0.1
    tmp24 = tmp22 * tmp23
    tmp26 = 0.9
    tmp27 = tmp25 * tmp26
    tmp28 = tmp24 + tmp27
    tl.store(out_ptr2 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp20, xmask)
    tl.store(out_ptr3 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp28, xmask)


def get_args():
    arg_0 = rand_strided((1, 240, 1, 1), (240, 1, 240, 240), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((128, 240, 56, 56), (752640, 3136, 56, 1), device='cuda:0', dtype=torch.float16)
    arg_2 = rand_strided((240,), (1,), device='cuda:0', dtype=torch.float32)
    arg_3 = rand_strided((240,), (1,), device='cuda:0', dtype=torch.float32)
    arg_4 = rand_strided((240,), (1,), device='cuda:0', dtype=torch.float32)
    arg_5 = rand_strided((1, 240, 1, 1), (240, 1, 240, 240), device='cuda:0', dtype=torch.float32)
    arg_6 = rand_strided((240,), (1,), device='cuda:0', dtype=torch.float32)
    arg_7 = rand_strided((240,), (1,), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_red_fused__native_batch_norm_legit_functional_47.run(*args, 240, 401408, grid=grid(240), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_red_fused__native_batch_norm_legit_functional_47.benchmark_all_configs(*args, 240, 401408, grid=grid(240))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/bn/cbnwrcxmnryz6qd2vzr5qnxb2q3vepfvhupqz2awsu4zkworteie.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.add, aten.clone, aten.fill, aten.mul, aten.sigmoid, aten.sub
# aten._native_batch_norm_legit_functional => add_48, add_51, convert_element_type_35, convert_element_type_36, mul_63, mul_69, rsqrt_9, sub_9, var_mean_9
# aten.add => add_379
# aten.clone => clone
# aten.fill => full_like_47
# aten.mul => mul_1107, mul_1108
# aten.sigmoid => sigmoid_111
# aten.sub => sub_313
#
# Pointwise kernel over 96337920 elements. For flat index x with
# c = (x // 3136) % 240 it computes
#   y = (in_ptr0[x] - in_ptr1[c]) * rsqrt(in_ptr2[c] / 401408 + 1e-05) * in_ptr3[c] + in_ptr4[c]
# and writes two outputs:
#   out_ptr0[x] = y
#   out_ptr1[x] = sigmoid(y) * (y * (1 - sigmoid(y)) + 1)
# The second expression equals d/dy [y * sigmoid(y)].
triton_poi_fused__native_batch_norm_legit_functional_add_clone_fill_mul_sigmoid_sub_48 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[134217728], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp16', 6: '*fp16', 7: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7), equal_to_1=())]})
@triton.jit
def triton_poi_fused__native_batch_norm_legit_functional_add_clone_fill_mul_sigmoid_sub_48(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, out_ptr0, out_ptr1, xnumel, XBLOCK : tl.constexpr):
    xnumel = 96337920
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    # xmask is computed but unused: every load/store below passes mask=None.
    xmask = xindex < xnumel
    x3 = xindex
    # Index into the 240-entry per-channel buffers (3136 elements per channel).
    x1 = (xindex // 3136) % 240
    tmp0 = tl.load(in_ptr0 + (x3), None).to(tl.float32)
    tmp2 = tl.load(in_ptr1 + (x1), None)
    tmp4 = tl.load(in_ptr2 + (x1), None)
    tmp11 = tl.load(in_ptr3 + (x1), None)
    tmp13 = tl.load(in_ptr4 + (x1), None)
    tmp1 = tmp0.to(tl.float32)
    # NOTE(review): in_ptr1 / in_ptr2 look like the per-channel mean and the
    # per-channel sum of squared deviations (divided by 401408 below) - confirm.
    tmp3 = tmp1 - tmp2
    tmp5 = 401408.0
    tmp6 = tmp4 / tmp5
    tmp7 = 1e-05
    tmp8 = tmp6 + tmp7
    tmp9 = tl.math.rsqrt(tmp8)
    tmp10 = tmp3 * tmp9
    tmp12 = tmp10 * tmp11
    tmp14 = tmp12 + tmp13
    tmp15 = tmp14.to(tl.float32)
    # sigmoid(y) * (y * (1 - sigmoid(y)) + 1)
    tmp16 = tl.sigmoid(tmp15)
    tmp17 = 1.0
    tmp18 = tmp17 - tmp16
    tmp19 = tmp15 * tmp18
    tmp20 = tmp19 + tmp17
    tmp21 = tmp16 * tmp20
    tl.store(out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp15, None)
    tl.store(out_ptr1 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp21, None)


def get_args():
    arg_0 = rand_strided((128, 240, 56, 56), (752640, 3136, 56, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((1, 240, 1, 1), (240, 1, 240, 240), device='cuda:0', dtype=torch.float32)
    arg_2 = rand_strided((1, 240, 1, 1), (240, 1, 240, 240), device='cuda:0', dtype=torch.float32)
    arg_3 = rand_strided((240,), (1,), device='cuda:0', dtype=torch.float32)
    arg_4 = rand_strided((240,), (1,), device='cuda:0', dtype=torch.float32)
    arg_5 = rand_strided((128, 240, 56, 56), (752640, 3136, 56, 1), device='cuda:0', dtype=torch.float16)
    arg_6 = rand_strided((128, 240, 56, 56), (752640, 3136, 56, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__native_batch_norm_legit_functional_add_clone_fill_mul_sigmoid_sub_48.run(*args, 96337920, grid=grid(96337920), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__native_batch_norm_legit_functional_add_clone_fill_mul_sigmoid_sub_48.benchmark_all_configs(*args, 96337920, grid=grid(96337920))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/tr/ctrlm7wxwgyqu6igydzh2qy3dgfyvkbxyf6ed5dmhs5ac43nkywg.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_39
#
# Pointwise copy of 540 contiguous elements from an fp32 buffer (in_ptr0)
# into an fp16 buffer (out_ptr0). The embedded benchmark harness exercises
# the kernel on (60, 1, 3, 3) tensors.
triton_poi_fused__to_copy_49 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[1024], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_49(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    # The element count is baked in; the runtime argument is overwritten.
    xnumel = 540
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    # Lanes at or beyond xnumel are masked off on both the load and the store.
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask)
    # NOTE(review): the value is still float32 here; the narrowing to fp16
    # presumably happens in tl.store against the *fp16 pointer - confirm.
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)


def get_args():
    arg_0 = rand_strided((60, 1, 3, 3), (9, 9, 3, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((60, 1, 3, 3), (9, 9, 3, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_49.run(*args, 540, grid=grid(540), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_49.benchmark_all_configs(*args, 540, grid=grid(540))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/62/c62vp5bgz7fqdu2x3oxuvrnwwvzztezlji4zzzxqegy77wuqckqo.py
# Original ATen: aten.split_with_sizes
# aten.split_with_sizes => getitem_52
#
# Pointwise kernel over 24084480 output elements. Each output element
# x2 = x0 + 188160 * x1 is read from in_ptr0 at offset x0 + 752640 * x1,
# i.e. the first 188160 elements of every 752640-element row, and stored as
# v * sigmoid(v) into the densely packed out_ptr0. The benchmark harness
# maps this to a (128, 240, 56, 56) input and a (128, 60, 56, 56) output.
triton_poi_fused_split_with_sizes_50 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[33554432], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_split_with_sizes_50(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 24084480
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    # xmask is computed but unused: the load and store below pass mask=None.
    xmask = xindex < xnumel
    # x0: position inside one 188160-element output row; x1: row number.
    x0 = xindex % 188160
    x1 = (xindex // 188160)
    x2 = xindex
    # Input rows are 752640 elements apart; this slice starts at offset 0.
    tmp0 = tl.load(in_ptr0 + (x0 + (752640*x1)), None).to(tl.float32)
    tmp1 = tmp0.to(tl.float32)
    tmp2 = tl.sigmoid(tmp1)
    tmp3 = tmp1 * tmp2
    tmp4 = tmp3.to(tl.float32)
    tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp4, None)


def get_args():
    arg_0 = rand_strided((128, 240, 56, 56), (752640, 3136, 56, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 60, 56, 56), (188160, 3136, 56, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused_split_with_sizes_50.run(*args, 24084480, grid=grid(24084480), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused_split_with_sizes_50.benchmark_all_configs(*args, 24084480, grid=grid(24084480))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/5k/c5k2puyq6twbirobmys3yfadgomf23ufkkjsshzn37upfh5g7erl.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_40
#
# Pointwise copy of 1500 contiguous elements from an fp32 buffer (in_ptr0)
# into an fp16 buffer (out_ptr0). The embedded benchmark harness exercises
# the kernel on (60, 1, 5, 5) tensors.
triton_poi_fused__to_copy_51 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[2048], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_51(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    # The element count is baked in; the runtime argument is overwritten.
    xnumel = 1500
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    # Lanes at or beyond xnumel are masked off on both the load and the store.
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask)
    # NOTE(review): the value is still float32 here; the narrowing to fp16
    # presumably happens in tl.store against the *fp16 pointer - confirm.
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)


def get_args():
    arg_0 = rand_strided((60, 1, 5, 5), (25, 25, 5, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((60, 1, 5, 5), (25, 25, 5, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_51.run(*args, 1500, grid=grid(1500), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_51.benchmark_all_configs(*args, 1500, grid=grid(1500))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/qq/cqqmootvvozbchj4wyxl2cmg4iebq6hs2v2baytcez4tdix7pywu.py
# Original ATen: aten.split_with_sizes
# aten.split_with_sizes => getitem_57
#
# Pointwise kernel over 24084480 output elements. Each output element
# x2 = x0 + 188160 * x1 is read from in_ptr0 at offset
# 188160 + x0 + 752640 * x1, i.e. the second 188160-element slice of every
# 752640-element row, and stored as v * sigmoid(v) into the densely packed
# out_ptr0. The benchmark harness maps this to a (128, 240, 56, 56) input
# and a (128, 60, 56, 56) output.
triton_poi_fused_split_with_sizes_52 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[33554432], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_split_with_sizes_52(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 24084480
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    # xmask is computed but unused: the load and store below pass mask=None.
    xmask = xindex < xnumel
    # x0: position inside one 188160-element output row; x1: row number.
    x0 = xindex % 188160
    x1 = (xindex // 188160)
    x2 = xindex
    # Input rows are 752640 elements apart; this slice starts at offset 188160.
    tmp0 = tl.load(in_ptr0 + (188160 + x0 + (752640*x1)), None).to(tl.float32)
    tmp1 = tmp0.to(tl.float32)
    tmp2 = tl.sigmoid(tmp1)
    tmp3 = tmp1 * tmp2
    tmp4 = tmp3.to(tl.float32)
    tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp4, None)


def get_args():
    arg_0 = rand_strided((128, 240, 56, 56), (752640, 3136, 56, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 60, 56, 56), (188160, 3136, 56, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused_split_with_sizes_52.run(*args, 24084480, grid=grid(24084480), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused_split_with_sizes_52.benchmark_all_configs(*args, 24084480, grid=grid(24084480))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/hn/chn7fi5d6pb7lt74jnnjikqswlxeu3nbv3mpu3aidnxkwdaw4fmq.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_41
#
# Pointwise copy of 2940 contiguous elements from an fp32 buffer (in_ptr0)
# into an fp16 buffer (out_ptr0). The embedded benchmark harness exercises
# the kernel on (60, 1, 7, 7) tensors.
triton_poi_fused__to_copy_53 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[4096], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_53(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    # The element count is baked in; the runtime argument is overwritten.
    xnumel = 2940
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    # Lanes at or beyond xnumel are masked off on both the load and the store.
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask)
    # NOTE(review): the value is still float32 here; the narrowing to fp16
    # presumably happens in tl.store against the *fp16 pointer - confirm.
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)


def get_args():
    arg_0 = rand_strided((60, 1, 7, 7), (49, 49, 7, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((60, 1, 7, 7), (49, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_53.run(*args, 2940, grid=grid(2940), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_53.benchmark_all_configs(*args, 2940, grid=grid(2940))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/7d/c7d6lruapdg7jujcavuay77hawcbzn2rk3uqz6opu54tj6rxd3dc.py
# Original ATen: aten.split_with_sizes
# aten.split_with_sizes => getitem_62
#
# Pointwise kernel over 24084480 output elements. Each output element
# x2 = x0 + 188160 * x1 is read from in_ptr0 at offset
# 376320 + x0 + 752640 * x1, i.e. the third 188160-element slice of every
# 752640-element row, and stored as v * sigmoid(v) into the densely packed
# out_ptr0. The benchmark harness maps this to a (128, 240, 56, 56) input
# and a (128, 60, 56, 56) output.
triton_poi_fused_split_with_sizes_54 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[33554432], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_split_with_sizes_54(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 24084480
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    # xmask is computed but unused: the load and store below pass mask=None.
    xmask = xindex < xnumel
    # x0: position inside one 188160-element output row; x1: row number.
    x0 = xindex % 188160
    x1 = (xindex // 188160)
    x2 = xindex
    # Input rows are 752640 elements apart; this slice starts at offset 376320.
    tmp0 = tl.load(in_ptr0 + (376320 + x0 + (752640*x1)), None).to(tl.float32)
    tmp1 = tmp0.to(tl.float32)
    tmp2 = tl.sigmoid(tmp1)
    tmp3 = tmp1 * tmp2
    tmp4 = tmp3.to(tl.float32)
    tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp4, None)


def get_args():
    arg_0 = rand_strided((128, 240, 56, 56), (752640, 3136, 56, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 60, 56, 56), (188160, 3136, 56, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused_split_with_sizes_54.run(*args, 24084480, grid=grid(24084480), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused_split_with_sizes_54.benchmark_all_configs(*args, 24084480, grid=grid(24084480))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/ih/cihdtphelycdnabls3bvxcjqdwyigw5444zlobgvmbwdwzxq7zdy.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_42
#
# Pointwise copy of 4860 contiguous elements from an fp32 buffer (in_ptr0)
# into an fp16 buffer (out_ptr0). The embedded benchmark harness exercises
# the kernel on (60, 1, 9, 9) tensors.
triton_poi_fused__to_copy_55 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[8192], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_55(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    # The element count is baked in; the runtime argument is overwritten.
    xnumel = 4860
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    # Lanes at or beyond xnumel are masked off on both the load and the store.
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask)
    # NOTE(review): the value is still float32 here; the narrowing to fp16
    # presumably happens in tl.store against the *fp16 pointer - confirm.
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)


def get_args():
    arg_0 = rand_strided((60, 1, 9, 9), (81, 81, 9, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((60, 1, 9, 9), (81, 81, 9, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_55.run(*args, 4860, grid=grid(4860), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_55.benchmark_all_configs(*args, 4860, grid=grid(4860))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/oq/coqx7tfdhgzsd3kzw3ueczpktm7gxo5z67ciilgh24ffay7herao.py
# Original ATen: aten.split_with_sizes
# aten.split_with_sizes => getitem_67
#
# Pointwise kernel over 24084480 output elements. Each output element
# x2 = x0 + 188160 * x1 is read from in_ptr0 at offset
# 564480 + x0 + 752640 * x1, i.e. the fourth 188160-element slice of every
# 752640-element row, and stored as v * sigmoid(v) into the densely packed
# out_ptr0. The benchmark harness maps this to a (128, 240, 56, 56) input
# and a (128, 60, 56, 56) output.
triton_poi_fused_split_with_sizes_56 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[33554432], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_split_with_sizes_56(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 24084480
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    # xmask is computed but unused: the load and store below pass mask=None.
    xmask = xindex < xnumel
    # x0: position inside one 188160-element output row; x1: row number.
    x0 = xindex % 188160
    x1 = (xindex // 188160)
    x2 = xindex
    # Input rows are 752640 elements apart; this slice starts at offset 564480.
    tmp0 = tl.load(in_ptr0 + (564480 + x0 + (752640*x1)), None).to(tl.float32)
    tmp1 = tmp0.to(tl.float32)
    tmp2 = tl.sigmoid(tmp1)
    tmp3 = tmp1 * tmp2
    tmp4 = tmp3.to(tl.float32)
    tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp4, None)


def get_args():
    arg_0 = rand_strided((128, 240, 56, 56), (752640, 3136, 56, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 60, 56, 56), (188160, 3136, 56, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused_split_with_sizes_56.run(*args, 24084480, grid=grid(24084480), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused_split_with_sizes_56.benchmark_all_configs(*args, 24084480, grid=grid(24084480))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/n6/cn6lwm2ninzuud2fuu3omrklbeewaxqtkv3v6gdaxmikjvyoi525.py
# Original ATen: aten.cat
# aten.cat => cat_5
#
# Pointwise strided copy of 6021120 elements: input element i is read from
# in_ptr0[i] and written to out_ptr0[(i % 47040) + 188160 * (i // 47040)], so
# every contiguous run of 47040 input elements lands at the start of a
# 188160-element output row.
# With the benchmark shapes from get_args() the source is a contiguous
# (128, 60, 28, 28) fp16 tensor and the destination is a (128, 60, 28, 28)
# view whose batch stride is 188160.
# NOTE(review): the destination is presumably one 60-channel slice of the
# 240-channel buffer that aten.cat assembles; confirm against the call site.
# xmask is computed but both the load and the store are issued with mask=None.
triton_poi_fused_cat_57 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor

from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[8388608], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_cat_57(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 6021120
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x2 = xindex
    x0 = xindex % 47040
    x1 = (xindex // 47040)
    tmp0 = tl.load(in_ptr0 + (x2), None).to(tl.float32)
    tl.store(out_ptr0 + (x0 + (188160*x1) + tl.zeros([XBLOCK], tl.int32)), tmp0, None)


def get_args():
    arg_0 = rand_strided((128, 60, 28, 28), (47040, 784, 28, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 60, 28, 28), (188160, 784, 28, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused_cat_57.run(*args, 6021120, grid=grid(6021120), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused_cat_57.benchmark_all_configs(*args, 6021120, grid=grid(6021120))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/tm/ctmfxwnpwpks3pe3wok54jftmfykafzupq5buoissf32gfcrrq6d.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => add_53, add_54, add_55, convert_element_type_43, mul_72, mul_73, mul_74, mul_75, mul_76, rsqrt_10, squeeze_31, var_mean_10
#
# Two-pass looped reduction: xnumel = 240 rows (index x0), each reducing
# rnumel = 100352 elements of in_ptr0 addressed as
# r1 + 784 * x0 + 188160 * r2 with r1 = r % 784 and r2 = r // 784.
#   pass 1: mean = sum(x) / 100352                     -> in_out_ptr0[x0]
#           0.1 * mean + 0.9 * in_ptr1[x0]             -> out_ptr0[x0]
#   pass 2: ssd = sum((x - mean)^2)                    -> out_ptr1[x0]
#           rsqrt(ssd / 100352 + 1e-05)                -> out_ptr2[x0]
#           0.1 * (ssd / 100352) * 1.00000996502277
#               + 0.9 * in_ptr2[x0]                    -> out_ptr3[x0]
# in_out_ptr0 is only written inside the kernel, never read.
# NOTE(review): the 0.1 / 0.9 blend looks like the running-statistics momentum
# update and 1.00000996502277 ~= 100352 / 100351 like the unbiased-variance
# correction; confirm against the originating BatchNorm module.
triton_red_fused__native_batch_norm_legit_functional_58 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor

from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@reduction(
    size_hints=[256, 131072],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*fp32', 1: '*fp16', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: 'i32', 9: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_58(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, out_ptr0, out_ptr1, out_ptr2, out_ptr3, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
    xnumel = 240
    rnumel = 100352
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rbase = tl.arange(0, RBLOCK)[None, :]
    x0 = xindex
    _tmp2 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
    for roffset in range(0, rnumel, RBLOCK):
        rindex = roffset + rbase
        rmask = rindex < rnumel
        r1 = rindex % 784
        r2 = (rindex // 784)
        tmp0 = tl.load(in_ptr0 + (r1 + (784*x0) + (188160*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp1 = tmp0.to(tl.float32)
        _tmp2 = tl.where(rmask & xmask, _tmp2 + tmp1, _tmp2)
    tmp2 = tl.sum(_tmp2, 1)[:, None]
    tmp7 = tl.load(in_ptr1 + (x0), xmask)
    tmp3 = 100352.0
    tmp4 = tmp2 / tmp3
    tmp5 = 0.1
    tmp6 = tmp4 * tmp5
    tmp8 = 0.9
    tmp9 = tmp7 * tmp8
    tmp10 = tmp6 + tmp9
    tl.store(in_out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp4, xmask)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp10, xmask)
    _tmp15 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
    for roffset in range(0, rnumel, RBLOCK):
        rindex = roffset + rbase
        rmask = rindex < rnumel
        r1 = rindex % 784
        r2 = (rindex // 784)
        tmp11 = tl.load(in_ptr0 + (r1 + (784*x0) + (188160*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp12 = tmp11.to(tl.float32)
        tmp13 = tmp12 - tmp4
        tmp14 = tmp13 * tmp13
        _tmp15 = tl.where(rmask & xmask, _tmp15 + tmp14, _tmp15)
    tmp15 = tl.sum(_tmp15, 1)[:, None]
    tl.store(out_ptr1 + x0, tmp15, xmask)
    tmp25 = tl.load(in_ptr2 + (x0), xmask)
    tmp16 = 100352.0
    tmp17 = tmp15 / tmp16
    tmp18 = 1e-05
    tmp19 = tmp17 + tmp18
    tmp20 = tl.math.rsqrt(tmp19)
    tmp21 = 1.00000996502277
    tmp22 = tmp17 * tmp21
    tmp23 = 0.1
    tmp24 = tmp22 * tmp23
    tmp26 = 0.9
    tmp27 = tmp25 * tmp26
    tmp28 = tmp24 + tmp27
    tl.store(out_ptr2 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp20, xmask)
    tl.store(out_ptr3 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp28, xmask)


def get_args():
    arg_0 = rand_strided((1, 240, 1, 1), (240, 1, 240, 240), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((128, 240, 28, 28), (188160, 784, 28, 1), device='cuda:0', dtype=torch.float16)
    arg_2 = rand_strided((240,), (1,), device='cuda:0', dtype=torch.float32)
    arg_3 = rand_strided((240,), (1,), device='cuda:0', dtype=torch.float32)
    arg_4 = rand_strided((240,), (1,), device='cuda:0', dtype=torch.float32)
    arg_5 = rand_strided((1, 240, 1, 1), (240, 1, 240, 240), device='cuda:0', dtype=torch.float32)
    arg_6 = rand_strided((240,), (1,), device='cuda:0', dtype=torch.float32)
    arg_7 = rand_strided((240,), (1,), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_red_fused__native_batch_norm_legit_functional_58.run(*args, 240, 100352, grid=grid(240), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_red_fused__native_batch_norm_legit_functional_58.benchmark_all_configs(*args, 240, 100352, grid=grid(240))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/tf/ctf52h6qjyfnbwd7tzysnt6o3dvwaedcis63siipfnbkcfkukqa7.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.mean, aten.silu
# aten._native_batch_norm_legit_functional => add_53, add_56, convert_element_type_43, convert_element_type_44, mul_71, mul_77, rsqrt_10, sub_10, var_mean_10
# aten.mean => mean
# aten.silu => convert_element_type_45, convert_element_type_46, mul_78, sigmoid_1
#
# Persistent reduction: xnumel = 30720 rows of rnumel = 784 elements, each row
# handled in a single RBLOCK = 1024 tile (lanes with rindex >= 784 are masked).
# For row x3 with x0 = x3 % 240:
#   y = (in_ptr0[r + 784 * x3] - in_ptr1[x0])
#       * rsqrt(in_ptr2[x0] / 100352 + 1e-05) * in_ptr3[x0] + in_ptr4[x0]
#   out_ptr0[r + 784 * x3] = y                      (value before the SiLU)
#   out_ptr2[x3]           = sum_r(y * sigmoid(y)) / 784
# The per-row parameter loads and the out_ptr2 store are issued with mask=None.
# NOTE(review): in_ptr1 / in_ptr2 presumably carry the per-channel mean and sum
# of squared deviations from the preceding statistics kernel, and in_ptr3 /
# in_ptr4 the affine weight and bias; confirm against the call site.
triton_per_fused__native_batch_norm_legit_functional_mean_silu_59 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import persistent_reduction
from torch._inductor.utils import instance_descriptor

from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@persistent_reduction(
    size_hints=[32768, 1024],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*fp16', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp16', 6: '*fp16', 7: 'i32', 8: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8), equal_to_1=())]}
)
@triton.jit
def triton_per_fused__native_batch_norm_legit_functional_mean_silu_59(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, out_ptr0, out_ptr2, xnumel, rnumel, XBLOCK : tl.constexpr):
    xnumel = 30720
    rnumel = 784
    RBLOCK: tl.constexpr = 1024
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rindex = tl.arange(0, RBLOCK)[None, :]
    rmask = rindex < rnumel
    r2 = rindex
    x3 = xindex
    x0 = xindex % 240
    tmp0 = tl.load(in_ptr0 + (r2 + (784*x3)), rmask, other=0).to(tl.float32)
    tmp2 = tl.load(in_ptr1 + (x0), None)
    tmp4 = tl.load(in_ptr2 + (x0), None)
    tmp11 = tl.load(in_ptr3 + (x0), None)
    tmp13 = tl.load(in_ptr4 + (x0), None)
    tmp1 = tmp0.to(tl.float32)
    tmp3 = tmp1 - tmp2
    tmp5 = 100352.0
    tmp6 = tmp4 / tmp5
    tmp7 = 1e-05
    tmp8 = tmp6 + tmp7
    tmp9 = tl.math.rsqrt(tmp8)
    tmp10 = tmp3 * tmp9
    tmp12 = tmp10 * tmp11
    tmp14 = tmp12 + tmp13
    tmp15 = tmp14.to(tl.float32)
    tmp16 = tmp15.to(tl.float32)
    tmp17 = tl.sigmoid(tmp16)
    tmp18 = tmp16 * tmp17
    tmp19 = tmp18.to(tl.float32)
    tmp20 = tmp19.to(tl.float32)
    tmp22 = tl.where(rmask, tmp20, 0)
    tmp23 = tl.sum(tmp22, 1)[:, None]
    tmp24 = 784.0
    tmp25 = tmp23 / tmp24
    tmp26 = tmp25.to(tl.float32)
    tl.store(out_ptr0 + (r2 + (784*x3) + tl.zeros([XBLOCK, RBLOCK], tl.int32)), tmp15, rmask)
    tl.store(out_ptr2 + (x3 + tl.zeros([XBLOCK, 1], tl.int32)), tmp26, None)


def get_args():
    arg_0 = rand_strided((128, 240, 28, 28), (188160, 784, 28, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((1, 240, 1, 1), (240, 1, 240, 240), device='cuda:0', dtype=torch.float32)
    arg_2 = rand_strided((1, 240, 1, 1), (240, 1, 240, 240), device='cuda:0', dtype=torch.float32)
    arg_3 = rand_strided((240,), (1,), device='cuda:0', dtype=torch.float32)
    arg_4 = rand_strided((240,), (1,), device='cuda:0', dtype=torch.float32)
    arg_5 = rand_strided((128, 240, 28, 28), (188160, 784, 28, 1), device='cuda:0', dtype=torch.float16)
    arg_6 = rand_strided((128, 240, 1, 1), (240, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_per_fused__native_batch_norm_legit_functional_mean_silu_59.run(*args, 30720, 784, grid=grid(30720), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_per_fused__native_batch_norm_legit_functional_mean_silu_59.benchmark_all_configs(*args, 30720, 784, grid=grid(30720))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/l4/cl43m5yr7mr6rofdxkde5hoinowoes7zmspib6uw4wfhtoztyd2j.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_48
#
# Masked element-wise copy of 4800 values from in_ptr0 to out_ptr0.
# The meta signature declares in_ptr0 as *fp32 and out_ptr0 as *fp16; the
# in-kernel .to(tl.float32) leaves the loaded value as float32.
# NOTE(review): the narrowing to fp16 presumably happens implicitly at
# tl.store because of the destination pointer type; confirm.
# get_args() benchmarks it on (20, 240, 1, 1) contiguous tensors.
triton_poi_fused__to_copy_60 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor

from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[8192], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_60(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 4800
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)


def get_args():
    arg_0 = rand_strided((20, 240, 1, 1), (240, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((20, 240, 1, 1), (240, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_60.run(*args, 4800, grid=grid(4800), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_60.benchmark_all_configs(*args, 4800, grid=grid(4800))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/n3/cn3yrrvjz2rkdvv5cbgdyj5mk5wgjpljfxyga75evi5tqahwagdd.py
# Original ATen: aten._to_copy, aten.convolution
# aten._to_copy => convert_element_type_47
# aten.convolution => convolution_20
#
# Masked element-wise copy of 20 values from in_ptr0 (*fp32 in the signature)
# to out_ptr0 (*fp16 in the signature).
# Only the two pointers are listed in divisible_by_16; xnumel (20) is not.
# NOTE(review): judging by the ATen names this is presumably the fp16 cast of a
# 20-element convolution bias; confirm against the call site.
triton_poi_fused__to_copy_convolution_61 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor

from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[32], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_convolution_61(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 20
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)


def get_args():
    arg_0 = rand_strided((20,), (1,), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((20,), (1,), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_convolution_61.run(*args, 20, grid=grid(20), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_convolution_61.benchmark_all_configs(*args, 20, grid=grid(20))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/nz/cnz5cjjblhni4cibgggtjl3kn4ksihmqz6znyn5belgi4q43cncw.py
# Original ATen: aten._to_copy, aten.convolution, aten.silu
# aten._to_copy => convert_element_type_47
# aten.convolution => convolution_20
# aten.silu => convert_element_type_49, convert_element_type_50, mul_79, sigmoid_2
#
# Masked pointwise kernel over 2560 elements with two outputs. For element i:
#   s = in_out_ptr0[i] + in_ptr0[i % 20]
#   in_out_ptr0[i] = s                  (updated in place)
#   out_ptr0[i]    = s * sigmoid(s)
# get_args() benchmarks it with (128, 20, 1, 1) activations and a 20-element
# in_ptr0 vector.
# NOTE(review): in_ptr0 is presumably the convolution bias being added to the
# bias-free convolution output; confirm against the call site.
triton_poi_fused__to_copy_convolution_silu_62 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor

from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[4096], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_convolution_silu_62(in_out_ptr0, in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 2560
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x2 = xindex
    x0 = xindex % 20
    tmp0 = tl.load(in_out_ptr0 + (x2), xmask).to(tl.float32)
    tmp1 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
    tmp2 = tmp0 + tmp1
    tmp3 = tmp2.to(tl.float32)
    tmp4 = tl.sigmoid(tmp3)
    tmp5 = tmp3 * tmp4
    tmp6 = tmp5.to(tl.float32)
    tl.store(in_out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp2, xmask)
    tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp6, xmask)


def get_args():
    arg_0 = rand_strided((128, 20, 1, 1), (20, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((20,), (1,), device='cuda:0', dtype=torch.float16)
    arg_2 = rand_strided((128, 20, 1, 1), (20, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1, arg_2,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_convolution_silu_62.run(*args, 2560, grid=grid(2560), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_convolution_silu_62.benchmark_all_configs(*args, 2560, grid=grid(2560))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/bl/cblr4zz2jdn5lx7fyq6hto375tcressl73jxrxvozhqbubkoilua.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_52
#
# Masked element-wise copy of 4800 values from in_ptr0 (*fp32 in the
# signature) to out_ptr0 (*fp16 in the signature).
# get_args() benchmarks it on (240, 20, 1, 1) contiguous tensors.
# NOTE(review): presumably the fp16 cast of a 1x1 convolution weight; confirm
# against the call site.
triton_poi_fused__to_copy_63 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor

from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[8192], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_63(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 4800
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)


def get_args():
    arg_0 = rand_strided((240, 20, 1, 1), (20, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((240, 20, 1, 1), (20, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_63.run(*args, 4800, grid=grid(4800), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_63.benchmark_all_configs(*args, 4800, grid=grid(4800))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/gq/cgqatezdistnqv4yyabd7t76wkyduditccoaqjd3ef4npoxk6vrv.py
# Original ATen: aten._to_copy, aten.convolution
# aten._to_copy => convert_element_type_51
# aten.convolution => convolution_21
#
# Masked element-wise copy of 240 values from in_ptr0 (*fp32 in the signature)
# to out_ptr0 (*fp16 in the signature).
# NOTE(review): judging by the ATen names this is presumably the fp16 cast of a
# 240-element convolution bias; confirm against the call site.
triton_poi_fused__to_copy_convolution_64 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor

from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[256], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_convolution_64(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 240
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)


def get_args():
    arg_0 = rand_strided((240,), (1,), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((240,), (1,), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_convolution_64.run(*args, 240, grid=grid(240), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_convolution_64.benchmark_all_configs(*args, 240, grid=grid(240))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/qr/cqrlwq46vnra4itz3o4m3cvwrnyl7srsa7xlwywcqzni2rafclhf.py
# Original ATen: aten._to_copy, aten.convolution
# aten._to_copy => convert_element_type_51
# aten.convolution => convolution_21
#
# In-place pointwise add over 30720 elements:
#   in_out_ptr0[i] = in_out_ptr0[i] + in_ptr0[i % 240]
# get_args() benchmarks it with a (128, 240, 1, 1) fp16 buffer and a
# 240-element fp16 vector.
# xmask is computed but the loads and the store are issued with mask=None.
# NOTE(review): in_ptr0 is presumably the convolution bias; confirm against the
# call site.
triton_poi_fused__to_copy_convolution_65 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor

from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[32768], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_convolution_65(in_out_ptr0, in_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 30720
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x2 = xindex
    x0 = xindex % 240
    tmp0 = tl.load(in_out_ptr0 + (x2), None).to(tl.float32)
    tmp1 = tl.load(in_ptr0 + (x0), None).to(tl.float32)
    tmp2 = tmp0 + tmp1
    tl.store(in_out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp2, None)


def get_args():
    arg_0 = rand_strided((128, 240, 1, 1), (240, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((240,), (1,), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_convolution_65.run(*args, 30720, grid=grid(30720), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_convolution_65.benchmark_all_configs(*args, 30720, grid=grid(30720))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/ru/crubekjujrhilkb2spkvdoiiitif5a5cisikyju6vjbn7j3itjum.py
# Original ATen: aten.mul, aten.sigmoid, aten.silu
# aten.mul => mul_80
# aten.sigmoid => sigmoid_3
# aten.silu => convert_element_type_45, convert_element_type_46, mul_78, sigmoid_1
#
# Pointwise kernel over 24084480 elements. For element i:
#   a = in_ptr0[i]
#   g = in_ptr1[i // 784]          (one value shared by each run of 784)
#   out_ptr0[i] = (a * sigmoid(a)) * sigmoid(g)
# get_args() benchmarks it with (128, 240, 28, 28) fp16 activations and a
# (128, 240, 1, 1) fp16 gate tensor, i.e. one gate per 28 * 28 plane.
# xmask is computed but the loads and the store are issued with mask=None.
# NOTE(review): this looks like squeeze-and-excitation style channel gating of
# a SiLU activation; confirm against the model definition.
triton_poi_fused_mul_sigmoid_silu_66 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor

from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[33554432], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3), equal_to_1=())]})
@triton.jit
def triton_poi_fused_mul_sigmoid_silu_66(in_ptr0, in_ptr1, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 24084480
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x2 = xindex
    x1 = (xindex // 784)
    tmp0 = tl.load(in_ptr0 + (x2), None).to(tl.float32)
    tmp5 = tl.load(in_ptr1 + (x1), None).to(tl.float32)
    tmp1 = tmp0.to(tl.float32)
    tmp2 = tl.sigmoid(tmp1)
    tmp3 = tmp1 * tmp2
    tmp4 = tmp3.to(tl.float32)
    tmp6 = tl.sigmoid(tmp5)
    tmp7 = tmp4 * tmp6
    tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp7, None)


def get_args():
    arg_0 = rand_strided((128, 240, 28, 28), (188160, 784, 28, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 240, 1, 1), (240, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    arg_2 = rand_strided((128, 240, 28, 28), (188160, 784, 28, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1, arg_2,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused_mul_sigmoid_silu_66.run(*args, 24084480, grid=grid(24084480), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused_mul_sigmoid_silu_66.benchmark_all_configs(*args, 24084480, grid=grid(24084480))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/va/cvacncosevkh7a4e4mjytw5gvejhihlmvkwma42dwu2nfhypnbzr.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_53
#
# Masked element-wise copy of 13440 values from in_ptr0 (*fp32 in the
# signature) to out_ptr0 (*fp16 in the signature).
# get_args() benchmarks it on (56, 240, 1, 1) contiguous tensors.
# NOTE(review): presumably the fp16 cast of a 1x1 convolution weight; confirm
# against the call site.
triton_poi_fused__to_copy_67 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor

from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[16384], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_67(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 13440
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)


def get_args():
    arg_0 = rand_strided((56, 240, 1, 1), (240, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((56, 240, 1, 1), (240, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_67.run(*args, 13440, grid=grid(13440), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_67.benchmark_all_configs(*args, 13440, grid=grid(13440))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/hv/chvkvxc6xaoyzdfyhbbbolrujmg4wuzt7gkh6x4thkny6u2hr4gg.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => convert_element_type_54, var_mean_11
#
# Looped partial-sum reduction: xnumel = 728 outputs, each summing up to
# rnumel = 7720 elements. Output x3 is decomposed as x0 = x3 % 56 and
# x1 = x3 // 56; its r-th term has flat index f = r + 7720 * x1 and is read
# from in_ptr0[784 * x0 + 43904 * ((f // 784) % 128) + (f % 784)].
# Terms with f >= 100352 are masked out of the load and contribute 0, which
# covers the overhang of the last chunk (13 * 7720 = 100360 > 100352).
# The float32 partial sums are stored to out_ptr0[x3].
# With the benchmark shapes from get_args() the input is a (128, 56, 28, 28)
# fp16 tensor and the output a (1, 56, 1, 1, 13) float32 buffer, i.e. 13
# partial sums for each of the 56 channels.
# NOTE(review): presumably a later kernel combines the 13 partials into the
# per-channel mean/variance; confirm against the call site.
triton_red_fused__native_batch_norm_legit_functional_68 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor

from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@reduction(
    size_hints=[1024, 8192],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32', 3: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_68(in_ptr0, out_ptr0, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
    xnumel = 728
    rnumel = 7720
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rbase = tl.arange(0, RBLOCK)[None, :]
    x1 = (xindex // 56)
    x0 = xindex % 56
    _tmp6 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
    x3 = xindex
    for roffset in range(0, rnumel, RBLOCK):
        rindex = roffset + rbase
        rmask = rindex < rnumel
        r2 = rindex
        tmp0 = r2 + (7720*x1)
        tmp1 = 100352
        tmp2 = tmp0 < tmp1
        tmp3 = tl.load(in_ptr0 + ((784*x0) + (43904*(((r2 + (7720*x1)) // 784) % 128)) + ((r2 + (7720*x1)) % 784) + tl.zeros([XBLOCK, RBLOCK], tl.int32)), rmask & tmp2 & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp4 = tmp3.to(tl.float32)
        tmp5 = tl.where(tmp2, tmp4, 0)
        _tmp6 = tl.where(rmask & xmask, _tmp6 + tmp5, _tmp6)
    tmp6 = tl.sum(_tmp6, 1)[:, None]
    tl.store(out_ptr0 + x3, tmp6, xmask)


def get_args():
    arg_0 = rand_strided((128, 56, 28, 28), (43904, 784, 28, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((1, 56, 1, 1, 13), (728, 1, 728, 728, 56), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_red_fused__native_batch_norm_legit_functional_68.run(*args, 728, 7720, grid=grid(728), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_red_fused__native_batch_norm_legit_functional_68.benchmark_all_configs(*args, 728, 7720, grid=grid(728))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/ih/cihgiw3fxfncw6wmlie2fnq4egrnxl6aq2aj37mt4r5kpoemk2yw.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => add_59, convert_element_type_54, mul_82, mul_83, var_mean_11
#
# Persistent reduction over 56 rows of 13 elements (RBLOCK = 16, masked down to
# rnumel = 13).
# - tmp3 = sum over r1 of in_ptr0[x0 + 56*r1]; tmp5 = tmp3 / 100352.0 is stored
#   to in_out_ptr0 (which this kernel only writes, never loads).
# - tmp5 * 0.1 + in_ptr1[x0] * 0.9 is stored to out_ptr0.
# - get_args() order: in_out_ptr0 (1, 56, 1, 1), in_ptr0 (1, 56, 1, 1, 13),
#   in_ptr1 (56,), out_ptr0 (56,), all fp32.
# - The string also embeds a standalone do_bench harness; it reports bandwidth
#   with num_in_out_args=1.
# NOTE(review): the 0.1 / 0.9 blend looks like a momentum-0.1 running-mean
# update - confirm against the traced graph.
triton_per_fused__native_batch_norm_legit_functional_69 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import persistent_reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@persistent_reduction(
size_hints=[64, 16],
reduction_hint=ReductionHint.INNER,
filename=__file__,
meta={'signature': {0: '*fp32', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: 'i32', 5: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3), equal_to_1=())]}
)
@triton.jit
def triton_per_fused__native_batch_norm_legit_functional_69(in_out_ptr0, in_ptr0, in_ptr1, out_ptr0, xnumel, rnumel, XBLOCK : tl.constexpr):
xnumel = 56
rnumel = 13
RBLOCK: tl.constexpr = 16
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
xmask = xindex < xnumel
rindex = tl.arange(0, RBLOCK)[None, :]
rmask = rindex < rnumel
r1 = rindex
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0 + (56*r1)), rmask & xmask, other=0)
tmp8 = tl.load(in_ptr1 + (x0), xmask)
tmp2 = tl.where(rmask & xmask, tmp0, 0)
tmp3 = tl.sum(tmp2, 1)[:, None]
tmp4 = 100352.0
tmp5 = tmp3 / tmp4
tmp6 = 0.1
tmp7 = tmp5 * tmp6
tmp9 = 0.9
tmp10 = tmp8 * tmp9
tmp11 = tmp7 + tmp10
tl.store(in_out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp5, xmask)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp11, xmask)
def get_args():
arg_0 = rand_strided((1, 56, 1, 1), (56, 1, 56, 56), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((1, 56, 1, 1, 13), (728, 1, 728, 728, 56), device='cuda:0', dtype=torch.float32)
arg_2 = rand_strided((56,), (1,), device='cuda:0', dtype=torch.float32)
arg_3 = rand_strided((56,), (1,), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1, arg_2, arg_3,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_per_fused__native_batch_norm_legit_functional_69.run(*args, 56, 13, grid=grid(56), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_per_fused__native_batch_norm_legit_functional_69.benchmark_all_configs(*args, 56, 13, grid=grid(56))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/n3/cn3m3v5dwzlicmheht7t2sqzrzgtbufmioizlp6yemcka6kewuok.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => convert_element_type_54, var_mean_11
#
# Looped reduction that writes 728 fp32 partial sums of squared deviations.
# - xindex is split as x0 = xindex % 56 and x1 = xindex // 56 (728 = 56 * 13);
#   each x1 covers rnumel = 7720 values of the flat index r2 + 7720*x1.
# - Per element: (in_ptr0 value converted to fp32 - in_ptr1[x0]) squared, where
#   in_ptr0 is the fp16 (128, 56, 28, 28) tensor and in_ptr1 the fp32
#   (1, 56, 1, 1) tensor from get_args().
# - 13 * 7720 = 100360 exceeds 100352 = 128 * 784, so `tmp0 < 100352` masks the
#   overrun; tl.where(tmp2, tmp7, 0) keeps masked lanes out of the sum.
# - The per-row sum is stored at out_ptr0 + xindex.
# - The string also embeds a standalone do_bench harness (get_args / call /
#   benchmark_all_configs / __main__).
# NOTE(review): in_ptr1 is presumably the per-channel mean, making this the
# first stage of a split variance reduction - confirm against the call graph.
triton_red_fused__native_batch_norm_legit_functional_70 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@reduction(
size_hints=[1024, 8192],
reduction_hint=ReductionHint.INNER,
filename=__file__,
meta={'signature': {0: '*fp16', 1: '*fp32', 2: '*fp32', 3: 'i32', 4: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_70(in_ptr0, in_ptr1, out_ptr0, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
xnumel = 728
rnumel = 7720
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
xmask = xindex < xnumel
rbase = tl.arange(0, RBLOCK)[None, :]
x1 = (xindex // 56)
x0 = xindex % 56
_tmp9 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
x3 = xindex
for roffset in range(0, rnumel, RBLOCK):
rindex = roffset + rbase
rmask = rindex < rnumel
r2 = rindex
tmp0 = r2 + (7720*x1)
tmp1 = 100352
tmp2 = tmp0 < tmp1
tmp3 = tl.load(in_ptr0 + ((784*x0) + (43904*(((r2 + (7720*x1)) // 784) % 128)) + ((r2 + (7720*x1)) % 784) + tl.zeros([XBLOCK, RBLOCK], tl.int32)), rmask & tmp2 & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp4 = tmp3.to(tl.float32)
tmp5 = tl.load(in_ptr1 + (x0 + tl.zeros([XBLOCK, RBLOCK], tl.int32)), rmask & tmp2 & xmask, eviction_policy='evict_last', other=0)
tmp6 = tmp4 - tmp5
tmp7 = tmp6 * tmp6
tmp8 = tl.where(tmp2, tmp7, 0)
_tmp9 = tl.where(rmask & xmask, _tmp9 + tmp8, _tmp9)
tmp9 = tl.sum(_tmp9, 1)[:, None]
tl.store(out_ptr0 + x3, tmp9, xmask)
def get_args():
arg_0 = rand_strided((128, 56, 28, 28), (43904, 784, 28, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((1, 56, 1, 1), (56, 1, 56, 56), device='cuda:0', dtype=torch.float32)
arg_2 = rand_strided((1, 56, 1, 1, 13), (728, 1, 728, 728, 56), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1, arg_2,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_red_fused__native_batch_norm_legit_functional_70.run(*args, 728, 7720, grid=grid(728), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_red_fused__native_batch_norm_legit_functional_70.benchmark_all_configs(*args, 728, 7720, grid=grid(728))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/xv/cxvq5i34k2bihobv4ebll2eevqva3mot7bow45ecoiyqxocbuesz.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => add_58, add_60, convert_element_type_54, mul_84, mul_85, mul_86, rsqrt_11, squeeze_34, var_mean_11
#
# Persistent reduction over 56 rows of 13 elements (RBLOCK = 16, masked down to
# rnumel = 13), producing three outputs per row.
# - tmp3 = sum over r1 of in_ptr0[x0 + 56*r1]; the raw sum goes to out_ptr0.
# - tmp5 = tmp3 / 100352.0; rsqrt(tmp5 + 1e-05) goes to out_ptr1.
# - (tmp5 * 1.00000996502277) * 0.1 + in_ptr1[x0] * 0.9 goes to out_ptr2.
# - get_args() order: in_ptr0 (1, 56, 1, 1, 13), in_ptr1 (56,),
#   out_ptr0 (1, 56, 1, 1), out_ptr1 (56,), out_ptr2 (56,), all fp32.
# - The string also embeds a standalone do_bench harness.
# NOTE(review): 1.00000996502277 matches 100352 / 100351 to the printed
# precision, which looks like an unbiased-variance correction feeding a
# momentum-0.1 running-variance update - confirm against the traced graph.
triton_per_fused__native_batch_norm_legit_functional_71 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import persistent_reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@persistent_reduction(
size_hints=[64, 16],
reduction_hint=ReductionHint.INNER,
filename=__file__,
meta={'signature': {0: '*fp32', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: 'i32', 6: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4), equal_to_1=())]}
)
@triton.jit
def triton_per_fused__native_batch_norm_legit_functional_71(in_ptr0, in_ptr1, out_ptr0, out_ptr1, out_ptr2, xnumel, rnumel, XBLOCK : tl.constexpr):
xnumel = 56
rnumel = 13
RBLOCK: tl.constexpr = 16
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
xmask = xindex < xnumel
rindex = tl.arange(0, RBLOCK)[None, :]
rmask = rindex < rnumel
r1 = rindex
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0 + (56*r1)), rmask & xmask, other=0)
tmp13 = tl.load(in_ptr1 + (x0), xmask)
tmp2 = tl.where(rmask & xmask, tmp0, 0)
tmp3 = tl.sum(tmp2, 1)[:, None]
tmp4 = 100352.0
tmp5 = tmp3 / tmp4
tmp6 = 1e-05
tmp7 = tmp5 + tmp6
tmp8 = tl.math.rsqrt(tmp7)
tmp9 = 1.00000996502277
tmp10 = tmp5 * tmp9
tmp11 = 0.1
tmp12 = tmp10 * tmp11
tmp14 = 0.9
tmp15 = tmp13 * tmp14
tmp16 = tmp12 + tmp15
tl.store(out_ptr1 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp8, xmask)
tl.store(out_ptr2 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp16, xmask)
tl.store(out_ptr0 + x0, tmp3, xmask)
def get_args():
arg_0 = rand_strided((1, 56, 1, 1, 13), (728, 1, 728, 728, 56), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((56,), (1,), device='cuda:0', dtype=torch.float32)
arg_2 = rand_strided((1, 56, 1, 1), (56, 1, 56, 56), device='cuda:0', dtype=torch.float32)
arg_3 = rand_strided((56,), (1,), device='cuda:0', dtype=torch.float32)
arg_4 = rand_strided((56,), (1,), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1, arg_2, arg_3, arg_4,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_per_fused__native_batch_norm_legit_functional_71.run(*args, 56, 13, grid=grid(56), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_per_fused__native_batch_norm_legit_functional_71.benchmark_all_configs(*args, 56, 13, grid=grid(56))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/jw/cjwfycxx2mrgx2fsw6tfls7yyigmo7hmavgpkr2ob23aopbzsnjc.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => add_58, add_61, convert_element_type_54, convert_element_type_55, mul_81, mul_87, rsqrt_11, sub_11, var_mean_11
#
# Pointwise kernel over xnumel = 5619712 elements (128 * 56 * 784, matching the
# (128, 56, 28, 28) tensors built by get_args()).
# - x1 = (xindex // 784) % 56 indexes the four per-channel fp32 inputs.
# - Computes (in_ptr0[x3] - in_ptr1[x1]) * rsqrt(in_ptr2[x1] / 100352.0 + 1e-05)
#   * in_ptr3[x1] + in_ptr4[x1] and stores the result at out_ptr0 + x3.
# - Every load and the store pass None as the mask; xmask is computed but unused.
# - The stored value is fp32 while the signature declares out_ptr0 as *fp16; no
#   explicit down-cast appears in the kernel body.
# - The string also embeds a standalone do_bench harness.
# NOTE(review): in_ptr2 is divided by the element count before adding eps, so it
# presumably holds a sum of squared deviations rather than a variance - confirm.
triton_poi_fused__native_batch_norm_legit_functional_72 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[8388608], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp16', 6: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6), equal_to_1=())]})
@triton.jit
def triton_poi_fused__native_batch_norm_legit_functional_72(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 5619712
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x3 = xindex
x1 = (xindex // 784) % 56
tmp0 = tl.load(in_ptr0 + (x3), None).to(tl.float32)
tmp2 = tl.load(in_ptr1 + (x1), None)
tmp4 = tl.load(in_ptr2 + (x1), None)
tmp11 = tl.load(in_ptr3 + (x1), None)
tmp13 = tl.load(in_ptr4 + (x1), None)
tmp1 = tmp0.to(tl.float32)
tmp3 = tmp1 - tmp2
tmp5 = 100352.0
tmp6 = tmp4 / tmp5
tmp7 = 1e-05
tmp8 = tmp6 + tmp7
tmp9 = tl.math.rsqrt(tmp8)
tmp10 = tmp3 * tmp9
tmp12 = tmp10 * tmp11
tmp14 = tmp12 + tmp13
tmp15 = tmp14.to(tl.float32)
tl.store(out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp15, None)
def get_args():
arg_0 = rand_strided((128, 56, 28, 28), (43904, 784, 28, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((1, 56, 1, 1), (56, 1, 56, 56), device='cuda:0', dtype=torch.float32)
arg_2 = rand_strided((1, 56, 1, 1), (56, 1, 56, 56), device='cuda:0', dtype=torch.float32)
arg_3 = rand_strided((56,), (1,), device='cuda:0', dtype=torch.float32)
arg_4 = rand_strided((56,), (1,), device='cuda:0', dtype=torch.float32)
arg_5 = rand_strided((128, 56, 28, 28), (43904, 784, 28, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__native_batch_norm_legit_functional_72.run(*args, 5619712, grid=grid(5619712), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__native_batch_norm_legit_functional_72.benchmark_all_configs(*args, 5619712, grid=grid(5619712))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/7e/c7eyndtglwpd6eg2g74cubgx2m45rn7sikd7mb4t4ucitntos24j.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_56
#
# Masked pointwise copy of xnumel = 4704 elements (168 * 28, matching the
# (168, 28, 1, 1) tensors built by get_args()).
# - Loads in_ptr0[x0] (declared *fp32) and stores it at out_ptr0[x0]
#   (declared *fp16), both guarded by xmask = xindex < xnumel.
# - The only conversion written in the kernel is `.to(tl.float32)`; no explicit
#   fp16 cast appears before the store.
# - The string also embeds a standalone do_bench harness (get_args / call /
#   benchmark_all_configs / __main__).
triton_poi_fused__to_copy_73 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[8192], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_73(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 4704
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((168, 28, 1, 1), (28, 1, 1, 1), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((168, 28, 1, 1), (28, 1, 1, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_73.run(*args, 4704, grid=grid(4704), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_73.benchmark_all_configs(*args, 4704, grid=grid(4704))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/g7/cg7zj3gxcfnefxp4hvh24f2ahem4qdf4cwmaspxgokny23kr7rdc.py
# Original ATen: aten.cat
# aten.cat => cat_6
#
# Pointwise copy of xnumel = 16859136 fp16 elements (128 * 131712).
# - Reads in_ptr0 linearly (x2 = xindex).
# - Writes out_ptr0[x0 + 263424*x1] with x0 = xindex % 131712 and
#   x1 = xindex // 131712, so consecutive 131712-element source rows land at a
#   row pitch of 263424 in the destination (get_args() models this as a
#   (128, 168, 28, 28) tensor with batch stride 263424).
# - Load and store pass None as the mask; xmask is computed but unused.
# - The string also embeds a standalone do_bench harness.
# NOTE(review): the kernel adds no channel offset itself; which half of the
# wider buffer is filled presumably depends on the pointer the caller passes.
triton_poi_fused_cat_74 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[33554432], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_cat_74(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 16859136
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x2 = xindex
x0 = xindex % 131712
x1 = (xindex // 131712)
tmp0 = tl.load(in_ptr0 + (x2), None).to(tl.float32)
tl.store(out_ptr0 + (x0 + (263424*x1) + tl.zeros([XBLOCK], tl.int32)), tmp0, None)
def get_args():
arg_0 = rand_strided((128, 168, 28, 28), (131712, 784, 28, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 168, 28, 28), (263424, 784, 28, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_cat_74.run(*args, 16859136, grid=grid(16859136), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_cat_74.benchmark_all_configs(*args, 16859136, grid=grid(16859136))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/ea/cea2p3bterhsxcizlwjuqgqdcdagptpjutjfoisqqtcxx4bo46r2.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => add_63, add_64, add_65, convert_element_type_58, mul_89, mul_90, mul_91, mul_92, mul_93, rsqrt_12, squeeze_37, var_mean_12
#
# Looped reduction over 336 rows of rnumel = 100352 elements that makes two
# passes over the fp16 input in_ptr0, addressed as r1 + 784*x0 + 263424*r2 with
# r1 = rindex % 784 and r2 = rindex // 784.
# - Pass 1: tmp4 = row sum / 100352.0 is stored to in_out_ptr0 (written only,
#   never loaded); tmp4 * 0.1 + in_ptr1[x0] * 0.9 is stored to out_ptr0.
# - Pass 2: tmp15 = row sum of (value - tmp4) squared is stored to out_ptr1.
#   With tmp17 = tmp15 / 100352.0, rsqrt(tmp17 + 1e-05) is stored to out_ptr2
#   and (tmp17 * 1.00000996502277) * 0.1 + in_ptr2[x0] * 0.9 to out_ptr3.
# - Accumulation is in fp32; out-of-range lanes are excluded via rmask & xmask.
# - The string also embeds a standalone do_bench harness; it reports bandwidth
#   with num_in_out_args=1.
# NOTE(review): 1.00000996502277 matches 100352 / 100351 to the printed
# precision; together with the 0.1 / 0.9 blends this looks like mean, variance
# and running-stat updates fused into one kernel - confirm against the graph.
triton_red_fused__native_batch_norm_legit_functional_75 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@reduction(
size_hints=[512, 131072],
reduction_hint=ReductionHint.INNER,
filename=__file__,
meta={'signature': {0: '*fp32', 1: '*fp16', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: 'i32', 9: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_75(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, out_ptr0, out_ptr1, out_ptr2, out_ptr3, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
xnumel = 336
rnumel = 100352
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
xmask = xindex < xnumel
rbase = tl.arange(0, RBLOCK)[None, :]
x0 = xindex
_tmp2 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
for roffset in range(0, rnumel, RBLOCK):
rindex = roffset + rbase
rmask = rindex < rnumel
r1 = rindex % 784
r2 = (rindex // 784)
tmp0 = tl.load(in_ptr0 + (r1 + (784*x0) + (263424*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
_tmp2 = tl.where(rmask & xmask, _tmp2 + tmp1, _tmp2)
tmp2 = tl.sum(_tmp2, 1)[:, None]
tmp7 = tl.load(in_ptr1 + (x0), xmask)
tmp3 = 100352.0
tmp4 = tmp2 / tmp3
tmp5 = 0.1
tmp6 = tmp4 * tmp5
tmp8 = 0.9
tmp9 = tmp7 * tmp8
tmp10 = tmp6 + tmp9
tl.store(in_out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp4, xmask)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp10, xmask)
_tmp15 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
for roffset in range(0, rnumel, RBLOCK):
rindex = roffset + rbase
rmask = rindex < rnumel
r1 = rindex % 784
r2 = (rindex // 784)
tmp11 = tl.load(in_ptr0 + (r1 + (784*x0) + (263424*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp12 = tmp11.to(tl.float32)
tmp13 = tmp12 - tmp4
tmp14 = tmp13 * tmp13
_tmp15 = tl.where(rmask & xmask, _tmp15 + tmp14, _tmp15)
tmp15 = tl.sum(_tmp15, 1)[:, None]
tl.store(out_ptr1 + x0, tmp15, xmask)
tmp25 = tl.load(in_ptr2 + (x0), xmask)
tmp16 = 100352.0
tmp17 = tmp15 / tmp16
tmp18 = 1e-05
tmp19 = tmp17 + tmp18
tmp20 = tl.math.rsqrt(tmp19)
tmp21 = 1.00000996502277
tmp22 = tmp17 * tmp21
tmp23 = 0.1
tmp24 = tmp22 * tmp23
tmp26 = 0.9
tmp27 = tmp25 * tmp26
tmp28 = tmp24 + tmp27
tl.store(out_ptr2 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp20, xmask)
tl.store(out_ptr3 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp28, xmask)
def get_args():
arg_0 = rand_strided((1, 336, 1, 1), (336, 1, 336, 336), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((128, 336, 28, 28), (263424, 784, 28, 1), device='cuda:0', dtype=torch.float16)
arg_2 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32)
arg_3 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32)
arg_4 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32)
arg_5 = rand_strided((1, 336, 1, 1), (336, 1, 336, 336), device='cuda:0', dtype=torch.float32)
arg_6 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32)
arg_7 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_red_fused__native_batch_norm_legit_functional_75.run(*args, 336, 100352, grid=grid(336), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_red_fused__native_batch_norm_legit_functional_75.benchmark_all_configs(*args, 336, 100352, grid=grid(336))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/3o/c3odnwpkqtnjm6osw5x7eu5soiuoiv27n2m7cqobgwum2f4c5rgk.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.add, aten.clone, aten.fill, aten.mul, aten.sigmoid, aten.sub
# aten._native_batch_norm_legit_functional => add_63, add_66, convert_element_type_58, convert_element_type_59, mul_88, mul_94, rsqrt_12, sub_12, var_mean_12
# aten.add => add_374
# aten.clone => clone_3
# aten.fill => full_like_44
# aten.mul => mul_1067, mul_1068
# aten.sigmoid => sigmoid_108
# aten.sub => sub_297
#
# Pointwise kernel over xnumel = 33718272 elements (128 * 336 * 784, matching
# the (128, 336, 28, 28) tensors built by get_args()) with two outputs.
# - x1 = (xindex // 784) % 336 indexes the four per-channel fp32 inputs.
# - tmp15 = (in_ptr0[x3] - in_ptr1[x1]) * rsqrt(in_ptr2[x1] / 100352.0 + 1e-05)
#   * in_ptr3[x1] + in_ptr4[x1] is stored at out_ptr0 + x3.
# - With s = sigmoid(tmp15), s * (tmp15 * (1.0 - s) + 1.0) is stored at
#   out_ptr1 + x3.
# - Every load and both stores pass None as the mask; xmask is computed but
#   unused.
# - The string also embeds a standalone do_bench harness.
# NOTE(review): s * (1 + y * (1 - s)) is algebraically d/dy [y * sigmoid(y)], so
# out_ptr1 presumably caches a SiLU derivative for the backward pass - confirm.
triton_poi_fused__native_batch_norm_legit_functional_add_clone_fill_mul_sigmoid_sub_76 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[67108864], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp16', 6: '*fp16', 7: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7), equal_to_1=())]})
@triton.jit
def triton_poi_fused__native_batch_norm_legit_functional_add_clone_fill_mul_sigmoid_sub_76(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, out_ptr0, out_ptr1, xnumel, XBLOCK : tl.constexpr):
xnumel = 33718272
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x3 = xindex
x1 = (xindex // 784) % 336
tmp0 = tl.load(in_ptr0 + (x3), None).to(tl.float32)
tmp2 = tl.load(in_ptr1 + (x1), None)
tmp4 = tl.load(in_ptr2 + (x1), None)
tmp11 = tl.load(in_ptr3 + (x1), None)
tmp13 = tl.load(in_ptr4 + (x1), None)
tmp1 = tmp0.to(tl.float32)
tmp3 = tmp1 - tmp2
tmp5 = 100352.0
tmp6 = tmp4 / tmp5
tmp7 = 1e-05
tmp8 = tmp6 + tmp7
tmp9 = tl.math.rsqrt(tmp8)
tmp10 = tmp3 * tmp9
tmp12 = tmp10 * tmp11
tmp14 = tmp12 + tmp13
tmp15 = tmp14.to(tl.float32)
tmp16 = tl.sigmoid(tmp15)
tmp17 = 1.0
tmp18 = tmp17 - tmp16
tmp19 = tmp15 * tmp18
tmp20 = tmp19 + tmp17
tmp21 = tmp16 * tmp20
tl.store(out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp15, None)
tl.store(out_ptr1 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp21, None)
def get_args():
arg_0 = rand_strided((128, 336, 28, 28), (263424, 784, 28, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((1, 336, 1, 1), (336, 1, 336, 336), device='cuda:0', dtype=torch.float32)
arg_2 = rand_strided((1, 336, 1, 1), (336, 1, 336, 336), device='cuda:0', dtype=torch.float32)
arg_3 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32)
arg_4 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32)
arg_5 = rand_strided((128, 336, 28, 28), (263424, 784, 28, 1), device='cuda:0', dtype=torch.float16)
arg_6 = rand_strided((128, 336, 28, 28), (263424, 784, 28, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__native_batch_norm_legit_functional_add_clone_fill_mul_sigmoid_sub_76.run(*args, 33718272, grid=grid(33718272), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__native_batch_norm_legit_functional_add_clone_fill_mul_sigmoid_sub_76.benchmark_all_configs(*args, 33718272, grid=grid(33718272))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/nw/cnwbbjysyivhwwej3cvtzavdj6jvinxuti5ndhcg3qzuf77glcrt.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_62
#
# Masked pointwise copy of xnumel = 1512 elements (168 * 9, matching the
# (168, 1, 3, 3) tensors built by get_args()).
# - Loads in_ptr0[x0] (declared *fp32) and stores it at out_ptr0[x0]
#   (declared *fp16), both guarded by xmask = xindex < xnumel.
# - The only conversion written in the kernel is `.to(tl.float32)`; no explicit
#   fp16 cast appears before the store.
# - divisible_by_16 lists only the two pointers (0, 1), not the xnumel argument.
# - The string also embeds a standalone do_bench harness (get_args / call /
#   benchmark_all_configs / __main__).
triton_poi_fused__to_copy_77 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[2048], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_77(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 1512
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((168, 1, 3, 3), (9, 9, 3, 1), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((168, 1, 3, 3), (9, 9, 3, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_77.run(*args, 1512, grid=grid(1512), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_77.benchmark_all_configs(*args, 1512, grid=grid(1512))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/fz/cfzmnx3o626lf5s63xg32x6is4tmygw4nxdhdigpofuif52vxrzm.py
# Original ATen: aten.split_with_sizes
# aten.split_with_sizes => getitem_78
#
# Pointwise kernel over xnumel = 16859136 fp16 elements (128 * 131712).
# - Reads in_ptr0[x0 + 263424*x1] with x0 = xindex % 131712 and
#   x1 = xindex // 131712, i.e. the first 131712 elements of each
#   263424-element row (get_args() models the input as (128, 336, 28, 28) and
#   the output as (128, 168, 28, 28)).
# - Despite the split_with_sizes name, the kernel also applies
#   value * sigmoid(value) before storing contiguously at out_ptr0 + xindex.
# - Load and store pass None as the mask; xmask is computed but unused.
# - The string also embeds a standalone do_bench harness.
# NOTE(review): the kernel adds no channel offset itself; which slice of the
# wider buffer is read presumably depends on the pointer the caller passes.
triton_poi_fused_split_with_sizes_78 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[33554432], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_split_with_sizes_78(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 16859136
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex % 131712
x1 = (xindex // 131712)
x2 = xindex
tmp0 = tl.load(in_ptr0 + (x0 + (263424*x1)), None).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
tmp2 = tl.sigmoid(tmp1)
tmp3 = tmp1 * tmp2
tmp4 = tmp3.to(tl.float32)
tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp4, None)
def get_args():
arg_0 = rand_strided((128, 336, 28, 28), (263424, 784, 28, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 168, 28, 28), (131712, 784, 28, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_split_with_sizes_78.run(*args, 16859136, grid=grid(16859136), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_split_with_sizes_78.benchmark_all_configs(*args, 16859136, grid=grid(16859136))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/2r/c2rv7mzu7my2opxvcm4z3utkh7llovacnk6pwuzns4ehc3sgey2e.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_63
#
# Pointwise element-wise copy of 4200 (= 168 * 25) values from in_ptr0
# (declared '*fp32') to out_ptr0 (declared '*fp16').  Lanes with
# xindex >= 4200 are masked off by xmask on both the load and the store.
# Notes:
#   * `xnumel` is reassigned to a constant inside the kernel, so the value
#     passed at launch is not read by the kernel body.
#   * The in-kernel `.to(tl.float32)` is applied to an already-fp32 value; the
#     narrowing to fp16 is done by tl.store, which casts to the destination
#     pointer's element type.
# The embedded module also carries a standalone harness: get_args() builds
# random tensors, call() launches the kernel once on device 0,
# benchmark_all_configs() delegates to the heuristic wrapper, and the __main__
# block times call() with do_bench and prints ms / GB / GB/s.
triton_poi_fused__to_copy_79 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[8192], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_79(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 4200
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)


def get_args():
    arg_0 = rand_strided((168, 1, 5, 5), (25, 25, 5, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((168, 1, 5, 5), (25, 25, 5, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_79.run(*args, 4200, grid=grid(4200), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_79.benchmark_all_configs(*args, 4200, grid=grid(4200))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/pt/cptqmg7k6n2us7i5wywz2bwg6fjwqyaknishdkved5rkdpml5nw5.py
# Original ATen: aten.split_with_sizes
# aten.split_with_sizes => getitem_81
#
# Pointwise kernel over 16859136 (= 128 * 131712) output elements.  Flat output
# index i reads in_ptr0[131712 + (i % 131712) + 263424 * (i // 131712)], i.e.
# the second 131712 elements of every 263424-element row, computes
# v * sigmoid(v) in fp32 and writes the result to out_ptr0[i].  With the
# (128, 336, 28, 28) input and (128, 168, 28, 28) output built by get_args()
# that is channels 168..335.
# Notes:
#   * `xnumel` is reassigned to a constant inside the kernel, so the value
#     passed at launch is not read by the kernel body.
#   * `xmask` is computed, but every load/store passes mask=None.
#   * out_ptr0 is declared '*fp16' in the signature; tl.store casts the fp32
#     value to the destination pointer's element type.
# The embedded module also carries a standalone harness: get_args() builds
# random tensors, call() launches the kernel once on device 0,
# benchmark_all_configs() delegates to the heuristic wrapper, and the __main__
# block times call() with do_bench and prints ms / GB / GB/s.
triton_poi_fused_split_with_sizes_80 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[33554432], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_split_with_sizes_80(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 16859136
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex % 131712
    x1 = (xindex // 131712)
    x2 = xindex
    tmp0 = tl.load(in_ptr0 + (131712 + x0 + (263424*x1)), None).to(tl.float32)
    tmp1 = tmp0.to(tl.float32)
    tmp2 = tl.sigmoid(tmp1)
    tmp3 = tmp1 * tmp2
    tmp4 = tmp3.to(tl.float32)
    tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp4, None)


def get_args():
    arg_0 = rand_strided((128, 336, 28, 28), (263424, 784, 28, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 168, 28, 28), (131712, 784, 28, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused_split_with_sizes_80.run(*args, 16859136, grid=grid(16859136), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused_split_with_sizes_80.benchmark_all_configs(*args, 16859136, grid=grid(16859136))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/jj/cjj6wuzjfwt7xigucunb4hdetjt4mr4ldxzq5ab4kxddbetbjdu4.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.mean, aten.silu
# aten._native_batch_norm_legit_functional => add_68, add_71, convert_element_type_64, convert_element_type_65, mul_102, mul_96, rsqrt_13, sub_13, var_mean_13
# aten.mean => mean_1
# aten.silu => convert_element_type_66, convert_element_type_67, mul_103, sigmoid_5
#
# Persistent reduction over 43008 rows (xindex) of 784 elements each (rindex,
# padded to RBLOCK = 1024 and masked by rmask).  For row x and c = x % 336 the
# kernel computes in fp32:
#     y    = (in_ptr0[r + 784*x] - in_ptr1[c])
#            * rsqrt(in_ptr2[c] / 100352.0 + 1e-05) * in_ptr3[c] + in_ptr4[c]
#     mean = sum over r of (y * sigmoid(y)) / 784.0
# y is stored to out_ptr0[r + 784*x] and the per-row mean to out_ptr2[x].
# Notes:
#   * `xnumel` / `rnumel` are reassigned to constants inside the kernel, so the
#     values passed at launch are not read by the kernel body.
#   * Padded reduction lanes are loaded with other=0 and zeroed again with
#     tl.where(rmask, ..., 0) before tl.sum, so they do not contribute.
#   * `xmask` is computed, but the per-channel loads and the out_ptr2 store pass
#     mask=None; only rmask is applied to the row load and the out_ptr0 store.
#   * out_ptr0 / out_ptr2 are declared '*fp16' in the signature; tl.store casts
#     the fp32 values to the destination pointer's element type.
#   * NOTE(review): in_ptr1 / in_ptr2 look like the per-channel mean and summed
#     squared deviations over 100352 (= 128 * 784) samples, and in_ptr3 /
#     in_ptr4 the affine weight and bias -- inferred from the arithmetic only,
#     confirm against the producing kernel.
# The embedded module also carries a standalone harness: get_args() builds
# random tensors, call() launches the kernel once on device 0,
# benchmark_all_configs() delegates to the heuristic wrapper, and the __main__
# block times call() with do_bench and prints ms / GB / GB/s.
triton_per_fused__native_batch_norm_legit_functional_mean_silu_81 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import persistent_reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@persistent_reduction(
    size_hints=[65536, 1024],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*fp16', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp16', 6: '*fp16', 7: 'i32', 8: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8), equal_to_1=())]}
)
@triton.jit
def triton_per_fused__native_batch_norm_legit_functional_mean_silu_81(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, out_ptr0, out_ptr2, xnumel, rnumel, XBLOCK : tl.constexpr):
    xnumel = 43008
    rnumel = 784
    RBLOCK: tl.constexpr = 1024
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rindex = tl.arange(0, RBLOCK)[None, :]
    rmask = rindex < rnumel
    r2 = rindex
    x3 = xindex
    x0 = xindex % 336
    tmp0 = tl.load(in_ptr0 + (r2 + (784*x3)), rmask, other=0).to(tl.float32)
    tmp2 = tl.load(in_ptr1 + (x0), None)
    tmp4 = tl.load(in_ptr2 + (x0), None)
    tmp11 = tl.load(in_ptr3 + (x0), None)
    tmp13 = tl.load(in_ptr4 + (x0), None)
    tmp1 = tmp0.to(tl.float32)
    tmp3 = tmp1 - tmp2
    tmp5 = 100352.0
    tmp6 = tmp4 / tmp5
    tmp7 = 1e-05
    tmp8 = tmp6 + tmp7
    tmp9 = tl.math.rsqrt(tmp8)
    tmp10 = tmp3 * tmp9
    tmp12 = tmp10 * tmp11
    tmp14 = tmp12 + tmp13
    tmp15 = tmp14.to(tl.float32)
    tmp16 = tmp15.to(tl.float32)
    tmp17 = tl.sigmoid(tmp16)
    tmp18 = tmp16 * tmp17
    tmp19 = tmp18.to(tl.float32)
    tmp20 = tmp19.to(tl.float32)
    tmp22 = tl.where(rmask, tmp20, 0)
    tmp23 = tl.sum(tmp22, 1)[:, None]
    tmp24 = 784.0
    tmp25 = tmp23 / tmp24
    tmp26 = tmp25.to(tl.float32)
    tl.store(out_ptr0 + (r2 + (784*x3) + tl.zeros([XBLOCK, RBLOCK], tl.int32)), tmp15, rmask)
    tl.store(out_ptr2 + (x3 + tl.zeros([XBLOCK, 1], tl.int32)), tmp26, None)


def get_args():
    arg_0 = rand_strided((128, 336, 28, 28), (263424, 784, 28, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((1, 336, 1, 1), (336, 1, 336, 336), device='cuda:0', dtype=torch.float32)
    arg_2 = rand_strided((1, 336, 1, 1), (336, 1, 336, 336), device='cuda:0', dtype=torch.float32)
    arg_3 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32)
    arg_4 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32)
    arg_5 = rand_strided((128, 336, 28, 28), (263424, 784, 28, 1), device='cuda:0', dtype=torch.float16)
    arg_6 = rand_strided((128, 336, 1, 1), (336, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_per_fused__native_batch_norm_legit_functional_mean_silu_81.run(*args, 43008, 784, grid=grid(43008), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_per_fused__native_batch_norm_legit_functional_mean_silu_81.benchmark_all_configs(*args, 43008, 784, grid=grid(43008))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/qi/cqigsniw4k2557tsg6szy4sxc76yhclqkbczo3sje6z3gkbkxim2.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_69
#
# Pointwise element-wise copy of 9408 (= 28 * 336) values from in_ptr0
# (declared '*fp32') to out_ptr0 (declared '*fp16').  Lanes with
# xindex >= 9408 are masked off by xmask on both the load and the store.
# Notes:
#   * `xnumel` is reassigned to a constant inside the kernel, so the value
#     passed at launch is not read by the kernel body.
#   * The in-kernel `.to(tl.float32)` is applied to an already-fp32 value; the
#     narrowing to fp16 is done by tl.store, which casts to the destination
#     pointer's element type.
# The embedded module also carries a standalone harness: get_args() builds
# random tensors, call() launches the kernel once on device 0,
# benchmark_all_configs() delegates to the heuristic wrapper, and the __main__
# block times call() with do_bench and prints ms / GB / GB/s.
triton_poi_fused__to_copy_82 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[16384], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_82(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 9408
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)


def get_args():
    arg_0 = rand_strided((28, 336, 1, 1), (336, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((28, 336, 1, 1), (336, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_82.run(*args, 9408, grid=grid(9408), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_82.benchmark_all_configs(*args, 9408, grid=grid(9408))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/qr/cqrzuzcrjaui22653nognmjx7khvfhrb54plybvtrgjuggwhd7ca.py
# Original ATen: aten._to_copy, aten.convolution
# aten._to_copy => convert_element_type_68
# aten.convolution => convolution_27
#
# Pointwise element-wise copy of 28 values from in_ptr0 (declared '*fp32') to
# out_ptr0 (declared '*fp16').  Lanes with xindex >= 28 are masked off by
# xmask on both the load and the store.
# Notes:
#   * `xnumel` is reassigned to a constant inside the kernel, so the value
#     passed at launch is not read by the kernel body.
#   * The in-kernel `.to(tl.float32)` is applied to an already-fp32 value; the
#     narrowing to fp16 is done by tl.store, which casts to the destination
#     pointer's element type.
# The embedded module also carries a standalone harness: get_args() builds
# random tensors, call() launches the kernel once on device 0,
# benchmark_all_configs() delegates to the heuristic wrapper, and the __main__
# block times call() with do_bench and prints ms / GB / GB/s.
triton_poi_fused__to_copy_convolution_83 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[32], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_convolution_83(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 28
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)


def get_args():
    arg_0 = rand_strided((28,), (1,), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((28,), (1,), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_convolution_83.run(*args, 28, grid=grid(28), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_convolution_83.benchmark_all_configs(*args, 28, grid=grid(28))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/5j/c5jenet4bcjhmp6nmplvispvr7rwhrt4fonprb4atbj2oqpb6yhc.py
# Original ATen: aten._to_copy, aten.convolution, aten.silu
# aten._to_copy => convert_element_type_68
# aten.convolution => convolution_27
# aten.silu => convert_element_type_70, convert_element_type_71, mul_104, sigmoid_6
#
# Pointwise kernel over 3584 (= 128 * 28) elements, masked by xmask.  For flat
# index i it computes in fp32:
#     s = in_out_ptr0[i] + in_ptr0[i % 28]
# and then writes s back to in_out_ptr0[i] (the buffer is listed in
# mutated_arg_names) and s * sigmoid(s) to out_ptr0[i].
# Notes:
#   * `xnumel` is reassigned to a constant inside the kernel, so the value
#     passed at launch is not read by the kernel body.
#   * All three pointers are declared '*fp16'; tl.store casts the fp32 values
#     to the destination pointer's element type.
#   * NOTE(review): in_ptr0 looks like a per-channel convolution bias (see the
#     ATen op list above) -- confirm against the caller.
# The embedded module also carries a standalone harness: get_args() builds
# random tensors, call() launches the kernel once on device 0,
# benchmark_all_configs() delegates to the heuristic wrapper, and the __main__
# block times call() with do_bench (passing num_in_out_args=1 to
# get_num_bytes) and prints ms / GB / GB/s.
triton_poi_fused__to_copy_convolution_silu_84 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[4096], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_convolution_silu_84(in_out_ptr0, in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 3584
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x2 = xindex
    x0 = xindex % 28
    tmp0 = tl.load(in_out_ptr0 + (x2), xmask).to(tl.float32)
    tmp1 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
    tmp2 = tmp0 + tmp1
    tmp3 = tmp2.to(tl.float32)
    tmp4 = tl.sigmoid(tmp3)
    tmp5 = tmp3 * tmp4
    tmp6 = tmp5.to(tl.float32)
    tl.store(in_out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp2, xmask)
    tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp6, xmask)


def get_args():
    arg_0 = rand_strided((128, 28, 1, 1), (28, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((28,), (1,), device='cuda:0', dtype=torch.float16)
    arg_2 = rand_strided((128, 28, 1, 1), (28, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1, arg_2,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_convolution_silu_84.run(*args, 3584, grid=grid(3584), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_convolution_silu_84.benchmark_all_configs(*args, 3584, grid=grid(3584))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/6q/c6qndeyorjgk72ipb7s7xexoyu3w64lakoglh4m45doekpaj4mk5.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_73
#
# Pointwise element-wise copy of 9408 (= 336 * 28) values from in_ptr0
# (declared '*fp32') to out_ptr0 (declared '*fp16').  Lanes with
# xindex >= 9408 are masked off by xmask on both the load and the store.
# Notes:
#   * `xnumel` is reassigned to a constant inside the kernel, so the value
#     passed at launch is not read by the kernel body.
#   * The in-kernel `.to(tl.float32)` is applied to an already-fp32 value; the
#     narrowing to fp16 is done by tl.store, which casts to the destination
#     pointer's element type.
# The embedded module also carries a standalone harness: get_args() builds
# random tensors, call() launches the kernel once on device 0,
# benchmark_all_configs() delegates to the heuristic wrapper, and the __main__
# block times call() with do_bench and prints ms / GB / GB/s.
triton_poi_fused__to_copy_85 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[16384], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_85(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 9408
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)


def get_args():
    arg_0 = rand_strided((336, 28, 1, 1), (28, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((336, 28, 1, 1), (28, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_85.run(*args, 9408, grid=grid(9408), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_85.benchmark_all_configs(*args, 9408, grid=grid(9408))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/e4/ce4gppgjp566e4o7bpfsqwx6ehxfdqd5kcqjzpnjkl7lictvy34j.py
# Original ATen: aten._to_copy, aten.convolution
# aten._to_copy => convert_element_type_72
# aten.convolution => convolution_28
#
# Pointwise element-wise copy of 336 values from in_ptr0 (declared '*fp32') to
# out_ptr0 (declared '*fp16').  Lanes with xindex >= 336 are masked off by
# xmask on both the load and the store.
# Notes:
#   * `xnumel` is reassigned to a constant inside the kernel, so the value
#     passed at launch is not read by the kernel body.
#   * The in-kernel `.to(tl.float32)` is applied to an already-fp32 value; the
#     narrowing to fp16 is done by tl.store, which casts to the destination
#     pointer's element type.
# The embedded module also carries a standalone harness: get_args() builds
# random tensors, call() launches the kernel once on device 0,
# benchmark_all_configs() delegates to the heuristic wrapper, and the __main__
# block times call() with do_bench and prints ms / GB / GB/s.
triton_poi_fused__to_copy_convolution_86 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[512], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_convolution_86(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 336
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)


def get_args():
    arg_0 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_convolution_86.run(*args, 336, grid=grid(336), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_convolution_86.benchmark_all_configs(*args, 336, grid=grid(336))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/6n/c6nqtrrohcuy7rc3a6jzkibcrswmbwqy32tjb2t44xhm5lv3c7ca.py
# Original ATen: aten._to_copy, aten.convolution
# aten._to_copy => convert_element_type_72
# aten.convolution => convolution_28
#
# Pointwise in-place kernel over 43008 (= 128 * 336) elements.  For flat index
# i it computes in_out_ptr0[i] + in_ptr0[i % 336] in fp32 and writes the sum
# back to in_out_ptr0[i] (the buffer is listed in mutated_arg_names).
# Notes:
#   * `xnumel` is reassigned to a constant inside the kernel, so the value
#     passed at launch is not read by the kernel body.
#   * `xmask` is computed, but every load/store passes mask=None.
#   * Both pointers are declared '*fp16'; tl.store casts the fp32 sum to the
#     destination pointer's element type.
#   * NOTE(review): in_ptr0 looks like a per-channel convolution bias (see the
#     ATen op list above) -- confirm against the caller.
# The embedded module also carries a standalone harness: get_args() builds
# random tensors, call() launches the kernel once on device 0,
# benchmark_all_configs() delegates to the heuristic wrapper, and the __main__
# block times call() with do_bench (passing num_in_out_args=1 to
# get_num_bytes) and prints ms / GB / GB/s.
triton_poi_fused__to_copy_convolution_87 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[65536], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_convolution_87(in_out_ptr0, in_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 43008
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x2 = xindex
    x0 = xindex % 336
    tmp0 = tl.load(in_out_ptr0 + (x2), None).to(tl.float32)
    tmp1 = tl.load(in_ptr0 + (x0), None).to(tl.float32)
    tmp2 = tmp0 + tmp1
    tl.store(in_out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp2, None)


def get_args():
    arg_0 = rand_strided((128, 336, 1, 1), (336, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_convolution_87.run(*args, 43008, grid=grid(43008), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_convolution_87.benchmark_all_configs(*args, 43008, grid=grid(43008))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/tg/ctg6frnekhgugkbeugokpbetskj6nj7dyov4tmhfv6akngrmr3zz.py
# Original ATen: aten.mul, aten.sigmoid, aten.silu
# aten.mul => mul_105
# aten.sigmoid => sigmoid_7
# aten.silu => convert_element_type_66, convert_element_type_67, mul_103, sigmoid_5
#
# Pointwise kernel over 33718272 (= 128 * 336 * 784) elements.  For flat index
# i, with a = in_ptr0[i] and g = in_ptr1[i // 784], it computes in fp32
#     out_ptr0[i] = (a * sigmoid(a)) * sigmoid(g)
# so one value of in_ptr1 is shared by each run of 784 consecutive elements.
# Notes:
#   * `xnumel` is reassigned to a constant inside the kernel, so the value
#     passed at launch is not read by the kernel body.
#   * `xmask` is computed, but every load/store passes mask=None.
#   * out_ptr0 is declared '*fp16' in the signature; tl.store casts the fp32
#     product to the destination pointer's element type.
# The embedded module also carries a standalone harness: get_args() builds
# random tensors, call() launches the kernel once on device 0,
# benchmark_all_configs() delegates to the heuristic wrapper, and the __main__
# block times call() with do_bench and prints ms / GB / GB/s.
triton_poi_fused_mul_sigmoid_silu_88 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[67108864], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3), equal_to_1=())]})
@triton.jit
def triton_poi_fused_mul_sigmoid_silu_88(in_ptr0, in_ptr1, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 33718272
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x2 = xindex
    x1 = (xindex // 784)
    tmp0 = tl.load(in_ptr0 + (x2), None).to(tl.float32)
    tmp5 = tl.load(in_ptr1 + (x1), None).to(tl.float32)
    tmp1 = tmp0.to(tl.float32)
    tmp2 = tl.sigmoid(tmp1)
    tmp3 = tmp1 * tmp2
    tmp4 = tmp3.to(tl.float32)
    tmp6 = tl.sigmoid(tmp5)
    tmp7 = tmp4 * tmp6
    tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp7, None)


def get_args():
    arg_0 = rand_strided((128, 336, 28, 28), (263424, 784, 28, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 336, 1, 1), (336, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    arg_2 = rand_strided((128, 336, 28, 28), (263424, 784, 28, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1, arg_2,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused_mul_sigmoid_silu_88.run(*args, 33718272, grid=grid(33718272), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused_mul_sigmoid_silu_88.benchmark_all_configs(*args, 33718272, grid=grid(33718272))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/uj/cujd4wpttqmud7zdg64vlcs4ef2kzahlsvrue6dsbdgqyeobae26.py
# Original ATen: aten.split_with_sizes
# aten.split_with_sizes => getitem_84
#
# Pointwise gather-copy over 16859136 (= 128 * 131712) output elements.  Flat
# output index i reads in_ptr0[(i % 131712) + 263424 * (i // 131712)], i.e. the
# first 131712 elements of every 263424-element row, and writes the value
# unchanged to out_ptr0[i].  With the (128, 336, 28, 28) input and
# (128, 168, 28, 28) output built by get_args() that is channels 0..167.
# Notes:
#   * `xnumel` is reassigned to a constant inside the kernel, so the value
#     passed at launch is not read by the kernel body.
#   * `xmask` is computed, but the load and the store pass mask=None.
#   * The value is widened to fp32 after the load; out_ptr0 is declared
#     '*fp16', and tl.store casts back to the pointer's element type.
# The embedded module also carries a standalone harness: get_args() builds
# random tensors, call() launches the kernel once on device 0,
# benchmark_all_configs() delegates to the heuristic wrapper, and the __main__
# block times call() with do_bench and prints ms / GB / GB/s.
triton_poi_fused_split_with_sizes_89 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[33554432], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_split_with_sizes_89(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 16859136
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex % 131712
    x1 = (xindex // 131712)
    x2 = xindex
    tmp0 = tl.load(in_ptr0 + (x0 + (263424*x1)), None).to(tl.float32)
    tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp0, None)


def get_args():
    arg_0 = rand_strided((128, 336, 28, 28), (263424, 784, 28, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 168, 28, 28), (131712, 784, 28, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused_split_with_sizes_89.run(*args, 16859136, grid=grid(16859136), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused_split_with_sizes_89.benchmark_all_configs(*args, 16859136, grid=grid(16859136))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/f3/cf3fhfcwg6xxca6ynhdijefpawpp34qudotfwxc7l4ytcq3zjpuo.py
# Original ATen: aten.split_with_sizes
# aten.split_with_sizes => getitem_85
#
# Pointwise gather-copy over 16859136 (= 128 * 131712) output elements.  Flat
# output index i reads in_ptr0[131712 + (i % 131712) + 263424 * (i // 131712)],
# i.e. the second 131712 elements of every 263424-element row, and writes the
# value unchanged to out_ptr0[i].  With the (128, 336, 28, 28) input and
# (128, 168, 28, 28) output built by get_args() that is channels 168..335.
# Notes:
#   * `xnumel` is reassigned to a constant inside the kernel, so the value
#     passed at launch is not read by the kernel body.
#   * `xmask` is computed, but the load and the store pass mask=None.
#   * The value is widened to fp32 after the load; out_ptr0 is declared
#     '*fp16', and tl.store casts back to the pointer's element type.
# The embedded module also carries a standalone harness: get_args() builds
# random tensors, call() launches the kernel once on device 0,
# benchmark_all_configs() delegates to the heuristic wrapper, and the __main__
# block times call() with do_bench and prints ms / GB / GB/s.
triton_poi_fused_split_with_sizes_90 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[33554432], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_split_with_sizes_90(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 16859136
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex % 131712
    x1 = (xindex // 131712)
    x2 = xindex
    tmp0 = tl.load(in_ptr0 + (131712 + x0 + (263424*x1)), None).to(tl.float32)
    tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp0, None)


def get_args():
    arg_0 = rand_strided((128, 336, 28, 28), (263424, 784, 28, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 168, 28, 28), (131712, 784, 28, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused_split_with_sizes_90.run(*args, 16859136, grid=grid(16859136), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused_split_with_sizes_90.benchmark_all_configs(*args, 16859136, grid=grid(16859136))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/mu/cmu55cboklppuar2bfoeffop2b46e55c3whe4q3ppvxcrz2secws.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_74
#
# Pointwise element-wise copy of 4704 (= 28 * 168) values from in_ptr0
# (declared '*fp32') to out_ptr0 (declared '*fp16').  Lanes with
# xindex >= 4704 are masked off by xmask on both the load and the store.
# Notes:
#   * `xnumel` is reassigned to a constant inside the kernel, so the value
#     passed at launch is not read by the kernel body.
#   * The in-kernel `.to(tl.float32)` is applied to an already-fp32 value; the
#     narrowing to fp16 is done by tl.store, which casts to the destination
#     pointer's element type.
# The embedded module also carries a standalone harness: get_args() builds
# random tensors, call() launches the kernel once on device 0,
# benchmark_all_configs() delegates to the heuristic wrapper, and the __main__
# block times call() with do_bench and prints ms / GB / GB/s.
triton_poi_fused__to_copy_91 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[8192], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_91(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 4704
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)


def get_args():
    arg_0 = rand_strided((28, 168, 1, 1), (168, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((28, 168, 1, 1), (168, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_91.run(*args, 4704, grid=grid(4704), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_91.benchmark_all_configs(*args, 4704, grid=grid(4704))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/p7/cp7uk2n6iplc4okgdyrcwjh6klja4hwa7hip53lwkxv2iafhns22.py
# Original ATen: aten.cat
# aten.cat => cat_8
#
# Pointwise strided copy over 2809856 elements. The flat index is split into
# x0 = xindex % 21952 and x1 = xindex // 21952; the value is read from
# in_ptr0 at the flat index and written to out_ptr0 at x0 + 43904 * x1, i.e.
# the destination uses an outer stride twice the source's (21952 vs 43904).
# Presumably the destination is one half of the concatenated buffer -- TODO
# confirm against the caller. Loads and stores pass mask=None; xmask is
# computed but not used.
#
# The string is handed to async_compile.triton as Python source, so the
# indentation inside it is significant. It also carries a benchmark harness
# (get_args / call / benchmark_all_configs / __main__) that prints ms, GB and
# GB/s for random fp16 tensors of shape (128, 28, 28, 28).
triton_poi_fused_cat_92 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_cat_92(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 2809856
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x2 = xindex
    x0 = xindex % 21952
    x1 = (xindex // 21952)
    tmp0 = tl.load(in_ptr0 + (x2), None).to(tl.float32)
    tl.store(out_ptr0 + (x0 + (43904*x1) + tl.zeros([XBLOCK], tl.int32)), tmp0, None)
def get_args():
    arg_0 = rand_strided((128, 28, 28, 28), (21952, 784, 28, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 28, 28, 28), (43904, 784, 28, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused_cat_92.run(*args, 2809856, grid=grid(2809856), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused_cat_92.benchmark_all_configs(*args, 2809856, grid=grid(2809856))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/go/cgo47t53ri3q3acehvndov22nzujsisq6mopaip7tcxdzexe45pe.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.add
# aten._native_batch_norm_legit_functional => add_73, add_76, convert_element_type_76, convert_element_type_77, mul_106, mul_112, rsqrt_14, sub_14, var_mean_14
# aten.add => add_77
#
# Pointwise kernel over 5619712 elements. For each flat index x3 it derives
# x1 = (x3 // 784) % 56 and computes
#     (in_ptr0[x3] - in_ptr1[x1]) * rsqrt(in_ptr2[x1] / 100352.0 + 1e-05)
#         * in_ptr3[x1] + in_ptr4[x1] + in_ptr5[x3]
# storing the result to out_ptr0[x3]. in_ptr1 / in_ptr2 look like the
# per-channel mean and sum of squared deviations, and in_ptr3 / in_ptr4 the
# affine weight and bias -- TODO confirm against the caller. Loads and stores
# pass mask=None; xmask is computed but not used.
#
# The string is handed to async_compile.triton as Python source, so the
# indentation inside it is significant. It also carries a benchmark harness
# (get_args / call / benchmark_all_configs / __main__) that prints ms, GB and
# GB/s for random inputs.
triton_poi_fused__native_batch_norm_legit_functional_add_93 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[8388608], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp16', 6: '*fp16', 7: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7), equal_to_1=())]})
@triton.jit
def triton_poi_fused__native_batch_norm_legit_functional_add_93(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 5619712
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x3 = xindex
    x1 = (xindex // 784) % 56
    tmp0 = tl.load(in_ptr0 + (x3), None).to(tl.float32)
    tmp2 = tl.load(in_ptr1 + (x1), None)
    tmp4 = tl.load(in_ptr2 + (x1), None)
    tmp11 = tl.load(in_ptr3 + (x1), None)
    tmp13 = tl.load(in_ptr4 + (x1), None)
    tmp16 = tl.load(in_ptr5 + (x3), None).to(tl.float32)
    tmp1 = tmp0.to(tl.float32)
    tmp3 = tmp1 - tmp2
    tmp5 = 100352.0
    tmp6 = tmp4 / tmp5
    tmp7 = 1e-05
    tmp8 = tmp6 + tmp7
    tmp9 = tl.math.rsqrt(tmp8)
    tmp10 = tmp3 * tmp9
    tmp12 = tmp10 * tmp11
    tmp14 = tmp12 + tmp13
    tmp15 = tmp14.to(tl.float32)
    tmp17 = tmp15 + tmp16
    tl.store(out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp17, None)
def get_args():
    arg_0 = rand_strided((128, 56, 28, 28), (43904, 784, 28, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((1, 56, 1, 1), (56, 1, 56, 56), device='cuda:0', dtype=torch.float32)
    arg_2 = rand_strided((1, 56, 1, 1), (56, 1, 56, 56), device='cuda:0', dtype=torch.float32)
    arg_3 = rand_strided((56,), (1,), device='cuda:0', dtype=torch.float32)
    arg_4 = rand_strided((56,), (1,), device='cuda:0', dtype=torch.float32)
    arg_5 = rand_strided((128, 56, 28, 28), (43904, 784, 28, 1), device='cuda:0', dtype=torch.float16)
    arg_6 = rand_strided((128, 56, 28, 28), (43904, 784, 28, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__native_batch_norm_legit_functional_add_93.run(*args, 5619712, grid=grid(5619712), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__native_batch_norm_legit_functional_add_93.benchmark_all_configs(*args, 5619712, grid=grid(5619712))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/cm/ccmet4jdw3a22ehivtutado4ilowri7xe4wvz55tt3jrgkttrcxr.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_122
#
# Pointwise copy kernel over 18816 elements. Each program covers XBLOCK
# consecutive flat indices, loads in_ptr0 (declared '*fp32' in the signature)
# under xmask and stores the value to out_ptr0 (declared '*fp16') at the same
# index. The narrowing to fp16 presumably happens implicitly in tl.store --
# TODO confirm.
#
# The string is handed to async_compile.triton as Python source, so the
# indentation inside it is significant. It also carries a benchmark harness
# (get_args / call / benchmark_all_configs / __main__) that runs the kernel on
# random (336, 56, 1, 1) tensors and prints ms, GB and GB/s.
triton_poi_fused__to_copy_94 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[32768], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_94(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 18816
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
    arg_0 = rand_strided((336, 56, 1, 1), (56, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((336, 56, 1, 1), (56, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_94.run(*args, 18816, grid=grid(18816), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_94.benchmark_all_configs(*args, 18816, grid=grid(18816))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/2z/c2zspsupbpoyi6v25rww6qizjecactzymkuzhmgwbksaajrrvebj.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_127
#
# Pointwise copy kernel over 1008 elements. Each program covers XBLOCK
# consecutive flat indices, loads in_ptr0 (declared '*fp32' in the signature)
# under xmask and stores the value to out_ptr0 (declared '*fp16') at the same
# index. The narrowing to fp16 presumably happens implicitly in tl.store --
# TODO confirm.
#
# The string is handed to async_compile.triton as Python source, so the
# indentation inside it is significant. It also carries a benchmark harness
# (get_args / call / benchmark_all_configs / __main__) that runs the kernel on
# random (112, 1, 3, 3) tensors and prints ms, GB and GB/s.
triton_poi_fused__to_copy_95 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[1024], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_95(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 1008
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
    arg_0 = rand_strided((112, 1, 3, 3), (9, 9, 3, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((112, 1, 3, 3), (9, 9, 3, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_95.run(*args, 1008, grid=grid(1008), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_95.benchmark_all_configs(*args, 1008, grid=grid(1008))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/vt/cvthhjpoxq63zz4mggqx2vqnhtco6vgp67jmycwwem5t4nuguc3d.py
# Original ATen: aten.split_with_sizes
# aten.split_with_sizes => getitem_125
#
# Pointwise kernel over 11239424 elements. With x0 = xindex % 87808 and
# x1 = xindex // 87808 it reads in_ptr0[x0 + 263424 * x1], computes
# v * sigmoid(v) and writes the result contiguously to out_ptr0[xindex].
# Given the benchmark shapes below (source (128, 336, 28, 28), destination
# (128, 112, 28, 28), 87808 = 112 * 784), this reads the first 112 of the 336
# channels. Loads and stores pass mask=None; xmask is computed but not used.
#
# The string is handed to async_compile.triton as Python source, so the
# indentation inside it is significant. It also carries a benchmark harness
# (get_args / call / benchmark_all_configs / __main__) that prints ms, GB and
# GB/s for random fp16 inputs.
triton_poi_fused_split_with_sizes_96 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[16777216], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_split_with_sizes_96(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 11239424
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex % 87808
    x1 = (xindex // 87808)
    x2 = xindex
    tmp0 = tl.load(in_ptr0 + (x0 + (263424*x1)), None).to(tl.float32)
    tmp1 = tmp0.to(tl.float32)
    tmp2 = tl.sigmoid(tmp1)
    tmp3 = tmp1 * tmp2
    tmp4 = tmp3.to(tl.float32)
    tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp4, None)
def get_args():
    arg_0 = rand_strided((128, 336, 28, 28), (263424, 784, 28, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 112, 28, 28), (87808, 784, 28, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused_split_with_sizes_96.run(*args, 11239424, grid=grid(11239424), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused_split_with_sizes_96.benchmark_all_configs(*args, 11239424, grid=grid(11239424))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/67/c67f43otxwovlhoxcmgghbvuamebcise6w4hyt6kxev6rjigyz6t.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_128
#
# Pointwise copy kernel over 2800 elements. Each program covers XBLOCK
# consecutive flat indices, loads in_ptr0 (declared '*fp32' in the signature)
# under xmask and stores the value to out_ptr0 (declared '*fp16') at the same
# index. The narrowing to fp16 presumably happens implicitly in tl.store --
# TODO confirm.
#
# The string is handed to async_compile.triton as Python source, so the
# indentation inside it is significant. It also carries a benchmark harness
# (get_args / call / benchmark_all_configs / __main__) that runs the kernel on
# random (112, 1, 5, 5) tensors and prints ms, GB and GB/s.
triton_poi_fused__to_copy_97 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4096], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_97(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 2800
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
    arg_0 = rand_strided((112, 1, 5, 5), (25, 25, 5, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((112, 1, 5, 5), (25, 25, 5, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_97.run(*args, 2800, grid=grid(2800), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_97.benchmark_all_configs(*args, 2800, grid=grid(2800))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/mp/cmpup3qxitd5cnywi6sc5ctsik3jls3ax4u6bwwg7l22muv3kqac.py
# Original ATen: aten.split_with_sizes
# aten.split_with_sizes => getitem_129
#
# Pointwise kernel over 11239424 elements. With x0 = xindex % 87808 and
# x1 = xindex // 87808 it reads in_ptr0[87808 + x0 + 263424 * x1], computes
# v * sigmoid(v) and writes the result contiguously to out_ptr0[xindex].
# Given the benchmark shapes below (source (128, 336, 28, 28), destination
# (128, 112, 28, 28), 87808 = 112 * 784), the constant offset selects the
# second block of 112 channels out of 336. Loads and stores pass mask=None;
# xmask is computed but not used.
#
# The string is handed to async_compile.triton as Python source, so the
# indentation inside it is significant. It also carries a benchmark harness
# (get_args / call / benchmark_all_configs / __main__) that prints ms, GB and
# GB/s for random fp16 inputs.
triton_poi_fused_split_with_sizes_98 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[16777216], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_split_with_sizes_98(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 11239424
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex % 87808
    x1 = (xindex // 87808)
    x2 = xindex
    tmp0 = tl.load(in_ptr0 + (87808 + x0 + (263424*x1)), None).to(tl.float32)
    tmp1 = tmp0.to(tl.float32)
    tmp2 = tl.sigmoid(tmp1)
    tmp3 = tmp1 * tmp2
    tmp4 = tmp3.to(tl.float32)
    tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp4, None)
def get_args():
    arg_0 = rand_strided((128, 336, 28, 28), (263424, 784, 28, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 112, 28, 28), (87808, 784, 28, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused_split_with_sizes_98.run(*args, 11239424, grid=grid(11239424), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused_split_with_sizes_98.benchmark_all_configs(*args, 11239424, grid=grid(11239424))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/hj/chjntoqpyuayr43gwoduaxa3dcaerfocp4evoj27g5z63hwjt7yd.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_129
#
# Pointwise copy kernel over 5488 elements. Each program covers XBLOCK
# consecutive flat indices, loads in_ptr0 (declared '*fp32' in the signature)
# under xmask and stores the value to out_ptr0 (declared '*fp16') at the same
# index. The narrowing to fp16 presumably happens implicitly in tl.store --
# TODO confirm.
#
# The string is handed to async_compile.triton as Python source, so the
# indentation inside it is significant. It also carries a benchmark harness
# (get_args / call / benchmark_all_configs / __main__) that runs the kernel on
# random (112, 1, 7, 7) tensors and prints ms, GB and GB/s.
triton_poi_fused__to_copy_99 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[8192], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_99(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 5488
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
    arg_0 = rand_strided((112, 1, 7, 7), (49, 49, 7, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((112, 1, 7, 7), (49, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_99.run(*args, 5488, grid=grid(5488), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_99.benchmark_all_configs(*args, 5488, grid=grid(5488))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/5y/c5yntu5zez2ni5e2qpl6ynioug5zthjmijm3yvywigtt4blzf6l6.py
# Original ATen: aten.split_with_sizes
# aten.split_with_sizes => getitem_133
#
# Pointwise kernel over 11239424 elements. With x0 = xindex % 87808 and
# x1 = xindex // 87808 it reads in_ptr0[175616 + x0 + 263424 * x1], computes
# v * sigmoid(v) and writes the result contiguously to out_ptr0[xindex].
# Given the benchmark shapes below (source (128, 336, 28, 28), destination
# (128, 112, 28, 28), 175616 = 2 * 112 * 784), the constant offset selects the
# third block of 112 channels out of 336. Loads and stores pass mask=None;
# xmask is computed but not used.
#
# The string is handed to async_compile.triton as Python source, so the
# indentation inside it is significant. It also carries a benchmark harness
# (get_args / call / benchmark_all_configs / __main__) that prints ms, GB and
# GB/s for random fp16 inputs.
triton_poi_fused_split_with_sizes_100 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[16777216], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_split_with_sizes_100(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 11239424
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex % 87808
    x1 = (xindex // 87808)
    x2 = xindex
    tmp0 = tl.load(in_ptr0 + (175616 + x0 + (263424*x1)), None).to(tl.float32)
    tmp1 = tmp0.to(tl.float32)
    tmp2 = tl.sigmoid(tmp1)
    tmp3 = tmp1 * tmp2
    tmp4 = tmp3.to(tl.float32)
    tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp4, None)
def get_args():
    arg_0 = rand_strided((128, 336, 28, 28), (263424, 784, 28, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 112, 28, 28), (87808, 784, 28, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused_split_with_sizes_100.run(*args, 11239424, grid=grid(11239424), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused_split_with_sizes_100.benchmark_all_configs(*args, 11239424, grid=grid(11239424))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/hi/chiefsb3dple2qr2zd5ph5efd2rhpwu722ib7qhnwdnwgxusl6o7.py
# Original ATen: aten.cat
# aten.cat => cat_15
#
# Pointwise strided copy over 2809856 elements. The flat index is split into
# x0 = xindex % 21952 and x1 = xindex // 21952; the value is read from
# in_ptr0 at the flat index and written to out_ptr0 at x0 + 65856 * x1, i.e.
# the destination uses an outer stride three times the source's (21952 vs
# 65856). Presumably the destination is one third of the concatenated buffer
# -- TODO confirm against the caller. Loads and stores pass mask=None; xmask
# is computed but not used.
#
# The string is handed to async_compile.triton as Python source, so the
# indentation inside it is significant. It also carries a benchmark harness
# (get_args / call / benchmark_all_configs / __main__) that prints ms, GB and
# GB/s for random fp16 tensors of shape (128, 112, 14, 14).
triton_poi_fused_cat_101 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_cat_101(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 2809856
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x2 = xindex
    x0 = xindex % 21952
    x1 = (xindex // 21952)
    tmp0 = tl.load(in_ptr0 + (x2), None).to(tl.float32)
    tl.store(out_ptr0 + (x0 + (65856*x1) + tl.zeros([XBLOCK], tl.int32)), tmp0, None)
def get_args():
    arg_0 = rand_strided((128, 112, 14, 14), (21952, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 112, 14, 14), (65856, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused_cat_101.run(*args, 2809856, grid=grid(2809856), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused_cat_101.benchmark_all_configs(*args, 2809856, grid=grid(2809856))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/hj/chj56tyq633bknygfo23skutbtotcilbxvxls3o3esvqkeosnqzu.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => add_116, add_117, add_118, convert_element_type_130, mul_172, mul_173, mul_174, mul_175, mul_176, rsqrt_22, squeeze_67, var_mean_22
#
# Looped reduction kernel with xnumel = 336 rows and rnumel = 25088 reduced
# elements per row, read from in_ptr0 at r1 + 196 * x0 + 65856 * r2 where
# r1 = rindex % 196 and r2 = rindex // 196.
#   Pass 1 sums the row and divides by 25088.0 (tmp4); tmp4 is stored to
#   in_out_ptr0, and tmp4 * 0.1 + in_ptr1 * 0.9 is stored to out_ptr0.
#   Pass 2 sums (value - tmp4) ** 2 (tmp15, stored to out_ptr1), divides by
#   25088.0 (tmp17), stores rsqrt(tmp17 + 1e-05) to out_ptr2 and stores
#   tmp17 * 1.0000398612827361 * 0.1 + in_ptr2 * 0.9 to out_ptr3.
# The 0.1 / 0.9 blends look like running-stat momentum updates and
# 1.0000398612827361 like the n / (n - 1) correction for n = 25088 -- TODO
# confirm against the originating batch-norm call.
#
# The string is handed to async_compile.triton as Python source, so the
# indentation inside it is significant: in particular, the statements after
# each `for roffset` loop must sit at function level, not inside the loop.
# The module also carries a benchmark harness (get_args / call /
# benchmark_all_configs / __main__) that prints ms, GB and GB/s.
triton_red_fused__native_batch_norm_legit_functional_102 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@reduction(
    size_hints=[512, 32768],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*fp32', 1: '*fp16', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: 'i32', 9: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_102(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, out_ptr0, out_ptr1, out_ptr2, out_ptr3, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
    xnumel = 336
    rnumel = 25088
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rbase = tl.arange(0, RBLOCK)[None, :]
    x0 = xindex
    _tmp2 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
    for roffset in range(0, rnumel, RBLOCK):
        rindex = roffset + rbase
        rmask = rindex < rnumel
        r1 = rindex % 196
        r2 = (rindex // 196)
        tmp0 = tl.load(in_ptr0 + (r1 + (196*x0) + (65856*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp1 = tmp0.to(tl.float32)
        _tmp2 = tl.where(rmask & xmask, _tmp2 + tmp1, _tmp2)
    tmp2 = tl.sum(_tmp2, 1)[:, None]
    tmp7 = tl.load(in_ptr1 + (x0), xmask)
    tmp3 = 25088.0
    tmp4 = tmp2 / tmp3
    tmp5 = 0.1
    tmp6 = tmp4 * tmp5
    tmp8 = 0.9
    tmp9 = tmp7 * tmp8
    tmp10 = tmp6 + tmp9
    tl.store(in_out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp4, xmask)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp10, xmask)
    _tmp15 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
    for roffset in range(0, rnumel, RBLOCK):
        rindex = roffset + rbase
        rmask = rindex < rnumel
        r1 = rindex % 196
        r2 = (rindex // 196)
        tmp11 = tl.load(in_ptr0 + (r1 + (196*x0) + (65856*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp12 = tmp11.to(tl.float32)
        tmp13 = tmp12 - tmp4
        tmp14 = tmp13 * tmp13
        _tmp15 = tl.where(rmask & xmask, _tmp15 + tmp14, _tmp15)
    tmp15 = tl.sum(_tmp15, 1)[:, None]
    tl.store(out_ptr1 + x0, tmp15, xmask)
    tmp25 = tl.load(in_ptr2 + (x0), xmask)
    tmp16 = 25088.0
    tmp17 = tmp15 / tmp16
    tmp18 = 1e-05
    tmp19 = tmp17 + tmp18
    tmp20 = tl.math.rsqrt(tmp19)
    tmp21 = 1.0000398612827361
    tmp22 = tmp17 * tmp21
    tmp23 = 0.1
    tmp24 = tmp22 * tmp23
    tmp26 = 0.9
    tmp27 = tmp25 * tmp26
    tmp28 = tmp24 + tmp27
    tl.store(out_ptr2 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp20, xmask)
    tl.store(out_ptr3 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp28, xmask)
def get_args():
    arg_0 = rand_strided((1, 336, 1, 1), (336, 1, 336, 336), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((128, 336, 14, 14), (65856, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_2 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32)
    arg_3 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32)
    arg_4 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32)
    arg_5 = rand_strided((1, 336, 1, 1), (336, 1, 336, 336), device='cuda:0', dtype=torch.float32)
    arg_6 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32)
    arg_7 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_red_fused__native_batch_norm_legit_functional_102.run(*args, 336, 25088, grid=grid(336), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_red_fused__native_batch_norm_legit_functional_102.benchmark_all_configs(*args, 336, 25088, grid=grid(336))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/zj/czji7shyj4kk6bl2rcdf5euq7ku4vpm3paub7sdz5rtqhc3o2xxw.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.mean, aten.silu
# aten._native_batch_norm_legit_functional => add_116, add_119, convert_element_type_130, convert_element_type_131, mul_171, mul_177, rsqrt_22, sub_22, var_mean_22
# aten.mean => mean_4
# aten.silu => convert_element_type_132, convert_element_type_133, mul_178, sigmoid_17
triton_per_fused__native_batch_norm_legit_functional_mean_silu_103 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import persistent_reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@persistent_reduction(
size_hints=[65536, 256],
reduction_hint=ReductionHint.INNER,
filename=__file__,
meta={'signature': {0: '*fp16', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp16', 6: '*fp16', 7: 'i32', 8: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7), equal_to_1=())]}
)
@triton.jit
def triton_per_fused__native_batch_norm_legit_functional_mean_silu_103(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, out_ptr0, out_ptr2, xnumel, rnumel, XBLOCK : tl.constexpr):
xnumel = 43008
rnumel = 196
RBLOCK: tl.constexpr = 256
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
xmask = xindex < xnumel
rindex = tl.arange(0, RBLOCK)[None, :]
rmask = rindex < rnumel
r2 = rindex
x3 = xindex
x0 = xindex % 336
tmp0 = tl.load(in_ptr0 + (r2 + (196*x3)), rmask, other=0).to(tl.float32)
tmp2 = tl.load(in_ptr1 + (x0), None)
tmp4 = tl.load(in_ptr2 + (x0), None)
tmp11 = tl.load(in_ptr3 + (x0), None)
tmp13 = tl.load(in_ptr4 + (x0), None)
tmp1 = tmp0.to(tl.float32)
tmp3 = tmp1 - tmp2
tmp5 = 25088.0
tmp6 = tmp4 / tmp5
tmp7 = 1e-05
tmp8 = tmp6 + tmp7
tmp9 = tl.math.rsqrt(tmp8)
tmp10 = tmp3 * tmp9
tmp12 = tmp10 * tmp11
tmp14 = tmp12 + tmp13
tmp15 = tmp14.to(tl.float32)
tmp16 = tmp15.to(tl.float32)
tmp17 = tl.sigmoid(tmp16)
tmp18 = tmp16 * tmp17
tmp19 = tmp18.to(tl.float32)
tmp20 = tmp19.to(tl.float32)
tmp22 = tl.where(rmask, tmp20, 0)
tmp23 = tl.sum(tmp22, 1)[:, None]
tmp24 = 196.0
tmp25 = tmp23 / tmp24
tmp26 = tmp25.to(tl.float32)
tl.store(out_ptr0 + (r2 + (196*x3) + tl.zeros([XBLOCK, RBLOCK], tl.int32)), tmp15, rmask)
tl.store(out_ptr2 + (x3 + tl.zeros([XBLOCK, 1], tl.int32)), tmp26, None)
def get_args():
    """Return random CUDA tensors for the kernel's seven pointer arguments.

    The tuple order matches the kernel signature: in_ptr0..in_ptr4, then
    out_ptr0 and out_ptr2.
    """
    arg_0 = rand_strided((128, 336, 14, 14), (65856, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((1, 336, 1, 1), (336, 1, 336, 336), device='cuda:0', dtype=torch.float32)
    arg_2 = rand_strided((1, 336, 1, 1), (336, 1, 336, 336), device='cuda:0', dtype=torch.float32)
    arg_3 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32)
    arg_4 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32)
    arg_5 = rand_strided((128, 336, 14, 14), (65856, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_6 = rand_strided((128, 336, 1, 1), (336, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6,
def call(args):
    """Launch the kernel once on CUDA device 0 using its current raw stream.

    args is the tensor tuple produced by get_args(); xnumel (43008) and
    rnumel (196) are appended here.
    """
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_per_fused__native_batch_norm_legit_functional_mean_silu_103.run(*args, 43008, 196, grid=grid(43008), stream=stream0)
def benchmark_all_configs(args):
    """Return the kernel's benchmark_all_configs() result for the same launch arguments as call()."""
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_per_fused__native_batch_norm_legit_functional_mean_silu_103.benchmark_all_configs(*args, 43008, 196, grid=grid(43008))
if __name__ == '__main__':
    # Standalone benchmark: time call() with do_bench, then report the time,
    # the byte count from get_num_bytes (in GB) and the resulting GB/s.
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/vd/cvdkhtrpiolmsts2zro46mftpapkihzk36p6zhnrps3p4cq3krmr.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_135
triton_poi_fused__to_copy_104 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[8192], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_104(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    # Elementwise copy of 4704 (= 14 * 336) values from in_ptr0 to out_ptr0.
    # The signature declares in_ptr0 as *fp32 and out_ptr0 as *fp16, so the
    # narrowing is done by tl.store, which casts to the pointer element type.
    xnumel = 4704
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)

def get_args():
    """Return random CUDA tensors for (in_ptr0, out_ptr0)."""
    arg_0 = rand_strided((14, 336, 1, 1), (336, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((14, 336, 1, 1), (336, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,

def call(args):
    """Launch the kernel once on CUDA device 0 using its current raw stream."""
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_104.run(*args, 4704, grid=grid(4704), stream=stream0)

def benchmark_all_configs(args):
    """Return the kernel's benchmark_all_configs() result for the same launch arguments as call()."""
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_104.benchmark_all_configs(*args, 4704, grid=grid(4704))

if __name__ == '__main__':
    # Standalone benchmark: time call() and report time, GB moved and GB/s.
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/tr/ctr74i6h7fbsohivhhoxxpx2ftnqmpxexofhzvpjxs3qwsbp6lt2.py
# Original ATen: aten._to_copy, aten.convolution
# aten._to_copy => convert_element_type_134
# aten.convolution => convolution_51
triton_poi_fused__to_copy_convolution_105 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[16], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_convolution_105(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    # Elementwise copy of 14 values from in_ptr0 (*fp32 in the signature) to
    # out_ptr0 (*fp16 in the signature); tl.store casts to the pointer
    # element type.
    xnumel = 14
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)

def get_args():
    """Return random CUDA tensors for (in_ptr0, out_ptr0)."""
    arg_0 = rand_strided((14,), (1,), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((14,), (1,), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,

def call(args):
    """Launch the kernel once on CUDA device 0 using its current raw stream."""
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_convolution_105.run(*args, 14, grid=grid(14), stream=stream0)

def benchmark_all_configs(args):
    """Return the kernel's benchmark_all_configs() result for the same launch arguments as call()."""
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_convolution_105.benchmark_all_configs(*args, 14, grid=grid(14))

if __name__ == '__main__':
    # Standalone benchmark: time call() and report time, GB moved and GB/s.
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/ko/ckojvusnljgriux3ampytpmwhilvjdtesa7avk7kubgd6p562p34.py
# Original ATen: aten._to_copy, aten.convolution, aten.silu
# aten._to_copy => convert_element_type_134
# aten.convolution => convolution_51
# aten.silu => convert_element_type_136, convert_element_type_137, mul_179, sigmoid_18
triton_poi_fused__to_copy_convolution_silu_106 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[2048], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_convolution_silu_106(in_out_ptr0, in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    # Over 1792 (= 128 * 14) elements:
    #   s = in_out_ptr0[x] + in_ptr0[x % 14]
    # s is written back to in_out_ptr0 (in place) and s * sigmoid(s) is
    # written to out_ptr0.
    # NOTE(review): in_ptr0 looks like the convolution bias -- confirm.
    xnumel = 1792
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x2 = xindex
    x0 = xindex % 14
    tmp0 = tl.load(in_out_ptr0 + (x2), xmask).to(tl.float32)
    tmp1 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
    tmp2 = tmp0 + tmp1
    tmp3 = tmp2.to(tl.float32)
    tmp4 = tl.sigmoid(tmp3)
    tmp5 = tmp3 * tmp4
    tmp6 = tmp5.to(tl.float32)
    tl.store(in_out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp2, xmask)
    tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp6, xmask)

def get_args():
    """Return random CUDA tensors for (in_out_ptr0, in_ptr0, out_ptr0)."""
    arg_0 = rand_strided((128, 14, 1, 1), (14, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((14,), (1,), device='cuda:0', dtype=torch.float16)
    arg_2 = rand_strided((128, 14, 1, 1), (14, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1, arg_2,

def call(args):
    """Launch the kernel once on CUDA device 0 using its current raw stream."""
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_convolution_silu_106.run(*args, 1792, grid=grid(1792), stream=stream0)

def benchmark_all_configs(args):
    """Return the kernel's benchmark_all_configs() result for the same launch arguments as call()."""
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_convolution_silu_106.benchmark_all_configs(*args, 1792, grid=grid(1792))

if __name__ == '__main__':
    # Standalone benchmark: time call() and report time, GB moved and GB/s.
    # One argument is passed to get_num_bytes as an in/out argument.
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/wd/cwdr2zuaja73on5ahrizfe4fw5rmvg66qvnjezuugxyt6mvgktuo.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_139
triton_poi_fused__to_copy_107 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[8192], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_107(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    # Elementwise copy of 4704 (= 336 * 14) values from in_ptr0 (*fp32 in the
    # signature) to out_ptr0 (*fp16 in the signature); tl.store casts to the
    # pointer element type.
    xnumel = 4704
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)

def get_args():
    """Return random CUDA tensors for (in_ptr0, out_ptr0)."""
    arg_0 = rand_strided((336, 14, 1, 1), (14, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((336, 14, 1, 1), (14, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,

def call(args):
    """Launch the kernel once on CUDA device 0 using its current raw stream."""
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_107.run(*args, 4704, grid=grid(4704), stream=stream0)

def benchmark_all_configs(args):
    """Return the kernel's benchmark_all_configs() result for the same launch arguments as call()."""
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_107.benchmark_all_configs(*args, 4704, grid=grid(4704))

if __name__ == '__main__':
    # Standalone benchmark: time call() and report time, GB moved and GB/s.
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/i4/ci4cud6xszhpzrrtirjqfbkilqlzpxjq4f4rcbjmyxesmegdihcj.py
# Original ATen: aten.mul, aten.sigmoid, aten.silu
# aten.mul => mul_180
# aten.sigmoid => sigmoid_19
# aten.silu => convert_element_type_132, convert_element_type_133, mul_178, sigmoid_17
triton_poi_fused_mul_sigmoid_silu_108 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[16777216], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3), equal_to_1=())]})
@triton.jit
def triton_poi_fused_mul_sigmoid_silu_108(in_ptr0, in_ptr1, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    # Over 8429568 (= 128 * 336 * 196) elements:
    #   a = in_ptr0[x]
    #   out_ptr0[x] = (a * sigmoid(a)) * sigmoid(in_ptr1[x // 196])
    # i.e. one in_ptr1 value is shared by each run of 196 consecutive elements.
    xnumel = 8429568
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    # xmask is computed but not used; every load/store below passes mask None.
    xmask = xindex < xnumel
    x2 = xindex
    x1 = (xindex // 196)
    tmp0 = tl.load(in_ptr0 + (x2), None).to(tl.float32)
    tmp5 = tl.load(in_ptr1 + (x1), None).to(tl.float32)
    tmp1 = tmp0.to(tl.float32)
    tmp2 = tl.sigmoid(tmp1)
    tmp3 = tmp1 * tmp2
    tmp4 = tmp3.to(tl.float32)
    tmp6 = tl.sigmoid(tmp5)
    tmp7 = tmp4 * tmp6
    tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp7, None)

def get_args():
    """Return random CUDA tensors for (in_ptr0, in_ptr1, out_ptr0)."""
    arg_0 = rand_strided((128, 336, 14, 14), (65856, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 336, 1, 1), (336, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    arg_2 = rand_strided((128, 336, 14, 14), (65856, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1, arg_2,

def call(args):
    """Launch the kernel once on CUDA device 0 using its current raw stream."""
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused_mul_sigmoid_silu_108.run(*args, 8429568, grid=grid(8429568), stream=stream0)

def benchmark_all_configs(args):
    """Return the kernel's benchmark_all_configs() result for the same launch arguments as call()."""
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused_mul_sigmoid_silu_108.benchmark_all_configs(*args, 8429568, grid=grid(8429568))

if __name__ == '__main__':
    # Standalone benchmark: time call() and report time, GB moved and GB/s.
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/am/camchutakzc7ufjmewydlcbxzwspii4jktvvytxzvdtkijmunoki.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_140
triton_poi_fused__to_copy_109 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[65536], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_109(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    # Elementwise copy of 34944 (= 104 * 336) values from in_ptr0 (*fp32 in
    # the signature) to out_ptr0 (*fp16 in the signature); tl.store casts to
    # the pointer element type.
    xnumel = 34944
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)

def get_args():
    """Return random CUDA tensors for (in_ptr0, out_ptr0)."""
    arg_0 = rand_strided((104, 336, 1, 1), (336, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((104, 336, 1, 1), (336, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,

def call(args):
    """Launch the kernel once on CUDA device 0 using its current raw stream."""
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_109.run(*args, 34944, grid=grid(34944), stream=stream0)

def benchmark_all_configs(args):
    """Return the kernel's benchmark_all_configs() result for the same launch arguments as call()."""
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_109.benchmark_all_configs(*args, 34944, grid=grid(34944))

if __name__ == '__main__':
    # Standalone benchmark: time call() and report time, GB moved and GB/s.
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/yx/cyxdjlki3rk4s5ktbskifaxqivcqtfxhzou2fgeig544h2budbbn.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => convert_element_type_141, var_mean_23
triton_red_fused__native_batch_norm_legit_functional_110 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@reduction(
    size_hints=[512, 8192],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32', 3: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_110(in_ptr0, out_ptr0, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
    # Partial sums of in_ptr0: 416 outputs (x0 = x % 104, x1 = x // 104), each
    # the sum of 6272 (= 32 * 196) inputs. The load index walks 196 contiguous
    # elements, then jumps by 20384 (= 104 * 196); x1 selects one of four
    # slabs of 652288 (= 32 * 20384) elements.
    # The result for (x1, x0) is written to out_ptr0[x].
    xnumel = 416
    rnumel = 6272
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rbase = tl.arange(0, RBLOCK)[None, :]
    x0 = xindex % 104
    x1 = (xindex // 104)
    # Running per-lane accumulator; reduced across the r axis after the loop.
    _tmp2 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
    x3 = xindex
    for roffset in range(0, rnumel, RBLOCK):
        rindex = roffset + rbase
        rmask = rindex < rnumel
        r2 = rindex
        tmp0 = tl.load(in_ptr0 + ((196*x0) + (20384*(r2 // 196)) + (652288*x1) + (r2 % 196)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp1 = tmp0.to(tl.float32)
        # Only in-bounds lanes accumulate; masked lanes keep their old value.
        _tmp2 = tl.where(rmask & xmask, _tmp2 + tmp1, _tmp2)
    tmp2 = tl.sum(_tmp2, 1)[:, None]
    tl.store(out_ptr0 + x3, tmp2, xmask)

def get_args():
    """Return random CUDA tensors for (in_ptr0, out_ptr0)."""
    arg_0 = rand_strided((128, 104, 14, 14), (20384, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((1, 104, 1, 1, 4), (416, 1, 416, 416, 104), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1,

def call(args):
    """Launch the kernel once on CUDA device 0 using its current raw stream."""
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_red_fused__native_batch_norm_legit_functional_110.run(*args, 416, 6272, grid=grid(416), stream=stream0)

def benchmark_all_configs(args):
    """Return the kernel's benchmark_all_configs() result for the same launch arguments as call()."""
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_red_fused__native_batch_norm_legit_functional_110.benchmark_all_configs(*args, 416, 6272, grid=grid(416))

if __name__ == '__main__':
    # Standalone benchmark: time call() and report time, GB moved and GB/s.
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/ns/cnsyz4u2t4xdpizytxz7kuybbnglpm544k5kmwi5e6mw37eoeeyp.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => add_122, convert_element_type_141, mul_182, mul_183, var_mean_23
triton_per_fused__native_batch_norm_legit_functional_111 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import persistent_reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@persistent_reduction(
    size_hints=[128, 4],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*fp32', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: 'i32', 5: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3), equal_to_1=())]}
)
@triton.jit
def triton_per_fused__native_batch_norm_legit_functional_111(in_out_ptr0, in_ptr0, in_ptr1, out_ptr0, xnumel, rnumel, XBLOCK : tl.constexpr):
    # For each of 104 x positions, sums the 4 values in_ptr0[x + 104*r] and
    # divides by 25088 (= 128 * 196):
    #   m = sum / 25088                        -> in_out_ptr0[x]
    #   out_ptr0[x] = m * 0.1 + in_ptr1[x] * 0.9
    # in_out_ptr0 is only written in this body, never read.
    # NOTE(review): m looks like the batch mean and out_ptr0 like the
    # momentum-0.1 running-mean update -- confirm against the caller.
    xnumel = 104
    rnumel = 4
    RBLOCK: tl.constexpr = 4
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rindex = tl.arange(0, RBLOCK)[None, :]
    rmask = rindex < rnumel
    r1 = rindex
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0 + (104*r1)), rmask & xmask, other=0)
    tmp8 = tl.load(in_ptr1 + (x0), xmask)
    tmp2 = tl.where(rmask & xmask, tmp0, 0)
    tmp3 = tl.sum(tmp2, 1)[:, None]
    tmp4 = 25088.0
    tmp5 = tmp3 / tmp4
    tmp6 = 0.1
    tmp7 = tmp5 * tmp6
    tmp9 = 0.9
    tmp10 = tmp8 * tmp9
    tmp11 = tmp7 + tmp10
    tl.store(in_out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp5, xmask)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp11, xmask)

def get_args():
    """Return random CUDA tensors for (in_out_ptr0, in_ptr0, in_ptr1, out_ptr0)."""
    arg_0 = rand_strided((1, 104, 1, 1), (104, 1, 104, 104), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((1, 104, 1, 1, 4), (416, 1, 416, 416, 104), device='cuda:0', dtype=torch.float32)
    arg_2 = rand_strided((104,), (1,), device='cuda:0', dtype=torch.float32)
    arg_3 = rand_strided((104,), (1,), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1, arg_2, arg_3,

def call(args):
    """Launch the kernel once on CUDA device 0 using its current raw stream."""
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_per_fused__native_batch_norm_legit_functional_111.run(*args, 104, 4, grid=grid(104), stream=stream0)

def benchmark_all_configs(args):
    """Return the kernel's benchmark_all_configs() result for the same launch arguments as call()."""
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_per_fused__native_batch_norm_legit_functional_111.benchmark_all_configs(*args, 104, 4, grid=grid(104))

if __name__ == '__main__':
    # Standalone benchmark: time call() and report time, GB moved and GB/s.
    # One argument is passed to get_num_bytes as an in/out argument.
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/vx/cvx6izhz5yqxyjo6cfzkguw47brvifvvxn3jp6fmmqpamdr6rdo7.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => convert_element_type_141, var_mean_23
triton_red_fused__native_batch_norm_legit_functional_112 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@reduction(
    size_hints=[512, 8192],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*fp16', 1: '*fp32', 2: '*fp32', 3: 'i32', 4: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_112(in_ptr0, in_ptr1, out_ptr0, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
    # Partial sums of squared deviations: 416 outputs (x0 = x % 104,
    # x1 = x // 104), each the sum over 6272 (= 32 * 196) inputs of
    # (in_ptr0[...] - in_ptr1[x0]) ** 2, written to out_ptr0[x].
    # The load index walks 196 contiguous elements, then jumps by 20384
    # (= 104 * 196); x1 selects a slab of 652288 (= 32 * 20384) elements.
    # NOTE(review): in_ptr1 looks like the per-channel mean -- confirm.
    xnumel = 416
    rnumel = 6272
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rbase = tl.arange(0, RBLOCK)[None, :]
    x0 = xindex % 104
    x1 = (xindex // 104)
    tmp2 = tl.load(in_ptr1 + (x0), xmask)
    # Running per-lane accumulator; reduced across the r axis after the loop.
    _tmp5 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
    x3 = xindex
    for roffset in range(0, rnumel, RBLOCK):
        rindex = roffset + rbase
        rmask = rindex < rnumel
        r2 = rindex
        tmp0 = tl.load(in_ptr0 + ((196*x0) + (20384*(r2 // 196)) + (652288*x1) + (r2 % 196)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp1 = tmp0.to(tl.float32)
        tmp3 = tmp1 - tmp2
        tmp4 = tmp3 * tmp3
        # Only in-bounds lanes accumulate; masked lanes keep their old value.
        _tmp5 = tl.where(rmask & xmask, _tmp5 + tmp4, _tmp5)
    tmp5 = tl.sum(_tmp5, 1)[:, None]
    tl.store(out_ptr0 + x3, tmp5, xmask)

def get_args():
    """Return random CUDA tensors for (in_ptr0, in_ptr1, out_ptr0)."""
    arg_0 = rand_strided((128, 104, 14, 14), (20384, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((1, 104, 1, 1), (104, 1, 104, 104), device='cuda:0', dtype=torch.float32)
    arg_2 = rand_strided((1, 104, 1, 1, 4), (416, 1, 416, 416, 104), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1, arg_2,

def call(args):
    """Launch the kernel once on CUDA device 0 using its current raw stream."""
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_red_fused__native_batch_norm_legit_functional_112.run(*args, 416, 6272, grid=grid(416), stream=stream0)

def benchmark_all_configs(args):
    """Return the kernel's benchmark_all_configs() result for the same launch arguments as call()."""
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_red_fused__native_batch_norm_legit_functional_112.benchmark_all_configs(*args, 416, 6272, grid=grid(416))

if __name__ == '__main__':
    # Standalone benchmark: time call() and report time, GB moved and GB/s.
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/cy/ccy7dhumtl6nx7vjrgetby6qbjqdcsh4fen2e264olgvvilit6pl.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => add_121, add_123, convert_element_type_141, mul_184, mul_185, mul_186, rsqrt_23, squeeze_70, var_mean_23
triton_per_fused__native_batch_norm_legit_functional_113 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import persistent_reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@persistent_reduction(
    size_hints=[128, 4],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*fp32', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: 'i32', 6: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4), equal_to_1=())]}
)
@triton.jit
def triton_per_fused__native_batch_norm_legit_functional_113(in_ptr0, in_ptr1, out_ptr0, out_ptr1, out_ptr2, xnumel, rnumel, XBLOCK : tl.constexpr):
    # For each of 104 x positions, sums the 4 values in_ptr0[x + 104*r]:
    #   s = sum                                   -> out_ptr0[x]
    #   v = s / 25088
    #   out_ptr1[x] = rsqrt(v + 1e-05)
    #   out_ptr2[x] = v * 1.0000398612827361 * 0.1 + in_ptr1[x] * 0.9
    # NOTE(review): 1.0000398612827361 appears to be 25088 / 25087 (an
    # unbiased-variance correction) and out_ptr2 the momentum-0.1
    # running-variance update -- confirm against the caller.
    xnumel = 104
    rnumel = 4
    RBLOCK: tl.constexpr = 4
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rindex = tl.arange(0, RBLOCK)[None, :]
    rmask = rindex < rnumel
    r1 = rindex
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0 + (104*r1)), rmask & xmask, other=0)
    tmp13 = tl.load(in_ptr1 + (x0), xmask)
    tmp2 = tl.where(rmask & xmask, tmp0, 0)
    tmp3 = tl.sum(tmp2, 1)[:, None]
    tmp4 = 25088.0
    tmp5 = tmp3 / tmp4
    tmp6 = 1e-05
    tmp7 = tmp5 + tmp6
    tmp8 = tl.math.rsqrt(tmp7)
    tmp9 = 1.0000398612827361
    tmp10 = tmp5 * tmp9
    tmp11 = 0.1
    tmp12 = tmp10 * tmp11
    tmp14 = 0.9
    tmp15 = tmp13 * tmp14
    tmp16 = tmp12 + tmp15
    tl.store(out_ptr1 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp8, xmask)
    tl.store(out_ptr2 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp16, xmask)
    # out_ptr0 receives the raw sum (tmp3), not the value divided by 25088.
    tl.store(out_ptr0 + x0, tmp3, xmask)

def get_args():
    """Return random CUDA tensors for (in_ptr0, in_ptr1, out_ptr0, out_ptr1, out_ptr2)."""
    arg_0 = rand_strided((1, 104, 1, 1, 4), (416, 1, 416, 416, 104), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((104,), (1,), device='cuda:0', dtype=torch.float32)
    arg_2 = rand_strided((1, 104, 1, 1), (104, 1, 104, 104), device='cuda:0', dtype=torch.float32)
    arg_3 = rand_strided((104,), (1,), device='cuda:0', dtype=torch.float32)
    arg_4 = rand_strided((104,), (1,), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1, arg_2, arg_3, arg_4,

def call(args):
    """Launch the kernel once on CUDA device 0 using its current raw stream."""
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_per_fused__native_batch_norm_legit_functional_113.run(*args, 104, 4, grid=grid(104), stream=stream0)

def benchmark_all_configs(args):
    """Return the kernel's benchmark_all_configs() result for the same launch arguments as call()."""
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_per_fused__native_batch_norm_legit_functional_113.benchmark_all_configs(*args, 104, 4, grid=grid(104))

if __name__ == '__main__':
    # Standalone benchmark: time call() and report time, GB moved and GB/s.
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/xg/cxgqusw3xzpsm5ih6qyxh7qdzmvcnjj2evihvlzncrorcyi7altv.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => add_121, add_124, convert_element_type_141, convert_element_type_142, mul_181, mul_187, rsqrt_23, sub_23, var_mean_23
triton_poi_fused__native_batch_norm_legit_functional_114 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp16', 6: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6), equal_to_1=())]})
@triton.jit
def triton_poi_fused__native_batch_norm_legit_functional_114(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    # Over 2609152 (= 128 * 104 * 196) elements, with c = (x // 196) % 104:
    #   out_ptr0[x] = (in_ptr0[x] - in_ptr1[c])
    #                   * rsqrt(in_ptr2[c] / 25088 + 1e-05)
    #                   * in_ptr3[c] + in_ptr4[c]
    # NOTE(review): in_ptr1 / in_ptr2 look like the batch-norm mean and sum of
    # squared deviations, in_ptr3 / in_ptr4 like the affine weight and bias --
    # confirm against the caller.
    xnumel = 2609152
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    # xmask is computed but not used; every load/store below passes mask None.
    xmask = xindex < xnumel
    x3 = xindex
    x1 = (xindex // 196) % 104
    tmp0 = tl.load(in_ptr0 + (x3), None).to(tl.float32)
    tmp2 = tl.load(in_ptr1 + (x1), None)
    tmp4 = tl.load(in_ptr2 + (x1), None)
    tmp11 = tl.load(in_ptr3 + (x1), None)
    tmp13 = tl.load(in_ptr4 + (x1), None)
    tmp1 = tmp0.to(tl.float32)
    tmp3 = tmp1 - tmp2
    tmp5 = 25088.0
    tmp6 = tmp4 / tmp5
    tmp7 = 1e-05
    tmp8 = tmp6 + tmp7
    tmp9 = tl.math.rsqrt(tmp8)
    tmp10 = tmp3 * tmp9
    tmp12 = tmp10 * tmp11
    tmp14 = tmp12 + tmp13
    tmp15 = tmp14.to(tl.float32)
    tl.store(out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp15, None)

def get_args():
    """Return random CUDA tensors for (in_ptr0..in_ptr4, out_ptr0)."""
    arg_0 = rand_strided((128, 104, 14, 14), (20384, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((1, 104, 1, 1), (104, 1, 104, 104), device='cuda:0', dtype=torch.float32)
    arg_2 = rand_strided((1, 104, 1, 1), (104, 1, 104, 104), device='cuda:0', dtype=torch.float32)
    arg_3 = rand_strided((104,), (1,), device='cuda:0', dtype=torch.float32)
    arg_4 = rand_strided((104,), (1,), device='cuda:0', dtype=torch.float32)
    arg_5 = rand_strided((128, 104, 14, 14), (20384, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5,

def call(args):
    """Launch the kernel once on CUDA device 0 using its current raw stream."""
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__native_batch_norm_legit_functional_114.run(*args, 2609152, grid=grid(2609152), stream=stream0)

def benchmark_all_configs(args):
    """Return the kernel's benchmark_all_configs() result for the same launch arguments as call()."""
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__native_batch_norm_legit_functional_114.benchmark_all_configs(*args, 2609152, grid=grid(2609152))

if __name__ == '__main__':
    # Standalone benchmark: time call() and report time, GB moved and GB/s.
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/hr/chrygikuwep25dgozsipotfnih6lajyye2txuo4r7put7ywkdk3t.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_143
triton_poi_fused__to_copy_115 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[16384], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_115(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    # Elementwise copy of 16224 (= 312 * 52) values from in_ptr0 (*fp32 in the
    # signature) to out_ptr0 (*fp16 in the signature); tl.store casts to the
    # pointer element type.
    xnumel = 16224
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)

def get_args():
    """Return random CUDA tensors for (in_ptr0, out_ptr0)."""
    arg_0 = rand_strided((312, 52, 1, 1), (52, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((312, 52, 1, 1), (52, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,

def call(args):
    """Launch the kernel once on CUDA device 0 using its current raw stream."""
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_115.run(*args, 16224, grid=grid(16224), stream=stream0)

def benchmark_all_configs(args):
    """Return the kernel's benchmark_all_configs() result for the same launch arguments as call()."""
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_115.benchmark_all_configs(*args, 16224, grid=grid(16224))

if __name__ == '__main__':
    # Standalone benchmark: time call() and report time, GB moved and GB/s.
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/ca/ccaahgmwlqiraz4ivxpktecmbqhvntosz4cdfe5ghfjm4pe6pws6.py
# Original ATen: aten.cat
# aten.cat => cat_16
# Pointwise strided copy of 7827456 fp16 elements: reads in_ptr0 contiguously
# (index x2) and writes out_ptr0 at x0 + 122304*x1, where x0 = xindex % 61152
# and x1 = xindex // 61152, i.e. each 61152-element input row lands at the
# start of a 122304-element output row.
# Loads and stores pass None as the mask; xmask is computed but not used.
# NOTE(review): the output is presumably a view into a wider concatenation
# buffer (get_args gives it batch stride 122304 for a 312-channel shape) --
# confirm against the caller.
# The get_args/call/benchmark_all_configs/__main__ tail of the string is a
# benchmark harness that times call() with do_bench and prints ms, GB, GB/s.
triton_poi_fused_cat_116 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[8388608], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_cat_116(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 7827456
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x2 = xindex
x0 = xindex % 61152
x1 = (xindex // 61152)
tmp0 = tl.load(in_ptr0 + (x2), None).to(tl.float32)
tl.store(out_ptr0 + (x0 + (122304*x1) + tl.zeros([XBLOCK], tl.int32)), tmp0, None)
def get_args():
arg_0 = rand_strided((128, 312, 14, 14), (61152, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 312, 14, 14), (122304, 196, 14, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_cat_116.run(*args, 7827456, grid=grid(7827456), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_cat_116.benchmark_all_configs(*args, 7827456, grid=grid(7827456))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/j3/cj3tf4apq2n25yzonidfbwyd3cn3jiyqodrw5hjo73xovhn5czvu.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => add_126, add_127, add_128, convert_element_type_145, mul_189, mul_190, mul_191, mul_192, mul_193, rsqrt_24, squeeze_73, var_mean_24
# Two-pass looped reduction: for each of xnumel = 624 rows (x0) it reduces
# rnumel = 25088 fp16 values of in_ptr0 addressed as
# r1 + 196*x0 + 122304*r2, with r1 = rindex % 196 and r2 = rindex // 196.
#   Pass 1: accumulates the sum in fp32 and divides by 25088.0 (tmp4).
#           tmp4 is stored to in_out_ptr0; 0.1*tmp4 + 0.9*in_ptr1[x0] is
#           stored to out_ptr0.
#   Pass 2: re-reads the same values and accumulates (value - tmp4)**2.
#           The raw sum (tmp15) is stored to out_ptr1;
#           rsqrt(tmp15/25088.0 + 1e-05) is stored to out_ptr2; and
#           0.1*(tmp15/25088.0 * 1.0000398612827361) + 0.9*in_ptr2[x0] is
#           stored to out_ptr3.
# in_out_ptr0 is listed in mutated_arg_names but is only written, never read,
# inside the kernel.
# NOTE(review): in_ptr1/in_ptr2 are presumably the running mean/variance and
# 0.1 the momentum; 1.0000398612827361 appears to equal 25088/25087 (unbiased
# variance correction) -- confirm against the batch-norm call site.
# The get_args/call/benchmark_all_configs/__main__ tail of the string is a
# benchmark harness that times call() with do_bench and prints ms, GB, GB/s.
triton_red_fused__native_batch_norm_legit_functional_117 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@reduction(
size_hints=[1024, 32768],
reduction_hint=ReductionHint.INNER,
filename=__file__,
meta={'signature': {0: '*fp32', 1: '*fp16', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: 'i32', 9: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_117(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, out_ptr0, out_ptr1, out_ptr2, out_ptr3, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
xnumel = 624
rnumel = 25088
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
xmask = xindex < xnumel
rbase = tl.arange(0, RBLOCK)[None, :]
x0 = xindex
_tmp2 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
for roffset in range(0, rnumel, RBLOCK):
rindex = roffset + rbase
rmask = rindex < rnumel
r1 = rindex % 196
r2 = (rindex // 196)
tmp0 = tl.load(in_ptr0 + (r1 + (196*x0) + (122304*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
_tmp2 = tl.where(rmask & xmask, _tmp2 + tmp1, _tmp2)
tmp2 = tl.sum(_tmp2, 1)[:, None]
tmp7 = tl.load(in_ptr1 + (x0), xmask)
tmp3 = 25088.0
tmp4 = tmp2 / tmp3
tmp5 = 0.1
tmp6 = tmp4 * tmp5
tmp8 = 0.9
tmp9 = tmp7 * tmp8
tmp10 = tmp6 + tmp9
tl.store(in_out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp4, xmask)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp10, xmask)
_tmp15 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
for roffset in range(0, rnumel, RBLOCK):
rindex = roffset + rbase
rmask = rindex < rnumel
r1 = rindex % 196
r2 = (rindex // 196)
tmp11 = tl.load(in_ptr0 + (r1 + (196*x0) + (122304*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp12 = tmp11.to(tl.float32)
tmp13 = tmp12 - tmp4
tmp14 = tmp13 * tmp13
_tmp15 = tl.where(rmask & xmask, _tmp15 + tmp14, _tmp15)
tmp15 = tl.sum(_tmp15, 1)[:, None]
tl.store(out_ptr1 + x0, tmp15, xmask)
tmp25 = tl.load(in_ptr2 + (x0), xmask)
tmp16 = 25088.0
tmp17 = tmp15 / tmp16
tmp18 = 1e-05
tmp19 = tmp17 + tmp18
tmp20 = tl.math.rsqrt(tmp19)
tmp21 = 1.0000398612827361
tmp22 = tmp17 * tmp21
tmp23 = 0.1
tmp24 = tmp22 * tmp23
tmp26 = 0.9
tmp27 = tmp25 * tmp26
tmp28 = tmp24 + tmp27
tl.store(out_ptr2 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp20, xmask)
tl.store(out_ptr3 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp28, xmask)
def get_args():
arg_0 = rand_strided((1, 624, 1, 1), (624, 1, 624, 624), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_2 = rand_strided((624,), (1,), device='cuda:0', dtype=torch.float32)
arg_3 = rand_strided((624,), (1,), device='cuda:0', dtype=torch.float32)
arg_4 = rand_strided((624,), (1,), device='cuda:0', dtype=torch.float32)
arg_5 = rand_strided((1, 624, 1, 1), (624, 1, 624, 624), device='cuda:0', dtype=torch.float32)
arg_6 = rand_strided((624,), (1,), device='cuda:0', dtype=torch.float32)
arg_7 = rand_strided((624,), (1,), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_red_fused__native_batch_norm_legit_functional_117.run(*args, 624, 25088, grid=grid(624), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_red_fused__native_batch_norm_legit_functional_117.benchmark_all_configs(*args, 624, 25088, grid=grid(624))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/yk/cyk2njvgthyqkaax3hrogqqjnq7t4z4h2bdza6wc2nyqhlzwiqur.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.add, aten.clone, aten.fill, aten.mul, aten.sigmoid, aten.sub
# aten._native_batch_norm_legit_functional => add_126, add_129, convert_element_type_145, convert_element_type_146, mul_188, mul_194, rsqrt_24, sub_24, var_mean_24
# aten.add => add_355
# aten.clone => clone_15
# aten.fill => full_like_32
# aten.mul => mul_907, mul_908
# aten.sigmoid => sigmoid_96
# aten.sub => sub_233
# Pointwise kernel over 15654912 elements. For flat index x3 and per-row
# index x1 = (xindex // 196) % 624 it computes
#   y = (in_ptr0[x3] - in_ptr1[x1]) * rsqrt(in_ptr2[x1] / 25088.0 + 1e-05)
#       * in_ptr3[x1] + in_ptr4[x1]
# and stores y to out_ptr0 and sigmoid(y) * (y * (1 - sigmoid(y)) + 1) to
# out_ptr1. All loads and stores pass None as the mask; xmask is unused.
# NOTE(review): in_ptr1..in_ptr4 are presumably the batch mean, the summed
# squared deviations, and the affine weight/bias; the out_ptr1 expression
# matches the derivative of y*sigmoid(y) and is presumably saved for the
# backward pass -- confirm against the graph that produced this kernel.
# The get_args/call/benchmark_all_configs/__main__ tail of the string is a
# benchmark harness that times call() with do_bench and prints ms, GB, GB/s.
triton_poi_fused__native_batch_norm_legit_functional_add_clone_fill_mul_sigmoid_sub_118 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[16777216], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp16', 6: '*fp16', 7: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7), equal_to_1=())]})
@triton.jit
def triton_poi_fused__native_batch_norm_legit_functional_add_clone_fill_mul_sigmoid_sub_118(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, out_ptr0, out_ptr1, xnumel, XBLOCK : tl.constexpr):
xnumel = 15654912
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x3 = xindex
x1 = (xindex // 196) % 624
tmp0 = tl.load(in_ptr0 + (x3), None).to(tl.float32)
tmp2 = tl.load(in_ptr1 + (x1), None)
tmp4 = tl.load(in_ptr2 + (x1), None)
tmp11 = tl.load(in_ptr3 + (x1), None)
tmp13 = tl.load(in_ptr4 + (x1), None)
tmp1 = tmp0.to(tl.float32)
tmp3 = tmp1 - tmp2
tmp5 = 25088.0
tmp6 = tmp4 / tmp5
tmp7 = 1e-05
tmp8 = tmp6 + tmp7
tmp9 = tl.math.rsqrt(tmp8)
tmp10 = tmp3 * tmp9
tmp12 = tmp10 * tmp11
tmp14 = tmp12 + tmp13
tmp15 = tmp14.to(tl.float32)
tmp16 = tl.sigmoid(tmp15)
tmp17 = 1.0
tmp18 = tmp17 - tmp16
tmp19 = tmp15 * tmp18
tmp20 = tmp19 + tmp17
tmp21 = tmp16 * tmp20
tl.store(out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp15, None)
tl.store(out_ptr1 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp21, None)
def get_args():
arg_0 = rand_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((1, 624, 1, 1), (624, 1, 624, 624), device='cuda:0', dtype=torch.float32)
arg_2 = rand_strided((1, 624, 1, 1), (624, 1, 624, 624), device='cuda:0', dtype=torch.float32)
arg_3 = rand_strided((624,), (1,), device='cuda:0', dtype=torch.float32)
arg_4 = rand_strided((624,), (1,), device='cuda:0', dtype=torch.float32)
arg_5 = rand_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_6 = rand_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__native_batch_norm_legit_functional_add_clone_fill_mul_sigmoid_sub_118.run(*args, 15654912, grid=grid(15654912), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__native_batch_norm_legit_functional_add_clone_fill_mul_sigmoid_sub_118.benchmark_all_configs(*args, 15654912, grid=grid(15654912))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/h4/ch4fzflvelcyupsdemcjv4coogzqeqjolfo3rcegkby4xe2z7cgp.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_149
# Pointwise cast-copy over a flat range of 1404 elements: loads from in_ptr0
# (declared '*fp32' in the signature) and stores to out_ptr0 (declared
# '*fp16'), masking both accesses with xindex < xnumel.
# NOTE(review): tmp0.to(tl.float32) keeps the value in fp32, so the narrowing
# to fp16 presumably happens inside tl.store via the fp16 pointer -- confirm.
# The get_args/call/benchmark_all_configs/__main__ tail of the string is a
# benchmark harness: it builds random (156, 1, 3, 3) tensors, times call()
# with do_bench, and prints ms, the get_num_bytes figure in GB, and GB/s.
triton_poi_fused__to_copy_119 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[2048], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_119(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 1404
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((156, 1, 3, 3), (9, 9, 3, 1), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((156, 1, 3, 3), (9, 9, 3, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_119.run(*args, 1404, grid=grid(1404), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_119.benchmark_all_configs(*args, 1404, grid=grid(1404))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/ee/ceehri7sldlkmfyc53ikeryr64g77qkjawr2ddxdwntxstm4glbe.py
# Original ATen: aten.split_with_sizes
# aten.split_with_sizes => getitem_146
# Pointwise kernel over 3913728 elements: reads fp16 values from in_ptr0 at
# x0 + 122304*x1 (x0 = xindex % 30576, x1 = xindex // 30576), i.e. the first
# 30576 elements of every 122304-element row, computes v * sigmoid(v) in
# fp32, and writes the result contiguously to out_ptr0 at index x2.
# Loads and stores pass None as the mask; xmask is computed but not used.
# NOTE(review): per the shapes in get_args this looks like the first
# 156-channel piece of a 624-channel tensor -- confirm against the caller.
# The get_args/call/benchmark_all_configs/__main__ tail of the string is a
# benchmark harness that times call() with do_bench and prints ms, GB, GB/s.
triton_poi_fused_split_with_sizes_120 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_split_with_sizes_120(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 3913728
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex % 30576
x1 = (xindex // 30576)
x2 = xindex
tmp0 = tl.load(in_ptr0 + (x0 + (122304*x1)), None).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
tmp2 = tl.sigmoid(tmp1)
tmp3 = tmp1 * tmp2
tmp4 = tmp3.to(tl.float32)
tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp4, None)
def get_args():
arg_0 = rand_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 156, 14, 14), (30576, 196, 14, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_split_with_sizes_120.run(*args, 3913728, grid=grid(3913728), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_split_with_sizes_120.benchmark_all_configs(*args, 3913728, grid=grid(3913728))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/3q/c3qy34jaqk4nnzsuou7s5dwcbuoi5kctwalhcspkfthqj2nvhjzq.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_150
# Pointwise cast-copy over a flat range of 3900 elements: loads from in_ptr0
# (declared '*fp32' in the signature) and stores to out_ptr0 (declared
# '*fp16'), masking both accesses with xindex < xnumel.
# NOTE(review): tmp0.to(tl.float32) keeps the value in fp32, so the narrowing
# to fp16 presumably happens inside tl.store via the fp16 pointer -- confirm.
# The get_args/call/benchmark_all_configs/__main__ tail of the string is a
# benchmark harness: it builds random (156, 1, 5, 5) tensors, times call()
# with do_bench, and prints ms, the get_num_bytes figure in GB, and GB/s.
triton_poi_fused__to_copy_121 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4096], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_121(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 3900
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((156, 1, 5, 5), (25, 25, 5, 1), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((156, 1, 5, 5), (25, 25, 5, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_121.run(*args, 3900, grid=grid(3900), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_121.benchmark_all_configs(*args, 3900, grid=grid(3900))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/d2/cd2noiyayhlfce4vrflvnr5n745imwlcczcywnnle3qtso27avlv.py
# Original ATen: aten.split_with_sizes
# aten.split_with_sizes => getitem_151
# Pointwise kernel over 3913728 elements: reads fp16 values from in_ptr0 at
# 30576 + x0 + 122304*x1 (x0 = xindex % 30576, x1 = xindex // 30576), i.e.
# the 30576-element span starting at offset 30576 of every 122304-element
# row, computes v * sigmoid(v) in fp32, and writes the result contiguously
# to out_ptr0 at index x2.
# Loads and stores pass None as the mask; xmask is computed but not used.
# NOTE(review): per the shapes in get_args this looks like the second
# 156-channel piece of a 624-channel tensor -- confirm against the caller.
# The get_args/call/benchmark_all_configs/__main__ tail of the string is a
# benchmark harness that times call() with do_bench and prints ms, GB, GB/s.
triton_poi_fused_split_with_sizes_122 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_split_with_sizes_122(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 3913728
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex % 30576
x1 = (xindex // 30576)
x2 = xindex
tmp0 = tl.load(in_ptr0 + (30576 + x0 + (122304*x1)), None).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
tmp2 = tl.sigmoid(tmp1)
tmp3 = tmp1 * tmp2
tmp4 = tmp3.to(tl.float32)
tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp4, None)
def get_args():
arg_0 = rand_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 156, 14, 14), (30576, 196, 14, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_split_with_sizes_122.run(*args, 3913728, grid=grid(3913728), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_split_with_sizes_122.benchmark_all_configs(*args, 3913728, grid=grid(3913728))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/on/conrh3mxhtsmg5nvtksxdme55y6bhfvwrrnpp6mhmwmulosn5uyv.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_151
# Pointwise cast-copy over a flat range of 7644 elements: loads from in_ptr0
# (declared '*fp32' in the signature) and stores to out_ptr0 (declared
# '*fp16'), masking both accesses with xindex < xnumel.
# NOTE(review): tmp0.to(tl.float32) keeps the value in fp32, so the narrowing
# to fp16 presumably happens inside tl.store via the fp16 pointer -- confirm.
# The get_args/call/benchmark_all_configs/__main__ tail of the string is a
# benchmark harness: it builds random (156, 1, 7, 7) tensors, times call()
# with do_bench, and prints ms, the get_num_bytes figure in GB, and GB/s.
triton_poi_fused__to_copy_123 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[8192], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_123(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 7644
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((156, 1, 7, 7), (49, 49, 7, 1), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((156, 1, 7, 7), (49, 49, 7, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_123.run(*args, 7644, grid=grid(7644), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_123.benchmark_all_configs(*args, 7644, grid=grid(7644))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/nc/cncwy3tqtyzgd3muyfflcfni7qls4ttqyb4sac6cmg6znll2bcqu.py
# Original ATen: aten.split_with_sizes
# aten.split_with_sizes => getitem_156
# Pointwise kernel over 3913728 elements: reads fp16 values from in_ptr0 at
# 61152 + x0 + 122304*x1 (x0 = xindex % 30576, x1 = xindex // 30576), i.e.
# the 30576-element span starting at offset 61152 of every 122304-element
# row, computes v * sigmoid(v) in fp32, and writes the result contiguously
# to out_ptr0 at index x2.
# Loads and stores pass None as the mask; xmask is computed but not used.
# NOTE(review): per the shapes in get_args this looks like the third
# 156-channel piece of a 624-channel tensor -- confirm against the caller.
# The get_args/call/benchmark_all_configs/__main__ tail of the string is a
# benchmark harness that times call() with do_bench and prints ms, GB, GB/s.
triton_poi_fused_split_with_sizes_124 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_split_with_sizes_124(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 3913728
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex % 30576
x1 = (xindex // 30576)
x2 = xindex
tmp0 = tl.load(in_ptr0 + (61152 + x0 + (122304*x1)), None).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
tmp2 = tl.sigmoid(tmp1)
tmp3 = tmp1 * tmp2
tmp4 = tmp3.to(tl.float32)
tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp4, None)
def get_args():
arg_0 = rand_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 156, 14, 14), (30576, 196, 14, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_split_with_sizes_124.run(*args, 3913728, grid=grid(3913728), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_split_with_sizes_124.benchmark_all_configs(*args, 3913728, grid=grid(3913728))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/yy/cyyxm3mybqiv5z5gofz4uhjttc6in5hwbrcbzt4xlp4z5lqyiqvr.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_152
# Pointwise cast-copy over a flat range of 12636 elements: loads from in_ptr0
# (declared '*fp32' in the signature) and stores to out_ptr0 (declared
# '*fp16'), masking both accesses with xindex < xnumel.
# NOTE(review): tmp0.to(tl.float32) keeps the value in fp32, so the narrowing
# to fp16 presumably happens inside tl.store via the fp16 pointer -- confirm.
# The get_args/call/benchmark_all_configs/__main__ tail of the string is a
# benchmark harness: it builds random (156, 1, 9, 9) tensors, times call()
# with do_bench, and prints ms, the get_num_bytes figure in GB, and GB/s.
triton_poi_fused__to_copy_125 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[16384], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_125(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 12636
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((156, 1, 9, 9), (81, 81, 9, 1), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((156, 1, 9, 9), (81, 81, 9, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_125.run(*args, 12636, grid=grid(12636), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_125.benchmark_all_configs(*args, 12636, grid=grid(12636))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/ta/ctar2nhvp2hnx6l3xel6gxwicmmw5nlhbt33l63xfadahaihh7qo.py
# Original ATen: aten.split_with_sizes
# aten.split_with_sizes => getitem_161
# Pointwise kernel over 3913728 elements: reads fp16 values from in_ptr0 at
# 91728 + x0 + 122304*x1 (x0 = xindex % 30576, x1 = xindex // 30576), i.e.
# the 30576-element span starting at offset 91728 of every 122304-element
# row, computes v * sigmoid(v) in fp32, and writes the result contiguously
# to out_ptr0 at index x2.
# Loads and stores pass None as the mask; xmask is computed but not used.
# NOTE(review): per the shapes in get_args this looks like the fourth
# 156-channel piece of a 624-channel tensor -- confirm against the caller.
# The get_args/call/benchmark_all_configs/__main__ tail of the string is a
# benchmark harness that times call() with do_bench and prints ms, GB, GB/s.
triton_poi_fused_split_with_sizes_126 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_split_with_sizes_126(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 3913728
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex % 30576
x1 = (xindex // 30576)
x2 = xindex
tmp0 = tl.load(in_ptr0 + (91728 + x0 + (122304*x1)), None).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
tmp2 = tl.sigmoid(tmp1)
tmp3 = tmp1 * tmp2
tmp4 = tmp3.to(tl.float32)
tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp4, None)
def get_args():
arg_0 = rand_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 156, 14, 14), (30576, 196, 14, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_split_with_sizes_126.run(*args, 3913728, grid=grid(3913728), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_split_with_sizes_126.benchmark_all_configs(*args, 3913728, grid=grid(3913728))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/aa/caaej5ipewhg7u3hng5mw3omnmn72k6eam7aoq4mbk5yqdeq7wwb.py
# Original ATen: aten.cat
# aten.cat => cat_17
# Pointwise strided copy of 3913728 fp16 elements: reads in_ptr0 contiguously
# (index x2) and writes out_ptr0 at x0 + 122304*x1, where x0 = xindex % 30576
# and x1 = xindex // 30576, i.e. each 30576-element input row lands at the
# start of a 122304-element output row.
# Loads and stores pass None as the mask; xmask is computed but not used.
# NOTE(review): the output is presumably a view into a wider concatenation
# buffer (get_args gives it batch stride 122304 for a 156-channel shape) --
# confirm against the caller.
# The get_args/call/benchmark_all_configs/__main__ tail of the string is a
# benchmark harness that times call() with do_bench and prints ms, GB, GB/s.
triton_poi_fused_cat_127 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_cat_127(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 3913728
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x2 = xindex
x0 = xindex % 30576
x1 = (xindex // 30576)
tmp0 = tl.load(in_ptr0 + (x2), None).to(tl.float32)
tl.store(out_ptr0 + (x0 + (122304*x1) + tl.zeros([XBLOCK], tl.int32)), tmp0, None)
def get_args():
arg_0 = rand_strided((128, 156, 14, 14), (30576, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 156, 14, 14), (122304, 196, 14, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_cat_127.run(*args, 3913728, grid=grid(3913728), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_cat_127.benchmark_all_configs(*args, 3913728, grid=grid(3913728))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/xm/cxmjpqggq2cokpfycochmecokolhxsamegk6vtmem3x47oz5ss3r.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.mean, aten.silu
# aten._native_batch_norm_legit_functional => add_131, add_134, convert_element_type_153, convert_element_type_154, mul_196, mul_202, rsqrt_25, sub_25, var_mean_25
# aten.mean => mean_5
# aten.silu => convert_element_type_155, convert_element_type_156, mul_203, sigmoid_21
# Persistent-reduction kernel: per-channel affine normalisation, then the
# mean of SiLU over each row.
#
# The string literal is Python source (imports, a @triton.jit kernel and a
# __main__ benchmark harness) passed to `async_compile.triton`, so the
# indentation inside it is syntactically significant.
#
# Kernel: the problem is 79872 rows (x) by 196 reduction elements (r); RBLOCK
# is fixed at 256 and `rmask` disables the lanes with r >= 196.
# With x0 = x % 624 selecting the per-channel parameters, each element is
#     y = (in_ptr0[r + 196*x] - in_ptr1[x0])
#         * rsqrt(in_ptr2[x0] / 25088.0 + 1e-05) * in_ptr3[x0] + in_ptr4[x0]
# and y is stored to out_ptr0[r + 196*x] (masked by rmask).
# It then computes y * sigmoid(y), zeroes the masked lanes, sums along r,
# divides by 196.0 and stores that row value to out_ptr2[x].
# Judging by the ATen names above, in_ptr1/in_ptr2 look like the batch-norm
# mean and un-normalised variance sum, and in_ptr3/in_ptr4 like weight and
# bias -- TODO confirm against the caller.
# The repeated `.to(tl.float32)` casts are no-ops on values that are already
# fp32; they are kept exactly as generated.
#
# Harness: get_args() builds random tensors, call() launches the kernel on
# device 0, benchmark_all_configs() forwards to the kernel object's method of
# the same name, and the __main__ section prints time and bandwidth.
triton_per_fused__native_batch_norm_legit_functional_mean_silu_128 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import persistent_reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@persistent_reduction(
    size_hints=[131072, 256],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*fp16', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp16', 6: '*fp16', 7: 'i32', 8: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7), equal_to_1=())]}
)
@triton.jit
def triton_per_fused__native_batch_norm_legit_functional_mean_silu_128(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, out_ptr0, out_ptr2, xnumel, rnumel, XBLOCK : tl.constexpr):
    xnumel = 79872
    rnumel = 196
    RBLOCK: tl.constexpr = 256
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rindex = tl.arange(0, RBLOCK)[None, :]
    rmask = rindex < rnumel
    r2 = rindex
    x3 = xindex
    x0 = xindex % 624
    tmp0 = tl.load(in_ptr0 + (r2 + (196*x3)), rmask, other=0).to(tl.float32)
    tmp2 = tl.load(in_ptr1 + (x0), None)
    tmp4 = tl.load(in_ptr2 + (x0), None)
    tmp11 = tl.load(in_ptr3 + (x0), None)
    tmp13 = tl.load(in_ptr4 + (x0), None)
    tmp1 = tmp0.to(tl.float32)
    tmp3 = tmp1 - tmp2
    tmp5 = 25088.0
    tmp6 = tmp4 / tmp5
    tmp7 = 1e-05
    tmp8 = tmp6 + tmp7
    tmp9 = tl.math.rsqrt(tmp8)
    tmp10 = tmp3 * tmp9
    tmp12 = tmp10 * tmp11
    tmp14 = tmp12 + tmp13
    tmp15 = tmp14.to(tl.float32)
    tmp16 = tmp15.to(tl.float32)
    tmp17 = tl.sigmoid(tmp16)
    tmp18 = tmp16 * tmp17
    tmp19 = tmp18.to(tl.float32)
    tmp20 = tmp19.to(tl.float32)
    tmp22 = tl.where(rmask, tmp20, 0)
    tmp23 = tl.sum(tmp22, 1)[:, None]
    tmp24 = 196.0
    tmp25 = tmp23 / tmp24
    tmp26 = tmp25.to(tl.float32)
    tl.store(out_ptr0 + (r2 + (196*x3) + tl.zeros([XBLOCK, RBLOCK], tl.int32)), tmp15, rmask)
    tl.store(out_ptr2 + (x3 + tl.zeros([XBLOCK, 1], tl.int32)), tmp26, None)


def get_args():
    arg_0 = rand_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((1, 624, 1, 1), (624, 1, 624, 624), device='cuda:0', dtype=torch.float32)
    arg_2 = rand_strided((1, 624, 1, 1), (624, 1, 624, 624), device='cuda:0', dtype=torch.float32)
    arg_3 = rand_strided((624,), (1,), device='cuda:0', dtype=torch.float32)
    arg_4 = rand_strided((624,), (1,), device='cuda:0', dtype=torch.float32)
    arg_5 = rand_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_6 = rand_strided((128, 624, 1, 1), (624, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_per_fused__native_batch_norm_legit_functional_mean_silu_128.run(*args, 79872, 196, grid=grid(79872), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_per_fused__native_batch_norm_legit_functional_mean_silu_128.benchmark_all_configs(*args, 79872, 196, grid=grid(79872))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/5n/c5nuz3wivd2kux327s5ffdifgf3i65ukistqypmzli4ipcs5fhup.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_158
# Element-wise copy kernel from an fp32 buffer to an fp16 buffer (dtypes per
# the signature in `meta`).
#
# The string literal is Python source (imports, a @triton.jit kernel and a
# __main__ benchmark harness) passed to `async_compile.triton`, so the
# indentation inside it is syntactically significant.
#
# Kernel: for every flat index x0 in [0, 16224) it loads in_ptr0[x0] and
# stores it to out_ptr0[x0]; both accesses are guarded by
# xmask = xindex < xnumel.  The explicit cast in the body is to tl.float32, so
# the narrowing to fp16 is presumably done by tl.store against the fp16
# pointer -- TODO confirm.
# get_args() exercises it with (26, 624, 1, 1) contiguous tensors.
#
# Harness: get_args() builds random tensors, call() launches the kernel on
# device 0, benchmark_all_configs() forwards to the kernel object's method of
# the same name, and the __main__ section prints time and bandwidth.
triton_poi_fused__to_copy_129 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[16384], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_129(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 16224
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)


def get_args():
    arg_0 = rand_strided((26, 624, 1, 1), (624, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((26, 624, 1, 1), (624, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_129.run(*args, 16224, grid=grid(16224), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_129.benchmark_all_configs(*args, 16224, grid=grid(16224))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/7u/c7uwxkw3ipfjfdehvzykp4imfsjz53vfoikfvt4ys7uwsknqbbb2.py
# Original ATen: aten._to_copy, aten.convolution
# aten._to_copy => convert_element_type_157
# aten.convolution => convolution_60
# Element-wise copy kernel from an fp32 buffer to an fp16 buffer (dtypes per
# the signature in `meta`), 26 elements.
#
# The string literal is Python source (imports, a @triton.jit kernel and a
# __main__ benchmark harness) passed to `async_compile.triton`, so the
# indentation inside it is syntactically significant.
#
# Kernel: for every flat index x0 in [0, 26) it loads in_ptr0[x0] and stores
# it to out_ptr0[x0]; both accesses are guarded by xmask = xindex < xnumel.
# The explicit cast in the body is to tl.float32, so the narrowing to fp16 is
# presumably done by tl.store against the fp16 pointer -- TODO confirm.
# Unlike the larger copy kernels, `divisible_by_16` lists only the two
# pointer arguments (0, 1) and not the element count.
# get_args() exercises it with 1-D tensors of length 26.
#
# Harness: get_args() builds random tensors, call() launches the kernel on
# device 0, benchmark_all_configs() forwards to the kernel object's method of
# the same name, and the __main__ section prints time and bandwidth.
triton_poi_fused__to_copy_convolution_130 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[32], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_convolution_130(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 26
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)


def get_args():
    arg_0 = rand_strided((26,), (1,), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((26,), (1,), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_convolution_130.run(*args, 26, grid=grid(26), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_convolution_130.benchmark_all_configs(*args, 26, grid=grid(26))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/qj/cqjs4oas7ss654pge5hzaweuxapq6ge5y5fvo2ygxf7r6qylld6m.py
# Original ATen: aten._to_copy, aten.convolution, aten.silu
# aten._to_copy => convert_element_type_157
# aten.convolution => convolution_60
# aten.silu => convert_element_type_159, convert_element_type_160, mul_204, sigmoid_22
# In-place broadcast add followed by SiLU written to a second buffer.
#
# The string literal is Python source (imports, a @triton.jit kernel and a
# __main__ benchmark harness) passed to `async_compile.triton`, so the
# indentation inside it is syntactically significant.
#
# Kernel: for every flat index x2 in [0, 3328), with x0 = x2 % 26:
#     s = in_out_ptr0[x2] + in_ptr0[x0]
#     in_out_ptr0[x2] = s                  (the buffer is updated in place;
#                                           it is listed in mutated_arg_names)
#     out_ptr0[x2]    = s * sigmoid(s)
# All loads and stores are guarded by xmask = xindex < xnumel.
# Given the ATen names above, in_ptr0 is presumably the convolution bias
# (length 26 in get_args()) -- TODO confirm against the caller.
#
# Harness: get_args() builds random tensors, call() launches the kernel on
# device 0, benchmark_all_configs() forwards to the kernel object's method of
# the same name, and the __main__ section prints time and bandwidth
# (get_num_bytes is told that one argument is in/out).
triton_poi_fused__to_copy_convolution_silu_131 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[4096], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_convolution_silu_131(in_out_ptr0, in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 3328
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x2 = xindex
    x0 = xindex % 26
    tmp0 = tl.load(in_out_ptr0 + (x2), xmask).to(tl.float32)
    tmp1 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
    tmp2 = tmp0 + tmp1
    tmp3 = tmp2.to(tl.float32)
    tmp4 = tl.sigmoid(tmp3)
    tmp5 = tmp3 * tmp4
    tmp6 = tmp5.to(tl.float32)
    tl.store(in_out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp2, xmask)
    tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp6, xmask)


def get_args():
    arg_0 = rand_strided((128, 26, 1, 1), (26, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((26,), (1,), device='cuda:0', dtype=torch.float16)
    arg_2 = rand_strided((128, 26, 1, 1), (26, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1, arg_2,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_convolution_silu_131.run(*args, 3328, grid=grid(3328), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_convolution_silu_131.benchmark_all_configs(*args, 3328, grid=grid(3328))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/af/cafnwdqexuqbr3u7uonjmdp2h6gms65ppr5ersr7levxxzbxsags.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_162
# Element-wise copy kernel from an fp32 buffer to an fp16 buffer (dtypes per
# the signature in `meta`).
#
# The string literal is Python source (imports, a @triton.jit kernel and a
# __main__ benchmark harness) passed to `async_compile.triton`, so the
# indentation inside it is syntactically significant.
#
# Kernel: for every flat index x0 in [0, 16224) it loads in_ptr0[x0] and
# stores it to out_ptr0[x0]; both accesses are guarded by
# xmask = xindex < xnumel.  The explicit cast in the body is to tl.float32, so
# the narrowing to fp16 is presumably done by tl.store against the fp16
# pointer -- TODO confirm.
# get_args() exercises it with (624, 26, 1, 1) contiguous tensors.
#
# Harness: get_args() builds random tensors, call() launches the kernel on
# device 0, benchmark_all_configs() forwards to the kernel object's method of
# the same name, and the __main__ section prints time and bandwidth.
triton_poi_fused__to_copy_132 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[16384], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_132(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 16224
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)


def get_args():
    arg_0 = rand_strided((624, 26, 1, 1), (26, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((624, 26, 1, 1), (26, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_132.run(*args, 16224, grid=grid(16224), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_132.benchmark_all_configs(*args, 16224, grid=grid(16224))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/qt/cqtzzxfgsd6elacosxssusc2qroqxqbum3szv62gh4gr6s5vzwmp.py
# Original ATen: aten._to_copy, aten.convolution
# aten._to_copy => convert_element_type_161
# aten.convolution => convolution_61
# Element-wise copy kernel from an fp32 buffer to an fp16 buffer (dtypes per
# the signature in `meta`), 624 elements.
#
# The string literal is Python source (imports, a @triton.jit kernel and a
# __main__ benchmark harness) passed to `async_compile.triton`, so the
# indentation inside it is syntactically significant.
#
# Kernel: for every flat index x0 in [0, 624) it loads in_ptr0[x0] and stores
# it to out_ptr0[x0]; both accesses are guarded by xmask = xindex < xnumel.
# The explicit cast in the body is to tl.float32, so the narrowing to fp16 is
# presumably done by tl.store against the fp16 pointer -- TODO confirm.
# get_args() exercises it with 1-D tensors of length 624.
#
# Harness: get_args() builds random tensors, call() launches the kernel on
# device 0, benchmark_all_configs() forwards to the kernel object's method of
# the same name, and the __main__ section prints time and bandwidth.
triton_poi_fused__to_copy_convolution_133 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[1024], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_convolution_133(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 624
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)


def get_args():
    arg_0 = rand_strided((624,), (1,), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((624,), (1,), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_convolution_133.run(*args, 624, grid=grid(624), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_convolution_133.benchmark_all_configs(*args, 624, grid=grid(624))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/hd/chd7xmae5qozjrly7porkx7hggrm6fh6klnsko3rzc52ohjqiyou.py
# Original ATen: aten._to_copy, aten.convolution
# aten._to_copy => convert_element_type_161
# aten.convolution => convolution_61
# In-place broadcast add kernel.
#
# The string literal is Python source (imports, a @triton.jit kernel and a
# __main__ benchmark harness) passed to `async_compile.triton`, so the
# indentation inside it is syntactically significant.
#
# Kernel: for every flat index x2 in [0, 79872), with x0 = x2 % 624:
#     in_out_ptr0[x2] = in_out_ptr0[x2] + in_ptr0[x0]
# The buffer is updated in place (it is listed in mutated_arg_names).
# Loads/stores pass mask=None and `xmask` is computed but unused
# (presumably xnumel is a multiple of every candidate XBLOCK -- TODO confirm).
# Given the ATen names above, in_ptr0 is presumably the convolution bias
# (length 624 in get_args()) -- TODO confirm against the caller.
#
# Harness: get_args() builds random tensors, call() launches the kernel on
# device 0, benchmark_all_configs() forwards to the kernel object's method of
# the same name, and the __main__ section prints time and bandwidth
# (get_num_bytes is told that one argument is in/out).
triton_poi_fused__to_copy_convolution_134 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[131072], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_convolution_134(in_out_ptr0, in_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 79872
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x2 = xindex
    x0 = xindex % 624
    tmp0 = tl.load(in_out_ptr0 + (x2), None).to(tl.float32)
    tmp1 = tl.load(in_ptr0 + (x0), None).to(tl.float32)
    tmp2 = tmp0 + tmp1
    tl.store(in_out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp2, None)


def get_args():
    arg_0 = rand_strided((128, 624, 1, 1), (624, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((624,), (1,), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_convolution_134.run(*args, 79872, grid=grid(79872), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_convolution_134.benchmark_all_configs(*args, 79872, grid=grid(79872))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/mn/cmnvsfbxocmtd6hcxhi5n75ltfyzsnegxzpgv7o57n6ksfhmrkay.py
# Original ATen: aten.mul, aten.sigmoid, aten.silu
# aten.mul => mul_205
# aten.sigmoid => sigmoid_23
# aten.silu => convert_element_type_155, convert_element_type_156, mul_203, sigmoid_21
# Element-wise kernel: SiLU of one input multiplied by a sigmoid gate that is
# shared across each run of 196 consecutive elements.
#
# The string literal is Python source (imports, a @triton.jit kernel and a
# __main__ benchmark harness) passed to `async_compile.triton`, so the
# indentation inside it is syntactically significant.
#
# Kernel: for every flat index x2 in [0, 15654912), with x1 = x2 // 196:
#     a = in_ptr0[x2]
#     out_ptr0[x2] = (a * sigmoid(a)) * sigmoid(in_ptr1[x1])
# Loads/stores pass mask=None and `xmask` is computed but unused
# (presumably xnumel is a multiple of every candidate XBLOCK -- TODO confirm).
# With the tensors built by get_args(), in_ptr0/out_ptr0 are
# (128, 624, 14, 14) and in_ptr1 is (128, 624, 1, 1), so x1 indexes one gate
# value per (batch, channel) pair.
#
# Harness: get_args() builds random tensors, call() launches the kernel on
# device 0, benchmark_all_configs() forwards to the kernel object's method of
# the same name, and the __main__ section prints time and bandwidth.
triton_poi_fused_mul_sigmoid_silu_135 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[16777216], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3), equal_to_1=())]})
@triton.jit
def triton_poi_fused_mul_sigmoid_silu_135(in_ptr0, in_ptr1, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 15654912
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x2 = xindex
    x1 = (xindex // 196)
    tmp0 = tl.load(in_ptr0 + (x2), None).to(tl.float32)
    tmp5 = tl.load(in_ptr1 + (x1), None).to(tl.float32)
    tmp1 = tmp0.to(tl.float32)
    tmp2 = tl.sigmoid(tmp1)
    tmp3 = tmp1 * tmp2
    tmp4 = tmp3.to(tl.float32)
    tmp6 = tl.sigmoid(tmp5)
    tmp7 = tmp4 * tmp6
    tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp7, None)


def get_args():
    arg_0 = rand_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 624, 1, 1), (624, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    arg_2 = rand_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1, arg_2,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused_mul_sigmoid_silu_135.run(*args, 15654912, grid=grid(15654912), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused_mul_sigmoid_silu_135.benchmark_all_configs(*args, 15654912, grid=grid(15654912))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/x6/cx65e4iuztiipxmj3kt6pdmtbqk6eud6jbchpllpc6tinjqjiko7.py
# Original ATen: aten.split_with_sizes
# aten.split_with_sizes => getitem_164
# Strided gather kernel that packs the leading part of each batch row into a
# dense buffer (fp16 -> fp16 per the signature in `meta`).
#
# The string literal is Python source (imports, a @triton.jit kernel and a
# __main__ benchmark harness) passed to `async_compile.triton`, so the
# indentation inside it is syntactically significant.
#
# Kernel: for every flat index x2 in [0, 7827456), with x0 = x2 % 61152 and
# x1 = x2 // 61152:
#     out_ptr0[x2] = in_ptr0[x0 + 122304*x1]
# i.e. the first 61152 elements of every 122304-element source row.  With the
# tensors built by get_args() that is channels [0, 312) of a
# (128, 624, 14, 14) input copied into a contiguous (128, 312, 14, 14)
# output.
# Loads/stores pass mask=None and `xmask` is computed but unused
# (presumably xnumel is a multiple of every candidate XBLOCK -- TODO confirm).
#
# Harness: get_args() builds random tensors, call() launches the kernel on
# device 0, benchmark_all_configs() forwards to the kernel object's method of
# the same name, and the __main__ section prints time and bandwidth.
triton_poi_fused_split_with_sizes_136 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[8388608], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_split_with_sizes_136(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 7827456
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex % 61152
    x1 = (xindex // 61152)
    x2 = xindex
    tmp0 = tl.load(in_ptr0 + (x0 + (122304*x1)), None).to(tl.float32)
    tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp0, None)


def get_args():
    arg_0 = rand_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 312, 14, 14), (61152, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused_split_with_sizes_136.run(*args, 7827456, grid=grid(7827456), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused_split_with_sizes_136.benchmark_all_configs(*args, 7827456, grid=grid(7827456))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/la/cla6pjsd7zhhqofhjak5ak67vzu4puphrosszybe7he2qayusdf7.py
# Original ATen: aten.split_with_sizes
# aten.split_with_sizes => getitem_165
# Strided gather kernel that packs the trailing part of each batch row into a
# dense buffer (fp16 -> fp16 per the signature in `meta`).
#
# The string literal is Python source (imports, a @triton.jit kernel and a
# __main__ benchmark harness) passed to `async_compile.triton`, so the
# indentation inside it is syntactically significant.
#
# Kernel: for every flat index x2 in [0, 7827456), with x0 = x2 % 61152 and
# x1 = x2 // 61152:
#     out_ptr0[x2] = in_ptr0[61152 + x0 + 122304*x1]
# i.e. elements [61152, 122304) of every 122304-element source row.  With the
# tensors built by get_args() that is channels [312, 624) of a
# (128, 624, 14, 14) input copied into a contiguous (128, 312, 14, 14)
# output.
# Loads/stores pass mask=None and `xmask` is computed but unused
# (presumably xnumel is a multiple of every candidate XBLOCK -- TODO confirm).
#
# Harness: get_args() builds random tensors, call() launches the kernel on
# device 0, benchmark_all_configs() forwards to the kernel object's method of
# the same name, and the __main__ section prints time and bandwidth.
triton_poi_fused_split_with_sizes_137 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[8388608], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_split_with_sizes_137(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 7827456
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex % 61152
    x1 = (xindex // 61152)
    x2 = xindex
    tmp0 = tl.load(in_ptr0 + (61152 + x0 + (122304*x1)), None).to(tl.float32)
    tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp0, None)


def get_args():
    arg_0 = rand_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 312, 14, 14), (61152, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused_split_with_sizes_137.run(*args, 7827456, grid=grid(7827456), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused_split_with_sizes_137.benchmark_all_configs(*args, 7827456, grid=grid(7827456))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/hz/chz3fw3o5fqe2cnzuiuggnyrnh3xzhhrw266gsgpwo3xgo4buev6.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_163
# Element-wise copy kernel from an fp32 buffer to an fp16 buffer (dtypes per
# the signature in `meta`).
#
# The string literal is Python source (imports, a @triton.jit kernel and a
# __main__ benchmark harness) passed to `async_compile.triton`, so the
# indentation inside it is syntactically significant.
#
# Kernel: for every flat index x0 in [0, 16224) it loads in_ptr0[x0] and
# stores it to out_ptr0[x0]; both accesses are guarded by
# xmask = xindex < xnumel.  The explicit cast in the body is to tl.float32, so
# the narrowing to fp16 is presumably done by tl.store against the fp16
# pointer -- TODO confirm.
# get_args() exercises it with (52, 312, 1, 1) contiguous tensors.
#
# Harness: get_args() builds random tensors, call() launches the kernel on
# device 0, benchmark_all_configs() forwards to the kernel object's method of
# the same name, and the __main__ section prints time and bandwidth.
triton_poi_fused__to_copy_138 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[16384], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_138(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 16224
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)


def get_args():
    arg_0 = rand_strided((52, 312, 1, 1), (312, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((52, 312, 1, 1), (312, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_138.run(*args, 16224, grid=grid(16224), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_138.benchmark_all_configs(*args, 16224, grid=grid(16224))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/vk/cvkwpn3gas36vom5boko6m7yjnsu4h3jn3s2dj74g4puyptio4dc.py
# Original ATen: aten.cat
# aten.cat => cat_18
# Strided copy kernel (fp16 -> fp16 per the signature in `meta`).
#
# The string literal is Python source (imports, a @triton.jit kernel and a
# __main__ benchmark harness) passed to `async_compile.triton`, so the
# indentation inside it is syntactically significant.
#
# Kernel: for every flat index x2 in [0, 1304576) it loads in_ptr0[x2] and
# stores it at out_ptr0[x0 + 20384*x1], where x0 = x2 % 10192 and
# x1 = x2 // 10192.  Consecutive runs of 10192 source elements are therefore
# written 20384 elements apart.  With the tensors built by get_args() this is
# a contiguous (128, 52, 14, 14) input written into a (128, 52, 14, 14) view
# whose batch stride is 20384.
# Loads/stores pass mask=None and `xmask` is computed but unused
# (presumably xnumel is a multiple of every candidate XBLOCK -- TODO confirm).
#
# Harness: get_args() builds random tensors, call() launches the kernel on
# device 0, benchmark_all_configs() forwards to the kernel object's method of
# the same name, and the __main__ section prints time and bandwidth.
triton_poi_fused_cat_139 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[2097152], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_cat_139(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 1304576
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x2 = xindex
    x0 = xindex % 10192
    x1 = (xindex // 10192)
    tmp0 = tl.load(in_ptr0 + (x2), None).to(tl.float32)
    tl.store(out_ptr0 + (x0 + (20384*x1) + tl.zeros([XBLOCK], tl.int32)), tmp0, None)


def get_args():
    arg_0 = rand_strided((128, 52, 14, 14), (10192, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 52, 14, 14), (20384, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused_cat_139.run(*args, 1304576, grid=grid(1304576), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused_cat_139.benchmark_all_configs(*args, 1304576, grid=grid(1304576))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/qp/cqptss63kr7w22hqbaitci52teddi4haqxu6zws6bzlk7k3r4p5w.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.add
# aten._native_batch_norm_legit_functional => add_136, add_139, convert_element_type_165, convert_element_type_166, mul_206, mul_212, rsqrt_26, sub_26, var_mean_26
# aten.add => add_140
# Pointwise kernel over a (128, 104, 14, 14) tensor: normalizes each element
# with per-channel statistics, applies a per-channel scale and shift, then adds
# a second full-size tensor.  The string handed to async_compile.triton() is a
# complete Python module (kernel + benchmark harness), so it must be validly
# indented Python.
triton_poi_fused__native_batch_norm_legit_functional_add_140 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor

from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp16', 6: '*fp16', 7: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7), equal_to_1=())]})
@triton.jit
def triton_poi_fused__native_batch_norm_legit_functional_add_140(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    # Per element (c = channel):
    #   out = (in0 - in1[c]) * rsqrt(in2[c] / 25088 + 1e-05) * in3[c] + in4[c] + in5
    # in1/in2 look like the per-channel mean and sum of squared deviations
    # (25088 == 128 * 14 * 14) -- confirm against the producing kernels.
    xnumel = 2609152
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    # xmask is computed but unused: every load/store below passes mask=None.
    xmask = xindex < xnumel
    x3 = xindex
    # Channel index for the contiguous (N, 104, 14, 14) layout (196 == 14 * 14).
    x1 = (xindex // 196) % 104
    # fp16 inputs are widened to fp32 right after the load.
    tmp0 = tl.load(in_ptr0 + (x3), None).to(tl.float32)
    tmp2 = tl.load(in_ptr1 + (x1), None)
    tmp4 = tl.load(in_ptr2 + (x1), None)
    tmp11 = tl.load(in_ptr3 + (x1), None)
    tmp13 = tl.load(in_ptr4 + (x1), None)
    tmp16 = tl.load(in_ptr5 + (x3), None).to(tl.float32)
    tmp1 = tmp0.to(tl.float32)
    tmp3 = tmp1 - tmp2
    tmp5 = 25088.0
    tmp6 = tmp4 / tmp5
    tmp7 = 1e-05
    tmp8 = tmp6 + tmp7
    tmp9 = tl.math.rsqrt(tmp8)
    tmp10 = tmp3 * tmp9
    tmp12 = tmp10 * tmp11
    tmp14 = tmp12 + tmp13
    tmp15 = tmp14.to(tl.float32)
    tmp17 = tmp15 + tmp16
    # The added zero block does not change the offsets.
    tl.store(out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp17, None)


def get_args():
    # Random benchmark inputs, one per pointer argument of the kernel.
    arg_0 = rand_strided((128, 104, 14, 14), (20384, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((1, 104, 1, 1), (104, 1, 104, 104), device='cuda:0', dtype=torch.float32)
    arg_2 = rand_strided((1, 104, 1, 1), (104, 1, 104, 104), device='cuda:0', dtype=torch.float32)
    arg_3 = rand_strided((104,), (1,), device='cuda:0', dtype=torch.float32)
    arg_4 = rand_strided((104,), (1,), device='cuda:0', dtype=torch.float32)
    arg_5 = rand_strided((128, 104, 14, 14), (20384, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_6 = rand_strided((128, 104, 14, 14), (20384, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6,


def call(args):
    # Launch the kernel once on device 0's current raw CUDA stream.
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__native_batch_norm_legit_functional_add_140.run(*args, 2609152, grid=grid(2609152), stream=stream0)


def benchmark_all_configs(args):
    # Delegate to the compiled kernel's benchmark_all_configs() on device 0.
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__native_batch_norm_legit_functional_add_140.benchmark_all_configs(*args, 2609152, grid=grid(2609152))


if __name__ == '__main__':
    # Standalone run: time call() and print milliseconds, GB moved and GB/s.
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/zf/czfuu6dquzuudpwuu3bwtsd76hxju2ps43qrqh5npmmaayzjihi2.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_215
# Pointwise copy of 64896 elements from an fp32 buffer to an fp16 buffer
# (dtypes per the kernel signature).  The string is a complete Python module
# (kernel + benchmark harness), so it must be validly indented Python.
triton_poi_fused__to_copy_141 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor

from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[65536], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_141(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 64896
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    # Guards the tail of the last block (xindex >= xnumel).
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask)
    # The value stays fp32 here; the narrowing to fp16 presumably happens on
    # the store, since out_ptr0 is declared '*fp16' -- confirm in Triton docs.
    tmp1 = tmp0.to(tl.float32)
    # The added zero block does not change the offsets.
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)


def get_args():
    # Random benchmark inputs: fp32 source and fp16 destination, same shape.
    arg_0 = rand_strided((624, 104, 1, 1), (104, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((624, 104, 1, 1), (104, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,


def call(args):
    # Launch the kernel once on device 0's current raw CUDA stream.
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_141.run(*args, 64896, grid=grid(64896), stream=stream0)


def benchmark_all_configs(args):
    # Delegate to the compiled kernel's benchmark_all_configs() on device 0.
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_141.benchmark_all_configs(*args, 64896, grid=grid(64896))


if __name__ == '__main__':
    # Standalone run: time call() and print milliseconds, GB moved and GB/s.
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/hj/chjvw2ozl3ipjwmejxog5ggykkaa6pfst7j432f3sbhnt2kvth4p.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.add, aten.clone, aten.fill, aten.mul, aten.sigmoid, aten.silu, aten.sub
# aten._native_batch_norm_legit_functional => add_174, add_177, convert_element_type_216, convert_element_type_217, mul_263, mul_269, rsqrt_33, sub_33, var_mean_33
# aten.add => add_341
# aten.clone => clone_24
# aten.fill => full_like_23
# aten.mul => mul_787, mul_788
# aten.sigmoid => sigmoid_87
# aten.silu => convert_element_type_218, convert_element_type_219, mul_270, sigmoid_32
# aten.sub => sub_185
# Pointwise kernel over a (128, 624, 14, 14) tensor: per-channel normalization
# with scale and shift, then two outputs computed from the normalized value y:
# y * sigmoid(y), and sigmoid(y) * (y * (1 - sigmoid(y)) + 1).  The string is a
# complete Python module (kernel + benchmark harness), so it must be validly
# indented Python.
triton_poi_fused__native_batch_norm_legit_functional_add_clone_fill_mul_sigmoid_silu_sub_142 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor

from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[16777216], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp16', 6: '*fp16', 7: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7), equal_to_1=())]})
@triton.jit
def triton_poi_fused__native_batch_norm_legit_functional_add_clone_fill_mul_sigmoid_silu_sub_142(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, out_ptr1, out_ptr2, xnumel, XBLOCK : tl.constexpr):
    # Per element (c = channel):
    #   y    = (in0 - in1[c]) * rsqrt(in2[c] / 25088 + 1e-05) * in3[c] + in4[c]
    #   out1 = y * sigmoid(y)
    #   out2 = sigmoid(y) * (y * (1 - sigmoid(y)) + 1)   # == d/dy [y * sigmoid(y)]
    # in1/in2 look like the per-channel mean and sum of squared deviations
    # (25088 == 128 * 14 * 14) -- confirm against the producing kernels.
    xnumel = 15654912
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    # xmask is computed but unused: every load/store below passes mask=None.
    xmask = xindex < xnumel
    x3 = xindex
    # Channel index for the contiguous (N, 624, 14, 14) layout (196 == 14 * 14).
    x1 = (xindex // 196) % 624
    # The fp16 input is widened to fp32 right after the load.
    tmp0 = tl.load(in_ptr0 + (x3), None).to(tl.float32)
    tmp2 = tl.load(in_ptr1 + (x1), None)
    tmp4 = tl.load(in_ptr2 + (x1), None)
    tmp11 = tl.load(in_ptr3 + (x1), None)
    tmp13 = tl.load(in_ptr4 + (x1), None)
    tmp1 = tmp0.to(tl.float32)
    tmp3 = tmp1 - tmp2
    tmp5 = 25088.0
    tmp6 = tmp4 / tmp5
    tmp7 = 1e-05
    tmp8 = tmp6 + tmp7
    tmp9 = tl.math.rsqrt(tmp8)
    tmp10 = tmp3 * tmp9
    tmp12 = tmp10 * tmp11
    tmp14 = tmp12 + tmp13
    tmp15 = tmp14.to(tl.float32)
    tmp16 = tmp15.to(tl.float32)
    tmp17 = tl.sigmoid(tmp16)
    tmp18 = tmp16 * tmp17
    tmp19 = tmp18.to(tl.float32)
    tmp20 = tl.sigmoid(tmp15)
    tmp21 = 1.0
    tmp22 = tmp21 - tmp20
    tmp23 = tmp15 * tmp22
    tmp24 = tmp23 + tmp21
    tmp25 = tmp20 * tmp24
    # The added zero blocks do not change the offsets.
    tl.store(out_ptr1 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp19, None)
    tl.store(out_ptr2 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp25, None)


def get_args():
    # Random benchmark inputs, one per pointer argument of the kernel
    # (arg_5 and arg_6 are passed as out_ptr1 and out_ptr2).
    arg_0 = rand_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((1, 624, 1, 1), (624, 1, 624, 624), device='cuda:0', dtype=torch.float32)
    arg_2 = rand_strided((1, 624, 1, 1), (624, 1, 624, 624), device='cuda:0', dtype=torch.float32)
    arg_3 = rand_strided((624,), (1,), device='cuda:0', dtype=torch.float32)
    arg_4 = rand_strided((624,), (1,), device='cuda:0', dtype=torch.float32)
    arg_5 = rand_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_6 = rand_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6,


def call(args):
    # Launch the kernel once on device 0's current raw CUDA stream.
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__native_batch_norm_legit_functional_add_clone_fill_mul_sigmoid_silu_sub_142.run(*args, 15654912, grid=grid(15654912), stream=stream0)


def benchmark_all_configs(args):
    # Delegate to the compiled kernel's benchmark_all_configs() on device 0.
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__native_batch_norm_legit_functional_add_clone_fill_mul_sigmoid_silu_sub_142.benchmark_all_configs(*args, 15654912, grid=grid(15654912))


if __name__ == '__main__':
    # Standalone run: time call() and print milliseconds, GB moved and GB/s.
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/by/cbyy7pyyeky2s6spdugmwrwqdvrjfdr7qwcxwapwzhywwhng5ujh.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_220
# Pointwise copy of 5616 elements from an fp32 buffer to an fp16 buffer
# (dtypes per the kernel signature).  The string is a complete Python module
# (kernel + benchmark harness), so it must be validly indented Python.
triton_poi_fused__to_copy_143 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor

from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[8192], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_143(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 5616
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    # Guards the tail of the last block (xindex >= xnumel).
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask)
    # The value stays fp32 here; the narrowing to fp16 presumably happens on
    # the store, since out_ptr0 is declared '*fp16' -- confirm in Triton docs.
    tmp1 = tmp0.to(tl.float32)
    # The added zero block does not change the offsets.
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)


def get_args():
    # Random benchmark inputs: fp32 source and fp16 destination, same shape.
    arg_0 = rand_strided((624, 1, 3, 3), (9, 9, 3, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((624, 1, 3, 3), (9, 9, 3, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,


def call(args):
    # Launch the kernel once on device 0's current raw CUDA stream.
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_143.run(*args, 5616, grid=grid(5616), stream=stream0)


def benchmark_all_configs(args):
    # Delegate to the compiled kernel's benchmark_all_configs() on device 0.
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_143.benchmark_all_configs(*args, 5616, grid=grid(5616))


if __name__ == '__main__':
    # Standalone run: time call() and print milliseconds, GB moved and GB/s.
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/ji/cjiljcgh2wc2yighvqz76tkwzti3zjjxpno5dng2ammsgvfhwvv7.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_226
# Pointwise copy of 32448 elements from an fp32 buffer to an fp16 buffer
# (dtypes per the kernel signature).  The string is a complete Python module
# (kernel + benchmark harness), so it must be validly indented Python.
triton_poi_fused__to_copy_144 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor

from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[32768], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_144(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 32448
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    # Guards the tail of the last block (xindex >= xnumel).
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask)
    # The value stays fp32 here; the narrowing to fp16 presumably happens on
    # the store, since out_ptr0 is declared '*fp16' -- confirm in Triton docs.
    tmp1 = tmp0.to(tl.float32)
    # The added zero block does not change the offsets.
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)


def get_args():
    # Random benchmark inputs: fp32 source and fp16 destination, same shape.
    arg_0 = rand_strided((52, 624, 1, 1), (624, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((52, 624, 1, 1), (624, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,


def call(args):
    # Launch the kernel once on device 0's current raw CUDA stream.
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_144.run(*args, 32448, grid=grid(32448), stream=stream0)


def benchmark_all_configs(args):
    # Delegate to the compiled kernel's benchmark_all_configs() on device 0.
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_144.benchmark_all_configs(*args, 32448, grid=grid(32448))


if __name__ == '__main__':
    # Standalone run: time call() and print milliseconds, GB moved and GB/s.
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/tv/ctvzn4xpxjd5g7oyd7nntvpnuyovmntivt2fyxf53c3mblh6dshr.py
# Original ATen: aten._to_copy, aten.convolution
# aten._to_copy => convert_element_type_225
# aten.convolution => convolution_86
# Pointwise copy of a 52-element fp32 vector to an fp16 vector (dtypes per the
# kernel signature).  The string is a complete Python module (kernel +
# benchmark harness), so it must be validly indented Python.
triton_poi_fused__to_copy_convolution_145 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor

from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[64], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_convolution_145(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 52
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    # Guards the tail of the last block (xindex >= xnumel).
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask)
    # The value stays fp32 here; the narrowing to fp16 presumably happens on
    # the store, since out_ptr0 is declared '*fp16' -- confirm in Triton docs.
    tmp1 = tmp0.to(tl.float32)
    # The added zero block does not change the offsets.
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)


def get_args():
    # Random benchmark inputs: fp32 source and fp16 destination, same shape.
    arg_0 = rand_strided((52,), (1,), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((52,), (1,), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,


def call(args):
    # Launch the kernel once on device 0's current raw CUDA stream.
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_convolution_145.run(*args, 52, grid=grid(52), stream=stream0)


def benchmark_all_configs(args):
    # Delegate to the compiled kernel's benchmark_all_configs() on device 0.
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_convolution_145.benchmark_all_configs(*args, 52, grid=grid(52))


if __name__ == '__main__':
    # Standalone run: time call() and print milliseconds, GB moved and GB/s.
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/sf/csfcbn5kzqvvv6krkr7772b7z66pr6xsuxyhtbgglpb26gkfc5h3.py
# Original ATen: aten._to_copy, aten.convolution, aten.silu
# aten._to_copy => convert_element_type_225
# aten.convolution => convolution_86
# aten.silu => convert_element_type_227, convert_element_type_228, mul_279, sigmoid_34
# Pointwise kernel over a (128, 52, 1, 1) fp16 tensor: adds a 52-element vector
# in place (in_out_ptr0) and writes y * sigmoid(y) of the sum to a second
# buffer.  The string is a complete Python module (kernel + benchmark harness),
# so it must be validly indented Python.
triton_poi_fused__to_copy_convolution_silu_146 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor

from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[8192], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_convolution_silu_146(in_out_ptr0, in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    # Per element: y = in_out[x] + in0[x % 52]; in_out[x] = y; out0[x] = y * sigmoid(y)
    xnumel = 6656
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    # Guards the tail of the last block (xindex >= xnumel).
    xmask = xindex < xnumel
    x2 = xindex
    # Index into the 52-element vector (innermost non-unit dimension has size 52).
    x0 = xindex % 52
    # fp16 inputs are widened to fp32 right after the load.
    tmp0 = tl.load(in_out_ptr0 + (x2), xmask).to(tl.float32)
    tmp1 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
    tmp2 = tmp0 + tmp1
    tmp3 = tmp2.to(tl.float32)
    tmp4 = tl.sigmoid(tmp3)
    tmp5 = tmp3 * tmp4
    tmp6 = tmp5.to(tl.float32)
    # The added zero blocks do not change the offsets.
    tl.store(in_out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp2, xmask)
    tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp6, xmask)


def get_args():
    # Random benchmark inputs: in/out tensor, 52-element vector, output tensor.
    arg_0 = rand_strided((128, 52, 1, 1), (52, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((52,), (1,), device='cuda:0', dtype=torch.float16)
    arg_2 = rand_strided((128, 52, 1, 1), (52, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1, arg_2,


def call(args):
    # Launch the kernel once on device 0's current raw CUDA stream.
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_convolution_silu_146.run(*args, 6656, grid=grid(6656), stream=stream0)


def benchmark_all_configs(args):
    # Delegate to the compiled kernel's benchmark_all_configs() on device 0.
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_convolution_silu_146.benchmark_all_configs(*args, 6656, grid=grid(6656))


if __name__ == '__main__':
    # Standalone run: time call() and print milliseconds, GB moved and GB/s.
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    # num_in_out_args=1: presumably accounts for in_out_ptr0 being both read
    # and written -- confirm against get_num_bytes.
    num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/ik/cikbdagjlpyedk3af4it4ary3yx6g4gpfifz25vxc7r25q224d43.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_230
# Pointwise copy of 32448 elements from an fp32 buffer to an fp16 buffer
# (dtypes per the kernel signature).  The string is a complete Python module
# (kernel + benchmark harness), so it must be validly indented Python.
triton_poi_fused__to_copy_147 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor

from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[32768], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_147(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 32448
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    # Guards the tail of the last block (xindex >= xnumel).
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask)
    # The value stays fp32 here; the narrowing to fp16 presumably happens on
    # the store, since out_ptr0 is declared '*fp16' -- confirm in Triton docs.
    tmp1 = tmp0.to(tl.float32)
    # The added zero block does not change the offsets.
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)


def get_args():
    # Random benchmark inputs: fp32 source and fp16 destination, same shape.
    arg_0 = rand_strided((624, 52, 1, 1), (52, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((624, 52, 1, 1), (52, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,


def call(args):
    # Launch the kernel once on device 0's current raw CUDA stream.
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_147.run(*args, 32448, grid=grid(32448), stream=stream0)


def benchmark_all_configs(args):
    # Delegate to the compiled kernel's benchmark_all_configs() on device 0.
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_147.benchmark_all_configs(*args, 32448, grid=grid(32448))


if __name__ == '__main__':
    # Standalone run: time call() and print milliseconds, GB moved and GB/s.
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/xb/cxb5poqew2wpvurnixpesahtvwzfiadakefnqud2hzokvcgnemog.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_231
# Pointwise copy of 99840 elements from an fp32 buffer to an fp16 buffer
# (dtypes per the kernel signature).  The string is a complete Python module
# (kernel + benchmark harness), so it must be validly indented Python.
triton_poi_fused__to_copy_148 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor

from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[131072], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_148(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 99840
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    # Guards the tail of the last block (xindex >= xnumel).
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask)
    # The value stays fp32 here; the narrowing to fp16 presumably happens on
    # the store, since out_ptr0 is declared '*fp16' -- confirm in Triton docs.
    tmp1 = tmp0.to(tl.float32)
    # The added zero block does not change the offsets.
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)


def get_args():
    # Random benchmark inputs: fp32 source and fp16 destination, same shape.
    arg_0 = rand_strided((160, 624, 1, 1), (624, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((160, 624, 1, 1), (624, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,


def call(args):
    # Launch the kernel once on device 0's current raw CUDA stream.
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_148.run(*args, 99840, grid=grid(99840), stream=stream0)


def benchmark_all_configs(args):
    # Delegate to the compiled kernel's benchmark_all_configs() on device 0.
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_148.benchmark_all_configs(*args, 99840, grid=grid(99840))


if __name__ == '__main__':
    # Standalone run: time call() and print milliseconds, GB moved and GB/s.
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/z7/cz7tf27fdbr7sfmkjktkou2imnr2mnlt7hicdhvs4chclhrw6j37.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => convert_element_type_232, var_mean_35
# Looped reduction kernel: for each of 640 (channel, chunk) pairs it sums 6272
# fp16 elements of a (128, 160, 14, 14) tensor into one fp32 partial sum.  The
# string is a complete Python module (kernel + benchmark harness), so it must
# be validly indented Python.
triton_red_fused__native_batch_norm_legit_functional_149 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor

from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@reduction(
    size_hints=[1024, 8192],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32', 3: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_149(in_ptr0, out_ptr0, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
    xnumel = 640
    rnumel = 6272
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rbase = tl.arange(0, RBLOCK)[None, :]
    # x0: channel (0..159); x1: which of the 4 batch chunks (640 == 160 * 4).
    x0 = xindex % 160
    x1 = (xindex // 160)
    # Per-lane fp32 accumulator, collapsed along the reduction axis after the loop.
    _tmp2 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
    x3 = xindex
    for roffset in range(0, rnumel, RBLOCK):
        rindex = roffset + rbase
        rmask = rindex < rnumel
        r2 = rindex
        # 6272 == 32 * 196: r2 // 196 selects the image inside the chunk
        # (image stride 31360, chunk stride 1003520 == 32 * 31360) and
        # r2 % 196 the position inside the 14x14 plane.
        tmp0 = tl.load(in_ptr0 + ((196*x0) + (31360*(r2 // 196)) + (1003520*x1) + (r2 % 196)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp1 = tmp0.to(tl.float32)
        # Only in-range lanes accumulate; masked lanes keep their old value.
        _tmp2 = tl.where(rmask & xmask, _tmp2 + tmp1, _tmp2)
    tmp2 = tl.sum(_tmp2, 1)[:, None]
    tl.store(out_ptr0 + x3, tmp2, xmask)


def get_args():
    # Random benchmark inputs: fp16 source tensor and fp32 partial-sum buffer.
    arg_0 = rand_strided((128, 160, 14, 14), (31360, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((1, 160, 1, 1, 4), (640, 1, 640, 640, 160), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1,


def call(args):
    # Launch the kernel once on device 0's current raw CUDA stream.
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_red_fused__native_batch_norm_legit_functional_149.run(*args, 640, 6272, grid=grid(640), stream=stream0)


def benchmark_all_configs(args):
    # Delegate to the compiled kernel's benchmark_all_configs() on device 0.
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_red_fused__native_batch_norm_legit_functional_149.benchmark_all_configs(*args, 640, 6272, grid=grid(640))


if __name__ == '__main__':
    # Standalone run: time call() and print milliseconds, GB moved and GB/s.
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/hi/chiuub4omrsfimb4p75xk4vlbwv5mhljjinvhtfc3kqhmfnsyq4q.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => add_185, convert_element_type_232, mul_282, mul_283, var_mean_35
# Persistent reduction kernel: for each of 160 channels it sums 4 fp32 partial
# sums, divides by 25088 and writes that value plus a 0.1/0.9 blend of it with
# a second per-channel input.  The string is a complete Python module (kernel
# + benchmark harness), so it must be validly indented Python.
triton_per_fused__native_batch_norm_legit_functional_150 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import persistent_reduction
from torch._inductor.utils import instance_descriptor

from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@persistent_reduction(
    size_hints=[256, 4],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*fp32', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: 'i32', 5: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4), equal_to_1=())]}
)
@triton.jit
def triton_per_fused__native_batch_norm_legit_functional_150(in_out_ptr0, in_ptr0, in_ptr1, out_ptr0, xnumel, rnumel, XBLOCK : tl.constexpr):
    # Per channel x:
    #   m         = sum_r in0[x + 160 * r] / 25088      (r = 0..3)
    #   in_out[x] = m
    #   out0[x]   = 0.1 * m + 0.9 * in1[x]
    # The 0.1/0.9 blend looks like a momentum-0.1 running-mean update --
    # confirm against the caller.
    xnumel = 160
    rnumel = 4
    RBLOCK: tl.constexpr = 4
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rindex = tl.arange(0, RBLOCK)[None, :]
    rmask = rindex < rnumel
    r1 = rindex
    x0 = xindex
    # The 4 partials of a channel are 160 elements apart.
    tmp0 = tl.load(in_ptr0 + (x0 + (160*r1)), rmask & xmask, other=0)
    tmp8 = tl.load(in_ptr1 + (x0), xmask)
    tmp2 = tl.where(rmask & xmask, tmp0, 0)
    tmp3 = tl.sum(tmp2, 1)[:, None]
    tmp4 = 25088.0
    tmp5 = tmp3 / tmp4
    tmp6 = 0.1
    tmp7 = tmp5 * tmp6
    tmp9 = 0.9
    tmp10 = tmp8 * tmp9
    tmp11 = tmp7 + tmp10
    # in_out_ptr0 is only written here, never read.  The added zero blocks do
    # not change the offsets.
    tl.store(in_out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp5, xmask)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp11, xmask)


def get_args():
    # Random benchmark inputs, one per pointer argument of the kernel.
    arg_0 = rand_strided((1, 160, 1, 1), (160, 1, 160, 160), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((1, 160, 1, 1, 4), (640, 1, 640, 640, 160), device='cuda:0', dtype=torch.float32)
    arg_2 = rand_strided((160,), (1,), device='cuda:0', dtype=torch.float32)
    arg_3 = rand_strided((160,), (1,), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1, arg_2, arg_3,


def call(args):
    # Launch the kernel once on device 0's current raw CUDA stream.
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_per_fused__native_batch_norm_legit_functional_150.run(*args, 160, 4, grid=grid(160), stream=stream0)


def benchmark_all_configs(args):
    # Delegate to the compiled kernel's benchmark_all_configs() on device 0.
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_per_fused__native_batch_norm_legit_functional_150.benchmark_all_configs(*args, 160, 4, grid=grid(160))


if __name__ == '__main__':
    # Standalone run: time call() and print milliseconds, GB moved and GB/s.
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    # num_in_out_args=1: presumably accounts for the in_out_ptr0 argument --
    # confirm against get_num_bytes.
    num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/4b/c4bvtzglf3eflssbrfazd6mjdgmrhoujx4xq6fve2tfmkwyeseru.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => convert_element_type_232, var_mean_35
# Looped reduction kernel: for each of 640 (channel, chunk) pairs it sums the
# squared differences between 6272 fp16 elements of a (128, 160, 14, 14)
# tensor and a per-channel fp32 value.  The string is a complete Python module
# (kernel + benchmark harness), so it must be validly indented Python.
triton_red_fused__native_batch_norm_legit_functional_151 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor

from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@reduction(
    size_hints=[1024, 8192],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*fp16', 1: '*fp32', 2: '*fp32', 3: 'i32', 4: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_151(in_ptr0, in_ptr1, out_ptr0, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
    xnumel = 640
    rnumel = 6272
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rbase = tl.arange(0, RBLOCK)[None, :]
    # x0: channel (0..159); x1: which of the 4 batch chunks (640 == 160 * 4).
    x0 = xindex % 160
    x1 = (xindex // 160)
    # Per-channel value subtracted from every element (presumably the channel
    # mean -- confirm against the caller); loaded once, outside the loop.
    tmp2 = tl.load(in_ptr1 + (x0), xmask)
    # Per-lane fp32 accumulator, collapsed along the reduction axis after the loop.
    _tmp5 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
    x3 = xindex
    for roffset in range(0, rnumel, RBLOCK):
        rindex = roffset + rbase
        rmask = rindex < rnumel
        r2 = rindex
        # 6272 == 32 * 196: r2 // 196 selects the image inside the chunk
        # (image stride 31360, chunk stride 1003520 == 32 * 31360) and
        # r2 % 196 the position inside the 14x14 plane.
        tmp0 = tl.load(in_ptr0 + ((196*x0) + (31360*(r2 // 196)) + (1003520*x1) + (r2 % 196)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp1 = tmp0.to(tl.float32)
        tmp3 = tmp1 - tmp2
        tmp4 = tmp3 * tmp3
        # Only in-range lanes accumulate; masked lanes keep their old value.
        _tmp5 = tl.where(rmask & xmask, _tmp5 + tmp4, _tmp5)
    tmp5 = tl.sum(_tmp5, 1)[:, None]
    tl.store(out_ptr0 + x3, tmp5, xmask)


def get_args():
    # Random benchmark inputs: fp16 source tensor, per-channel fp32 values and
    # the fp32 partial-result buffer.
    arg_0 = rand_strided((128, 160, 14, 14), (31360, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((1, 160, 1, 1), (160, 1, 160, 160), device='cuda:0', dtype=torch.float32)
    arg_2 = rand_strided((1, 160, 1, 1, 4), (640, 1, 640, 640, 160), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1, arg_2,


def call(args):
    # Launch the kernel once on device 0's current raw CUDA stream.
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_red_fused__native_batch_norm_legit_functional_151.run(*args, 640, 6272, grid=grid(640), stream=stream0)


def benchmark_all_configs(args):
    # Delegate to the compiled kernel's benchmark_all_configs() on device 0.
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_red_fused__native_batch_norm_legit_functional_151.benchmark_all_configs(*args, 640, 6272, grid=grid(640))


if __name__ == '__main__':
    # Standalone run: time call() and print milliseconds, GB moved and GB/s.
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/vo/cvo4hmhbgzgrpmknfykmtsywf2s2dd4n7xkbf6ll6hxoj5qx3zny.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => add_184, add_186, convert_element_type_232, mul_284, mul_285, mul_286, rsqrt_35, squeeze_106, var_mean_35
# Persistent reduction over xnumel=160 rows (x) and rnumel=4 columns (r) of
# float32 data. For every x the kernel:
#   * sums in_ptr0[x + 160*r] over r and stores the sum to out_ptr0[x];
#   * stores rsqrt(sum / 25088.0 + 1e-05) to out_ptr1[x];
#   * stores (sum / 25088.0) * 1.0000398612827361 * 0.1 + in_ptr1[x] * 0.9
#     to out_ptr2[x].
# NOTE(review): going by the ATen op name this is presumably the last stage of
# a split batch-norm variance (1.0000398612827361 ~= 25088/25087 and 0.1 looks
# like the running-stat momentum) -- confirm against the caller.
# The string is handed to async_compile.triton as kernel source, so it has to
# be valid Python: the body of every def/with/if inside it must be indented.
# get_args/call/benchmark_all_configs and the __main__ section inside the
# string form a standalone benchmark harness for the kernel.
triton_per_fused__native_batch_norm_legit_functional_152 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import persistent_reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@persistent_reduction(
    size_hints=[256, 4],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*fp32', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: 'i32', 6: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5), equal_to_1=())]}
)
@triton.jit
def triton_per_fused__native_batch_norm_legit_functional_152(in_ptr0, in_ptr1, out_ptr0, out_ptr1, out_ptr2, xnumel, rnumel, XBLOCK : tl.constexpr):
    xnumel = 160
    rnumel = 4
    RBLOCK: tl.constexpr = 4
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rindex = tl.arange(0, RBLOCK)[None, :]
    rmask = rindex < rnumel
    r1 = rindex
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0 + (160*r1)), rmask & xmask, other=0)
    tmp13 = tl.load(in_ptr1 + (x0), xmask)
    tmp2 = tl.where(rmask & xmask, tmp0, 0)
    tmp3 = tl.sum(tmp2, 1)[:, None]
    tmp4 = 25088.0
    tmp5 = tmp3 / tmp4
    tmp6 = 1e-05
    tmp7 = tmp5 + tmp6
    tmp8 = tl.math.rsqrt(tmp7)
    tmp9 = 1.0000398612827361
    tmp10 = tmp5 * tmp9
    tmp11 = 0.1
    tmp12 = tmp10 * tmp11
    tmp14 = 0.9
    tmp15 = tmp13 * tmp14
    tmp16 = tmp12 + tmp15
    tl.store(out_ptr1 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp8, xmask)
    tl.store(out_ptr2 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp16, xmask)
    tl.store(out_ptr0 + x0, tmp3, xmask)
def get_args():
    arg_0 = rand_strided((1, 160, 1, 1, 4), (640, 1, 640, 640, 160), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((160,), (1,), device='cuda:0', dtype=torch.float32)
    arg_2 = rand_strided((1, 160, 1, 1), (160, 1, 160, 160), device='cuda:0', dtype=torch.float32)
    arg_3 = rand_strided((160,), (1,), device='cuda:0', dtype=torch.float32)
    arg_4 = rand_strided((160,), (1,), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1, arg_2, arg_3, arg_4,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_per_fused__native_batch_norm_legit_functional_152.run(*args, 160, 4, grid=grid(160), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_per_fused__native_batch_norm_legit_functional_152.benchmark_all_configs(*args, 160, 4, grid=grid(160))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/7r/c7rznphkbqtbupvosh2bugs6qz6cx4gcbzlpwzvb44okdkqoc7am.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => add_184, add_187, convert_element_type_232, convert_element_type_233, mul_281, mul_287, rsqrt_35, sub_35, var_mean_35
# Pointwise kernel over xnumel=4014080 elements. For flat index x3 and
# x1 = (x3 // 196) % 160 it computes
#   (in_ptr0[x3] - in_ptr1[x1]) * rsqrt(in_ptr2[x1] / 25088.0 + 1e-05)
#       * in_ptr3[x1] + in_ptr4[x1]
# and stores the result to out_ptr0[x3]. in_ptr0 and out_ptr0 are declared
# '*fp16' in the signature, the per-x1 operands '*fp32'.
# xmask is computed but every load/store passes mask=None.
# NOTE(review): presumably in_ptr1 is the per-channel mean, in_ptr2 the summed
# squared deviation and in_ptr3/in_ptr4 the affine weight/bias -- confirm
# against the caller.
# The string is handed to async_compile.triton as kernel source, so it has to
# be valid Python: the body of every def/with/if inside it must be indented.
triton_poi_fused__native_batch_norm_legit_functional_153 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp16', 6: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6), equal_to_1=())]})
@triton.jit
def triton_poi_fused__native_batch_norm_legit_functional_153(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 4014080
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x3 = xindex
    x1 = (xindex // 196) % 160
    tmp0 = tl.load(in_ptr0 + (x3), None).to(tl.float32)
    tmp2 = tl.load(in_ptr1 + (x1), None)
    tmp4 = tl.load(in_ptr2 + (x1), None)
    tmp11 = tl.load(in_ptr3 + (x1), None)
    tmp13 = tl.load(in_ptr4 + (x1), None)
    tmp1 = tmp0.to(tl.float32)
    tmp3 = tmp1 - tmp2
    tmp5 = 25088.0
    tmp6 = tmp4 / tmp5
    tmp7 = 1e-05
    tmp8 = tmp6 + tmp7
    tmp9 = tl.math.rsqrt(tmp8)
    tmp10 = tmp3 * tmp9
    tmp12 = tmp10 * tmp11
    tmp14 = tmp12 + tmp13
    tmp15 = tmp14.to(tl.float32)
    tl.store(out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp15, None)
def get_args():
    arg_0 = rand_strided((128, 160, 14, 14), (31360, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((1, 160, 1, 1), (160, 1, 160, 160), device='cuda:0', dtype=torch.float32)
    arg_2 = rand_strided((1, 160, 1, 1), (160, 1, 160, 160), device='cuda:0', dtype=torch.float32)
    arg_3 = rand_strided((160,), (1,), device='cuda:0', dtype=torch.float32)
    arg_4 = rand_strided((160,), (1,), device='cuda:0', dtype=torch.float32)
    arg_5 = rand_strided((128, 160, 14, 14), (31360, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__native_batch_norm_legit_functional_153.run(*args, 4014080, grid=grid(4014080), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__native_batch_norm_legit_functional_153.benchmark_all_configs(*args, 4014080, grid=grid(4014080))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/25/c25hctynyjlhient6ceczvttfs4npfgnfoty5zhk6pn4ckhqm3ka.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_234
# Pointwise element copy over xnumel=19200 elements: loads in_ptr0[x0]
# (declared '*fp32') under xmask and stores it to out_ptr0[x0] (declared
# '*fp16') under the same mask. The benchmark harness builds both tensors with
# shape (240, 80, 1, 1).
# The string is handed to async_compile.triton as kernel source, so it has to
# be valid Python: the body of every def/with/if inside it must be indented.
triton_poi_fused__to_copy_154 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[32768], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_154(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 19200
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
    arg_0 = rand_strided((240, 80, 1, 1), (80, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((240, 80, 1, 1), (80, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_154.run(*args, 19200, grid=grid(19200), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_154.benchmark_all_configs(*args, 19200, grid=grid(19200))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/yz/cyzjinfa2nwvymmtkqoovwkfypkotg6ek2sunwsuf3vfd35r7l3k.py
# Original ATen: aten.cat
# aten.cat => cat_25
# Pointwise strided copy over xnumel=6021120 fp16 elements: element x2 of
# in_ptr0 is written to out_ptr0[x0 + 94080*x1] with x0 = x2 % 47040 and
# x1 = x2 // 47040, i.e. each run of 47040 source elements lands at a stride
# of 94080 in the destination. Loads and stores pass mask=None; xmask is
# computed but unused.
# NOTE(review): per the ATen op name this presumably fills one half of a
# channel-wise concatenation buffer -- confirm against the caller.
# The string is handed to async_compile.triton as kernel source, so it has to
# be valid Python: the body of every def/with/if inside it must be indented.
triton_poi_fused_cat_155 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[8388608], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_cat_155(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 6021120
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x2 = xindex
    x0 = xindex % 47040
    x1 = (xindex // 47040)
    tmp0 = tl.load(in_ptr0 + (x2), None).to(tl.float32)
    tl.store(out_ptr0 + (x0 + (94080*x1) + tl.zeros([XBLOCK], tl.int32)), tmp0, None)
def get_args():
    arg_0 = rand_strided((128, 240, 14, 14), (47040, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 240, 14, 14), (94080, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused_cat_155.run(*args, 6021120, grid=grid(6021120), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused_cat_155.benchmark_all_configs(*args, 6021120, grid=grid(6021120))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/ks/cksikkrsggj4vc4g42rphhvyb3ox3tanqawsqpy2rpo5qbqc3tin.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => add_189, add_190, add_191, convert_element_type_236, mul_289, mul_290, mul_291, mul_292, mul_293, rsqrt_36, squeeze_109, var_mean_36
# Looped reduction over xnumel=480 rows (x0) and rnumel=25088 elements per row,
# reading in_ptr0 (declared '*fp16') at r1 + 196*x0 + 94080*r2 with
# r1 = rindex % 196 and r2 = rindex // 196. It makes two passes over the data:
#   pass 1: accumulates the row sum in RBLOCK-wide chunks, then
#           sum / 25088.0                     -> in_out_ptr0[x0]
#           that * 0.1 + in_ptr1[x0] * 0.9    -> out_ptr0[x0]
#   pass 2: accumulates the sum of squared differences from the pass-1 value
#           the raw sum                       -> out_ptr1[x0]
#           rsqrt(sum / 25088.0 + 1e-05)      -> out_ptr2[x0]
#           (sum / 25088.0) * 1.0000398612827361 * 0.1 + in_ptr2[x0] * 0.9
#                                             -> out_ptr3[x0]
# The tl.sum over the accumulator and everything after it belong after each
# loop, not inside it: the accumulator is only complete once every chunk has
# been folded in.
# NOTE(review): per the ATen op name these are presumably the batch mean,
# running-mean update, inverse std and running-var update (momentum 0.1,
# 1.0000398612827361 ~= 25088/25087) -- confirm against the caller.
# The string is handed to async_compile.triton as kernel source, so it has to
# be valid Python: the body of every def/for/with/if inside it must be indented.
triton_red_fused__native_batch_norm_legit_functional_156 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@reduction(
    size_hints=[512, 32768],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*fp32', 1: '*fp16', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: 'i32', 9: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_156(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, out_ptr0, out_ptr1, out_ptr2, out_ptr3, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
    xnumel = 480
    rnumel = 25088
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rbase = tl.arange(0, RBLOCK)[None, :]
    x0 = xindex
    _tmp2 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
    for roffset in range(0, rnumel, RBLOCK):
        rindex = roffset + rbase
        rmask = rindex < rnumel
        r1 = rindex % 196
        r2 = (rindex // 196)
        tmp0 = tl.load(in_ptr0 + (r1 + (196*x0) + (94080*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp1 = tmp0.to(tl.float32)
        _tmp2 = tl.where(rmask & xmask, _tmp2 + tmp1, _tmp2)
    tmp2 = tl.sum(_tmp2, 1)[:, None]
    tmp7 = tl.load(in_ptr1 + (x0), xmask)
    tmp3 = 25088.0
    tmp4 = tmp2 / tmp3
    tmp5 = 0.1
    tmp6 = tmp4 * tmp5
    tmp8 = 0.9
    tmp9 = tmp7 * tmp8
    tmp10 = tmp6 + tmp9
    tl.store(in_out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp4, xmask)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp10, xmask)
    _tmp15 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
    for roffset in range(0, rnumel, RBLOCK):
        rindex = roffset + rbase
        rmask = rindex < rnumel
        r1 = rindex % 196
        r2 = (rindex // 196)
        tmp11 = tl.load(in_ptr0 + (r1 + (196*x0) + (94080*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp12 = tmp11.to(tl.float32)
        tmp13 = tmp12 - tmp4
        tmp14 = tmp13 * tmp13
        _tmp15 = tl.where(rmask & xmask, _tmp15 + tmp14, _tmp15)
    tmp15 = tl.sum(_tmp15, 1)[:, None]
    tl.store(out_ptr1 + x0, tmp15, xmask)
    tmp25 = tl.load(in_ptr2 + (x0), xmask)
    tmp16 = 25088.0
    tmp17 = tmp15 / tmp16
    tmp18 = 1e-05
    tmp19 = tmp17 + tmp18
    tmp20 = tl.math.rsqrt(tmp19)
    tmp21 = 1.0000398612827361
    tmp22 = tmp17 * tmp21
    tmp23 = 0.1
    tmp24 = tmp22 * tmp23
    tmp26 = 0.9
    tmp27 = tmp25 * tmp26
    tmp28 = tmp24 + tmp27
    tl.store(out_ptr2 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp20, xmask)
    tl.store(out_ptr3 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp28, xmask)
def get_args():
    arg_0 = rand_strided((1, 480, 1, 1), (480, 1, 480, 480), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((128, 480, 14, 14), (94080, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_2 = rand_strided((480,), (1,), device='cuda:0', dtype=torch.float32)
    arg_3 = rand_strided((480,), (1,), device='cuda:0', dtype=torch.float32)
    arg_4 = rand_strided((480,), (1,), device='cuda:0', dtype=torch.float32)
    arg_5 = rand_strided((1, 480, 1, 1), (480, 1, 480, 480), device='cuda:0', dtype=torch.float32)
    arg_6 = rand_strided((480,), (1,), device='cuda:0', dtype=torch.float32)
    arg_7 = rand_strided((480,), (1,), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_red_fused__native_batch_norm_legit_functional_156.run(*args, 480, 25088, grid=grid(480), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_red_fused__native_batch_norm_legit_functional_156.benchmark_all_configs(*args, 480, 25088, grid=grid(480))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/uq/cuq6a4x4agmkmfpm3znliwswlpgc26f3bnjc5ylnl5l6aeuugh6f.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.add, aten.clone, aten.fill, aten.mul, aten.sigmoid, aten.sub
# aten._native_batch_norm_legit_functional => add_189, add_192, convert_element_type_236, convert_element_type_237, mul_288, mul_294, rsqrt_36, sub_36, var_mean_36
# aten.add => add_336
# aten.clone => clone_27
# aten.fill => full_like_20
# aten.mul => mul_747, mul_748
# aten.sigmoid => sigmoid_84
# aten.sub => sub_169
# Pointwise kernel over xnumel=12042240 elements. For flat index x3 and
# x1 = (x3 // 196) % 480 it computes
#   y = (in_ptr0[x3] - in_ptr1[x1]) * rsqrt(in_ptr2[x1] / 25088.0 + 1e-05)
#           * in_ptr3[x1] + in_ptr4[x1]
# and stores y to out_ptr0[x3]. It then computes s = sigmoid(y) and stores
#   s * (y * (1.0 - s) + 1.0)
# to out_ptr1[x3]; that expression equals d/dy [y * sigmoid(y)].
# in_ptr0, out_ptr0 and out_ptr1 are declared '*fp16', the per-x1 operands
# '*fp32'. Loads and stores pass mask=None; xmask is computed but unused.
# NOTE(review): presumably out_ptr1 is saved for the backward pass of a SiLU
# activation -- confirm against the caller.
# The string is handed to async_compile.triton as kernel source, so it has to
# be valid Python: the body of every def/with/if inside it must be indented.
triton_poi_fused__native_batch_norm_legit_functional_add_clone_fill_mul_sigmoid_sub_157 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[16777216], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp16', 6: '*fp16', 7: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7), equal_to_1=())]})
@triton.jit
def triton_poi_fused__native_batch_norm_legit_functional_add_clone_fill_mul_sigmoid_sub_157(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, out_ptr0, out_ptr1, xnumel, XBLOCK : tl.constexpr):
    xnumel = 12042240
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x3 = xindex
    x1 = (xindex // 196) % 480
    tmp0 = tl.load(in_ptr0 + (x3), None).to(tl.float32)
    tmp2 = tl.load(in_ptr1 + (x1), None)
    tmp4 = tl.load(in_ptr2 + (x1), None)
    tmp11 = tl.load(in_ptr3 + (x1), None)
    tmp13 = tl.load(in_ptr4 + (x1), None)
    tmp1 = tmp0.to(tl.float32)
    tmp3 = tmp1 - tmp2
    tmp5 = 25088.0
    tmp6 = tmp4 / tmp5
    tmp7 = 1e-05
    tmp8 = tmp6 + tmp7
    tmp9 = tl.math.rsqrt(tmp8)
    tmp10 = tmp3 * tmp9
    tmp12 = tmp10 * tmp11
    tmp14 = tmp12 + tmp13
    tmp15 = tmp14.to(tl.float32)
    tmp16 = tl.sigmoid(tmp15)
    tmp17 = 1.0
    tmp18 = tmp17 - tmp16
    tmp19 = tmp15 * tmp18
    tmp20 = tmp19 + tmp17
    tmp21 = tmp16 * tmp20
    tl.store(out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp15, None)
    tl.store(out_ptr1 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp21, None)
def get_args():
    arg_0 = rand_strided((128, 480, 14, 14), (94080, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((1, 480, 1, 1), (480, 1, 480, 480), device='cuda:0', dtype=torch.float32)
    arg_2 = rand_strided((1, 480, 1, 1), (480, 1, 480, 480), device='cuda:0', dtype=torch.float32)
    arg_3 = rand_strided((480,), (1,), device='cuda:0', dtype=torch.float32)
    arg_4 = rand_strided((480,), (1,), device='cuda:0', dtype=torch.float32)
    arg_5 = rand_strided((128, 480, 14, 14), (94080, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_6 = rand_strided((128, 480, 14, 14), (94080, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__native_batch_norm_legit_functional_add_clone_fill_mul_sigmoid_sub_157.run(*args, 12042240, grid=grid(12042240), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__native_batch_norm_legit_functional_add_clone_fill_mul_sigmoid_sub_157.benchmark_all_configs(*args, 12042240, grid=grid(12042240))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/hb/chbvxlmers2jwf43nczkvjub2nxsnhvu5csbgr2u6jwg4rlcdpzl.py
# Original ATen: aten.split_with_sizes
# aten.split_with_sizes => getitem_242
# Pointwise kernel over xnumel=3010560 fp16 elements. With x0 = x2 % 23520 and
# x1 = x2 // 23520 it loads v = in_ptr0[x0 + 94080*x1], computes
# v * sigmoid(v) and stores the result densely to out_ptr0[x2]. In other words
# it reads the first 23520 elements of every 94080-element run of the input.
# Loads and stores pass mask=None; xmask is computed but unused.
# NOTE(review): per the ATen op name this presumably materialises the first
# 120-channel chunk of a 480-channel split, fused with a SiLU -- confirm
# against the caller.
# The string is handed to async_compile.triton as kernel source, so it has to
# be valid Python: the body of every def/with/if inside it must be indented.
triton_poi_fused_split_with_sizes_158 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_split_with_sizes_158(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 3010560
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex % 23520
    x1 = (xindex // 23520)
    x2 = xindex
    tmp0 = tl.load(in_ptr0 + (x0 + (94080*x1)), None).to(tl.float32)
    tmp1 = tmp0.to(tl.float32)
    tmp2 = tl.sigmoid(tmp1)
    tmp3 = tmp1 * tmp2
    tmp4 = tmp3.to(tl.float32)
    tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp4, None)
def get_args():
    arg_0 = rand_strided((128, 480, 14, 14), (94080, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 120, 14, 14), (23520, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused_split_with_sizes_158.run(*args, 3010560, grid=grid(3010560), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused_split_with_sizes_158.benchmark_all_configs(*args, 3010560, grid=grid(3010560))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/cu/ccuyxkbjwe7ff4mqowk3q7to3cbjg3o7g2sh6doygm5y2bnlvlfi.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_241
# Pointwise element copy over xnumel=3000 elements: loads in_ptr0[x0]
# (declared '*fp32') under xmask and stores it to out_ptr0[x0] (declared
# '*fp16') under the same mask. The benchmark harness builds both tensors with
# shape (120, 1, 5, 5). xnumel is not listed in divisible_by_16 (3000 is not a
# multiple of 16).
# The string is handed to async_compile.triton as kernel source, so it has to
# be valid Python: the body of every def/with/if inside it must be indented.
triton_poi_fused__to_copy_159 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4096], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_159(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 3000
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
    arg_0 = rand_strided((120, 1, 5, 5), (25, 25, 5, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((120, 1, 5, 5), (25, 25, 5, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_159.run(*args, 3000, grid=grid(3000), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_159.benchmark_all_configs(*args, 3000, grid=grid(3000))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/dg/cdglxs23pt7h2wls62uy6ewinjtjctcradtpw76t3qcrvnhso4kr.py
# Original ATen: aten.split_with_sizes
# aten.split_with_sizes => getitem_247
# Pointwise kernel over xnumel=3010560 fp16 elements. With x0 = x2 % 23520 and
# x1 = x2 // 23520 it loads v = in_ptr0[23520 + x0 + 94080*x1], computes
# v * sigmoid(v) and stores the result densely to out_ptr0[x2]. In other words
# it reads elements [23520, 47040) of every 94080-element run of the input.
# Loads and stores pass mask=None; xmask is computed but unused.
# NOTE(review): per the ATen op name this presumably materialises the second
# 120-channel chunk of a 480-channel split, fused with a SiLU -- confirm
# against the caller.
# The string is handed to async_compile.triton as kernel source, so it has to
# be valid Python: the body of every def/with/if inside it must be indented.
triton_poi_fused_split_with_sizes_160 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_split_with_sizes_160(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 3010560
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex % 23520
    x1 = (xindex // 23520)
    x2 = xindex
    tmp0 = tl.load(in_ptr0 + (23520 + x0 + (94080*x1)), None).to(tl.float32)
    tmp1 = tmp0.to(tl.float32)
    tmp2 = tl.sigmoid(tmp1)
    tmp3 = tmp1 * tmp2
    tmp4 = tmp3.to(tl.float32)
    tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp4, None)
def get_args():
    arg_0 = rand_strided((128, 480, 14, 14), (94080, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 120, 14, 14), (23520, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused_split_with_sizes_160.run(*args, 3010560, grid=grid(3010560), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused_split_with_sizes_160.benchmark_all_configs(*args, 3010560, grid=grid(3010560))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/v4/cv4uf7ojdb6edtvnqbvbxjvi624yvygrldwzdl6uy2gdzz3sghqp.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_242
# Pointwise element copy over xnumel=5880 elements: loads in_ptr0[x0]
# (declared '*fp32') under xmask and stores it to out_ptr0[x0] (declared
# '*fp16') under the same mask. The benchmark harness builds both tensors with
# shape (120, 1, 7, 7). xnumel is not listed in divisible_by_16 (5880 is not a
# multiple of 16).
# The string is handed to async_compile.triton as kernel source, so it has to
# be valid Python: the body of every def/with/if inside it must be indented.
triton_poi_fused__to_copy_161 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[8192], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_161(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 5880
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
    arg_0 = rand_strided((120, 1, 7, 7), (49, 49, 7, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((120, 1, 7, 7), (49, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_161.run(*args, 5880, grid=grid(5880), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_161.benchmark_all_configs(*args, 5880, grid=grid(5880))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/bn/cbnnjypiavc2wmccrce2li6mllqpp2n4naaetozpnboypgpizsqn.py
# Original ATen: aten.split_with_sizes
# aten.split_with_sizes => getitem_252
# Pointwise kernel over xnumel=3010560 fp16 elements. With x0 = x2 % 23520 and
# x1 = x2 // 23520 it loads v = in_ptr0[47040 + x0 + 94080*x1], computes
# v * sigmoid(v) and stores the result densely to out_ptr0[x2]. In other words
# it reads elements [47040, 70560) of every 94080-element run of the input.
# Loads and stores pass mask=None; xmask is computed but unused.
# NOTE(review): per the ATen op name this presumably materialises the third
# 120-channel chunk of a 480-channel split, fused with a SiLU -- confirm
# against the caller.
# The string is handed to async_compile.triton as kernel source, so it has to
# be valid Python: the body of every def/with/if inside it must be indented.
triton_poi_fused_split_with_sizes_162 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_split_with_sizes_162(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 3010560
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex % 23520
    x1 = (xindex // 23520)
    x2 = xindex
    tmp0 = tl.load(in_ptr0 + (47040 + x0 + (94080*x1)), None).to(tl.float32)
    tmp1 = tmp0.to(tl.float32)
    tmp2 = tl.sigmoid(tmp1)
    tmp3 = tmp1 * tmp2
    tmp4 = tmp3.to(tl.float32)
    tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp4, None)
def get_args():
    arg_0 = rand_strided((128, 480, 14, 14), (94080, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 120, 14, 14), (23520, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused_split_with_sizes_162.run(*args, 3010560, grid=grid(3010560), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused_split_with_sizes_162.benchmark_all_configs(*args, 3010560, grid=grid(3010560))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/4m/c4mgwvuur32h35cj53cazh6flbatygb5baks32qmyo4zqc7drw5k.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_243
# Pointwise element copy over xnumel=9720 elements: loads in_ptr0[x0]
# (declared '*fp32') under xmask and stores it to out_ptr0[x0] (declared
# '*fp16') under the same mask. The benchmark harness builds both tensors with
# shape (120, 1, 9, 9). xnumel is not listed in divisible_by_16 (9720 is not a
# multiple of 16).
# The string is handed to async_compile.triton as kernel source, so it has to
# be valid Python: the body of every def/with/if inside it must be indented.
triton_poi_fused__to_copy_163 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[16384], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_163(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 9720
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
    arg_0 = rand_strided((120, 1, 9, 9), (81, 81, 9, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((120, 1, 9, 9), (81, 81, 9, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_163.run(*args, 9720, grid=grid(9720), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_163.benchmark_all_configs(*args, 9720, grid=grid(9720))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/4m/c4mepa4uirogjqorspxpl4t5fx2rzmjdzatpwzpg64qrcyuvg7m4.py
# Original ATen: aten.split_with_sizes
# aten.split_with_sizes => getitem_257
# Hands the Triton source below to `async_compile.triton` and binds the result
# to `triton_poi_fused_split_with_sizes_164`.
# Kernel: for each of 3010560 output elements, reads
# in_ptr0[70560 + (xindex % 23520) + 94080 * (xindex // 23520)], computes
# x * sigmoid(x) on it and writes the result to out_ptr0[xindex].
# With the get_args layout (strides 94080, 196, 14, 1) the 70560 offset equals
# 360 * 196, i.e. the read window starts at channel 360 of each 480-channel row.
# Although the header names only aten.split_with_sizes, the body also applies
# the sigmoid multiply shown above.
# `xmask` is computed but unused: both the load and the store pass None.
# The string also embeds a self-benchmark (get_args / call / __main__).
# NOTE(review): the whole body is a runtime string, so it is left untouched.
# As captured here it has no indentation and would not parse as Python;
# presumably whitespace was lost in the copy -- verify against the original.
triton_poi_fused_split_with_sizes_164 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_split_with_sizes_164(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 3010560
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex % 23520
x1 = (xindex // 23520)
x2 = xindex
tmp0 = tl.load(in_ptr0 + (70560 + x0 + (94080*x1)), None).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
tmp2 = tl.sigmoid(tmp1)
tmp3 = tmp1 * tmp2
tmp4 = tmp3.to(tl.float32)
tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp4, None)
def get_args():
arg_0 = rand_strided((128, 480, 14, 14), (94080, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 120, 14, 14), (23520, 196, 14, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_split_with_sizes_164.run(*args, 3010560, grid=grid(3010560), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_split_with_sizes_164.benchmark_all_configs(*args, 3010560, grid=grid(3010560))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/qv/cqvo24surldsdqwccrhisp4chcnkg2otiq5qykrtymzijwmytnb7.py
# Original ATen: aten.cat
# aten.cat => cat_26
# Hands the Triton source below to `async_compile.triton` and binds the result
# to `triton_poi_fused_cat_165`.
# Kernel: copies 3010560 elements from in_ptr0[xindex] to
# out_ptr0[(xindex % 23520) + 94080 * (xindex // 23520)], i.e. consecutive
# 23520-element runs of the input land 94080 elements apart in the output.
# In get_args the output is built with strides (94080, 196, 14, 1) for a
# (128, 120, 14, 14) shape, which matches that addressing.
# `xmask` is computed but unused: both the load and the store pass None.
# The string also embeds a self-benchmark (get_args / call / __main__).
# NOTE(review): the whole body is a runtime string, so it is left untouched.
# As captured here it has no indentation and would not parse as Python;
# presumably whitespace was lost in the copy -- verify against the original.
triton_poi_fused_cat_165 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_cat_165(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 3010560
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x2 = xindex
x0 = xindex % 23520
x1 = (xindex // 23520)
tmp0 = tl.load(in_ptr0 + (x2), None).to(tl.float32)
tl.store(out_ptr0 + (x0 + (94080*x1) + tl.zeros([XBLOCK], tl.int32)), tmp0, None)
def get_args():
arg_0 = rand_strided((128, 120, 14, 14), (23520, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 120, 14, 14), (94080, 196, 14, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_cat_165.run(*args, 3010560, grid=grid(3010560), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_cat_165.benchmark_all_configs(*args, 3010560, grid=grid(3010560))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/25/c25wuycbavajx7kwcvwpcba6kxp6j5oofnjkbplrrxckdmxy3shz.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.mean, aten.silu
# aten._native_batch_norm_legit_functional => add_194, add_197, convert_element_type_244, convert_element_type_245, mul_296, mul_302, rsqrt_37, sub_37, var_mean_37
# aten.mean => mean_9
# aten.silu => convert_element_type_246, convert_element_type_247, mul_303, sigmoid_37
# Hands the Triton source below to `async_compile.triton` and binds the result
# to `triton_per_fused__native_batch_norm_legit_functional_mean_silu_166`.
# Kernel (persistent reduction, xnumel = 61440 rows, rnumel = 196, RBLOCK = 256):
#   1. loads row x3 of in_ptr0 (196 values, lanes >= 196 read as 0 via rmask);
#   2. loads four per-row scalars from in_ptr1..in_ptr4 at index xindex % 480;
#   3. computes (x - in_ptr1) * rsqrt(in_ptr2 / 25088.0 + 1e-05) * in_ptr3
#      + in_ptr4 and stores it to out_ptr0 at the same row/column position;
#   4. applies y * sigmoid(y) to that value, zeroes the masked lanes, sums over
#      the row, divides by 196.0 and stores one value per row to out_ptr2.
# Presumably in_ptr1 is the batch-norm mean, in_ptr2 a sum of squared
# deviations over 25088 (= 128 * 196) samples, and in_ptr3/in_ptr4 the affine
# weight/bias -- TODO confirm against the graph that produced this kernel.
# `xmask` is computed but unused; only `rmask` is applied.
# The string also embeds a self-benchmark (get_args / call / __main__).
# NOTE(review): the whole body is a runtime string, so it is left untouched.
# As captured here it has no indentation and would not parse as Python;
# presumably whitespace was lost in the copy -- verify against the original.
triton_per_fused__native_batch_norm_legit_functional_mean_silu_166 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import persistent_reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@persistent_reduction(
size_hints=[65536, 256],
reduction_hint=ReductionHint.INNER,
filename=__file__,
meta={'signature': {0: '*fp16', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp16', 6: '*fp16', 7: 'i32', 8: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7), equal_to_1=())]}
)
@triton.jit
def triton_per_fused__native_batch_norm_legit_functional_mean_silu_166(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, out_ptr0, out_ptr2, xnumel, rnumel, XBLOCK : tl.constexpr):
xnumel = 61440
rnumel = 196
RBLOCK: tl.constexpr = 256
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
xmask = xindex < xnumel
rindex = tl.arange(0, RBLOCK)[None, :]
rmask = rindex < rnumel
r2 = rindex
x3 = xindex
x0 = xindex % 480
tmp0 = tl.load(in_ptr0 + (r2 + (196*x3)), rmask, other=0).to(tl.float32)
tmp2 = tl.load(in_ptr1 + (x0), None)
tmp4 = tl.load(in_ptr2 + (x0), None)
tmp11 = tl.load(in_ptr3 + (x0), None)
tmp13 = tl.load(in_ptr4 + (x0), None)
tmp1 = tmp0.to(tl.float32)
tmp3 = tmp1 - tmp2
tmp5 = 25088.0
tmp6 = tmp4 / tmp5
tmp7 = 1e-05
tmp8 = tmp6 + tmp7
tmp9 = tl.math.rsqrt(tmp8)
tmp10 = tmp3 * tmp9
tmp12 = tmp10 * tmp11
tmp14 = tmp12 + tmp13
tmp15 = tmp14.to(tl.float32)
tmp16 = tmp15.to(tl.float32)
tmp17 = tl.sigmoid(tmp16)
tmp18 = tmp16 * tmp17
tmp19 = tmp18.to(tl.float32)
tmp20 = tmp19.to(tl.float32)
tmp22 = tl.where(rmask, tmp20, 0)
tmp23 = tl.sum(tmp22, 1)[:, None]
tmp24 = 196.0
tmp25 = tmp23 / tmp24
tmp26 = tmp25.to(tl.float32)
tl.store(out_ptr0 + (r2 + (196*x3) + tl.zeros([XBLOCK, RBLOCK], tl.int32)), tmp15, rmask)
tl.store(out_ptr2 + (x3 + tl.zeros([XBLOCK, 1], tl.int32)), tmp26, None)
def get_args():
arg_0 = rand_strided((128, 480, 14, 14), (94080, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((1, 480, 1, 1), (480, 1, 480, 480), device='cuda:0', dtype=torch.float32)
arg_2 = rand_strided((1, 480, 1, 1), (480, 1, 480, 480), device='cuda:0', dtype=torch.float32)
arg_3 = rand_strided((480,), (1,), device='cuda:0', dtype=torch.float32)
arg_4 = rand_strided((480,), (1,), device='cuda:0', dtype=torch.float32)
arg_5 = rand_strided((128, 480, 14, 14), (94080, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_6 = rand_strided((128, 480, 1, 1), (480, 1, 1, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_per_fused__native_batch_norm_legit_functional_mean_silu_166.run(*args, 61440, 196, grid=grid(61440), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_per_fused__native_batch_norm_legit_functional_mean_silu_166.benchmark_all_configs(*args, 61440, 196, grid=grid(61440))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/vh/cvhn4ymz4o7zw5xdmhfhtnvjkm7lfirplpypnuvceox4nfhcy7wo.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_249
# Hands the Triton source below to `async_compile.triton` and binds the result
# to `triton_poi_fused__to_copy_167`.
# Kernel: element-wise copy of 38400 values from `in_ptr0` (signature '*fp32')
# to `out_ptr0` (signature '*fp16'); `xmask` disables lanes with
# xindex >= xnumel on both the load and the store.
# The value stored (tmp1) is still tl.float32, so the narrowing to fp16 is
# presumably done implicitly by `tl.store` -- confirm against Triton's docs.
# get_args benchmarks it on (80, 480, 1, 1) tensors.
# NOTE(review): the whole body is a runtime string, so it is left untouched.
# As captured here it has no indentation and would not parse as Python;
# presumably whitespace was lost in the copy -- verify against the original.
triton_poi_fused__to_copy_167 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[65536], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_167(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 38400
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((80, 480, 1, 1), (480, 1, 1, 1), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((80, 480, 1, 1), (480, 1, 1, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_167.run(*args, 38400, grid=grid(38400), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_167.benchmark_all_configs(*args, 38400, grid=grid(38400))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/uc/cuc5yc2wu6mlfc426p45ysrpjdvcf67n4fnoghr4zijaa4qc4jub.py
# Original ATen: aten._to_copy, aten.convolution
# aten._to_copy => convert_element_type_248
# aten.convolution => convolution_95
# Hands the Triton source below to `async_compile.triton` and binds the result
# to `triton_poi_fused__to_copy_convolution_168`.
# Kernel: element-wise copy of 80 values from `in_ptr0` (signature '*fp32')
# to `out_ptr0` (signature '*fp16'); `xmask` disables lanes with
# xindex >= xnumel on both the load and the store.
# The value stored (tmp1) is still tl.float32, so the narrowing to fp16 is
# presumably done implicitly by `tl.store` -- confirm against Triton's docs.
# The header comment lists aten.convolution, but no convolution is computed in
# this body; presumably the copied vector feeds that op -- verify in the caller.
# NOTE(review): the whole body is a runtime string, so it is left untouched.
# As captured here it has no indentation and would not parse as Python;
# presumably whitespace was lost in the copy -- verify against the original.
triton_poi_fused__to_copy_convolution_168 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[128], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_convolution_168(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 80
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((80,), (1,), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((80,), (1,), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_convolution_168.run(*args, 80, grid=grid(80), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_convolution_168.benchmark_all_configs(*args, 80, grid=grid(80))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/w4/cw4m6y7egnkzpotuw7yanoiegenydq7w76ehn3uqkz73dfsxrsvq.py
# Original ATen: aten._to_copy, aten.convolution, aten.silu
# aten._to_copy => convert_element_type_248
# aten.convolution => convolution_95
# aten.silu => convert_element_type_250, convert_element_type_251, mul_304, sigmoid_38
# Hands the Triton source below to `async_compile.triton` and binds the result
# to `triton_poi_fused__to_copy_convolution_silu_169`.
# Kernel, over 10240 elements:
#   s = in_out_ptr0[xindex] + in_ptr0[xindex % 80]
#   in_out_ptr0[xindex] = s                 (written back in place)
#   out_ptr0[xindex]    = s * sigmoid(s)
# `in_out_ptr0` is listed in 'mutated_arg_names', and the benchmark calls
# get_num_bytes with num_in_out_args=1 accordingly.
# Presumably the in-place add is the convolution's bias add -- confirm in the
# calling wrapper, which is outside this view.
# `xmask` is computed but unused: every load and store passes None.
# NOTE(review): the whole body is a runtime string, so it is left untouched.
# As captured here it has no indentation and would not parse as Python;
# presumably whitespace was lost in the copy -- verify against the original.
triton_poi_fused__to_copy_convolution_silu_169 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[16384], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_convolution_silu_169(in_out_ptr0, in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 10240
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x2 = xindex
x0 = xindex % 80
tmp0 = tl.load(in_out_ptr0 + (x2), None).to(tl.float32)
tmp1 = tl.load(in_ptr0 + (x0), None).to(tl.float32)
tmp2 = tmp0 + tmp1
tmp3 = tmp2.to(tl.float32)
tmp4 = tl.sigmoid(tmp3)
tmp5 = tmp3 * tmp4
tmp6 = tmp5.to(tl.float32)
tl.store(in_out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp2, None)
tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp6, None)
def get_args():
arg_0 = rand_strided((128, 80, 1, 1), (80, 1, 1, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((80,), (1,), device='cuda:0', dtype=torch.float16)
arg_2 = rand_strided((128, 80, 1, 1), (80, 1, 1, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1, arg_2,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_convolution_silu_169.run(*args, 10240, grid=grid(10240), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_convolution_silu_169.benchmark_all_configs(*args, 10240, grid=grid(10240))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/5w/c5wu2byqoqpdpilokyxdylwqvzl3vlutvi3qed6ouxeny7v3ma4k.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_253
# Hands the Triton source below to `async_compile.triton` and binds the result
# to `triton_poi_fused__to_copy_170`.
# Kernel: element-wise copy of 38400 values from `in_ptr0` (signature '*fp32')
# to `out_ptr0` (signature '*fp16'); `xmask` disables lanes with
# xindex >= xnumel on both the load and the store.
# The value stored (tmp1) is still tl.float32, so the narrowing to fp16 is
# presumably done implicitly by `tl.store` -- confirm against Triton's docs.
# get_args benchmarks it on (480, 80, 1, 1) tensors.
# NOTE(review): the whole body is a runtime string, so it is left untouched.
# As captured here it has no indentation and would not parse as Python;
# presumably whitespace was lost in the copy -- verify against the original.
triton_poi_fused__to_copy_170 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[65536], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_170(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 38400
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((480, 80, 1, 1), (80, 1, 1, 1), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((480, 80, 1, 1), (80, 1, 1, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_170.run(*args, 38400, grid=grid(38400), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_170.benchmark_all_configs(*args, 38400, grid=grid(38400))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/t6/ct65ncuugfm3kp7u2z2xcftkozxdb7egjj3feed7rrbxlnc25mxn.py
# Original ATen: aten._to_copy, aten.convolution
# aten._to_copy => convert_element_type_252
# aten.convolution => convolution_96
# Hands the Triton source below to `async_compile.triton` and binds the result
# to `triton_poi_fused__to_copy_convolution_171`.
# Kernel: element-wise copy of 480 values from `in_ptr0` (signature '*fp32')
# to `out_ptr0` (signature '*fp16'); `xmask` disables lanes with
# xindex >= xnumel on both the load and the store.
# The value stored (tmp1) is still tl.float32, so the narrowing to fp16 is
# presumably done implicitly by `tl.store` -- confirm against Triton's docs.
# The header comment lists aten.convolution, but no convolution is computed in
# this body; presumably the copied vector feeds that op -- verify in the caller.
# NOTE(review): the whole body is a runtime string, so it is left untouched.
# As captured here it has no indentation and would not parse as Python;
# presumably whitespace was lost in the copy -- verify against the original.
triton_poi_fused__to_copy_convolution_171 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[512], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_convolution_171(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 480
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((480,), (1,), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((480,), (1,), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_convolution_171.run(*args, 480, grid=grid(480), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_convolution_171.benchmark_all_configs(*args, 480, grid=grid(480))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/q3/cq36nnpdlb4xwdvwsiktoah4r62ugwqlfeikgt4twhaobbsdqnku.py
# Original ATen: aten._to_copy, aten.convolution
# aten._to_copy => convert_element_type_252
# aten.convolution => convolution_96
# Hands the Triton source below to `async_compile.triton` and binds the result
# to `triton_poi_fused__to_copy_convolution_172`.
# Kernel, over 61440 elements, in place:
#   in_out_ptr0[xindex] = in_out_ptr0[xindex] + in_ptr0[xindex % 480]
# `in_out_ptr0` is listed in 'mutated_arg_names', and the benchmark calls
# get_num_bytes with num_in_out_args=1 accordingly.
# Presumably this is the convolution's bias add -- confirm in the calling
# wrapper, which is outside this view.
# `xmask` is computed but unused: every load and store passes None.
# NOTE(review): the whole body is a runtime string, so it is left untouched.
# As captured here it has no indentation and would not parse as Python;
# presumably whitespace was lost in the copy -- verify against the original.
triton_poi_fused__to_copy_convolution_172 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[65536], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_convolution_172(in_out_ptr0, in_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 61440
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x2 = xindex
x0 = xindex % 480
tmp0 = tl.load(in_out_ptr0 + (x2), None).to(tl.float32)
tmp1 = tl.load(in_ptr0 + (x0), None).to(tl.float32)
tmp2 = tmp0 + tmp1
tl.store(in_out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp2, None)
def get_args():
arg_0 = rand_strided((128, 480, 1, 1), (480, 1, 1, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((480,), (1,), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_convolution_172.run(*args, 61440, grid=grid(61440), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_convolution_172.benchmark_all_configs(*args, 61440, grid=grid(61440))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/ce/cce2z44226346eg4knvdzt24sfsujmrpjne4vcofj37miaugwnek.py
# Original ATen: aten.mul, aten.sigmoid, aten.silu
# aten.mul => mul_305
# aten.sigmoid => sigmoid_39
# aten.silu => convert_element_type_246, convert_element_type_247, mul_303, sigmoid_37
# Hands the Triton source below to `async_compile.triton` and binds the result
# to `triton_poi_fused_mul_sigmoid_silu_173`.
# Kernel, over 12042240 elements:
#   a = in_ptr0[xindex]
#   g = in_ptr1[xindex // 196]          (one gate value per run of 196 elements)
#   out_ptr0[xindex] = (a * sigmoid(a)) * sigmoid(g)
# In get_args, in_ptr1 is a (128, 480, 1, 1) tensor and in_ptr0/out_ptr0 are
# (128, 480, 14, 14), so each gate value covers one 14x14 plane there.
# `xmask` is computed but unused: every load and store passes None.
# The string also embeds a self-benchmark (get_args / call / __main__).
# NOTE(review): the whole body is a runtime string, so it is left untouched.
# As captured here it has no indentation and would not parse as Python;
# presumably whitespace was lost in the copy -- verify against the original.
triton_poi_fused_mul_sigmoid_silu_173 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[16777216], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3), equal_to_1=())]})
@triton.jit
def triton_poi_fused_mul_sigmoid_silu_173(in_ptr0, in_ptr1, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 12042240
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x2 = xindex
x1 = (xindex // 196)
tmp0 = tl.load(in_ptr0 + (x2), None).to(tl.float32)
tmp5 = tl.load(in_ptr1 + (x1), None).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
tmp2 = tl.sigmoid(tmp1)
tmp3 = tmp1 * tmp2
tmp4 = tmp3.to(tl.float32)
tmp6 = tl.sigmoid(tmp5)
tmp7 = tmp4 * tmp6
tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp7, None)
def get_args():
arg_0 = rand_strided((128, 480, 14, 14), (94080, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 480, 1, 1), (480, 1, 1, 1), device='cuda:0', dtype=torch.float16)
arg_2 = rand_strided((128, 480, 14, 14), (94080, 196, 14, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1, arg_2,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_mul_sigmoid_silu_173.run(*args, 12042240, grid=grid(12042240), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_mul_sigmoid_silu_173.benchmark_all_configs(*args, 12042240, grid=grid(12042240))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/jt/cjtarvyynr3tkfvm37qgtcwaksnjyb62otlxal5l5thrlfpz4yoz.py
# Original ATen: aten.split_with_sizes
# aten.split_with_sizes => getitem_260
# Hands the Triton source below to `async_compile.triton` and binds the result
# to `triton_poi_fused_split_with_sizes_174`.
# Kernel: copies 6021120 elements, reading
# in_ptr0[(xindex % 47040) + 94080 * (xindex // 47040)] and writing
# out_ptr0[xindex]; i.e. the first 47040 elements of every 94080-element input
# run are packed contiguously into the output.
# With the get_args layout (strides 94080, 196, 14, 1) that is channels
# 0..239 of a (128, 480, 14, 14) tensor, since 47040 = 240 * 196.
# `xmask` is computed but unused: both the load and the store pass None.
# The string also embeds a self-benchmark (get_args / call / __main__).
# NOTE(review): the whole body is a runtime string, so it is left untouched.
# As captured here it has no indentation and would not parse as Python;
# presumably whitespace was lost in the copy -- verify against the original.
triton_poi_fused_split_with_sizes_174 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[8388608], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_split_with_sizes_174(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 6021120
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex % 47040
x1 = (xindex // 47040)
x2 = xindex
tmp0 = tl.load(in_ptr0 + (x0 + (94080*x1)), None).to(tl.float32)
tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp0, None)
def get_args():
arg_0 = rand_strided((128, 480, 14, 14), (94080, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 240, 14, 14), (47040, 196, 14, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_split_with_sizes_174.run(*args, 6021120, grid=grid(6021120), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_split_with_sizes_174.benchmark_all_configs(*args, 6021120, grid=grid(6021120))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/zb/czbuvo3mdbqgu4vhbh2rqqgxj7l4q2oef5prb5xafk4aclfa4wgv.py
# Original ATen: aten.split_with_sizes
# aten.split_with_sizes => getitem_261
# Hands the Triton source below to `async_compile.triton` and binds the result
# to `triton_poi_fused_split_with_sizes_175`.
# Kernel: copies 6021120 elements, reading
# in_ptr0[47040 + (xindex % 47040) + 94080 * (xindex // 47040)] and writing
# out_ptr0[xindex]; i.e. the second 47040 elements of every 94080-element
# input run are packed contiguously into the output.
# With the get_args layout (strides 94080, 196, 14, 1) that is channels
# 240..479 of a (128, 480, 14, 14) tensor, since 47040 = 240 * 196.
# `xmask` is computed but unused: both the load and the store pass None.
# The string also embeds a self-benchmark (get_args / call / __main__).
# NOTE(review): the whole body is a runtime string, so it is left untouched.
# As captured here it has no indentation and would not parse as Python;
# presumably whitespace was lost in the copy -- verify against the original.
triton_poi_fused_split_with_sizes_175 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[8388608], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_split_with_sizes_175(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 6021120
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex % 47040
x1 = (xindex // 47040)
x2 = xindex
tmp0 = tl.load(in_ptr0 + (47040 + x0 + (94080*x1)), None).to(tl.float32)
tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp0, None)
def get_args():
arg_0 = rand_strided((128, 480, 14, 14), (94080, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 240, 14, 14), (47040, 196, 14, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_split_with_sizes_175.run(*args, 6021120, grid=grid(6021120), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_split_with_sizes_175.benchmark_all_configs(*args, 6021120, grid=grid(6021120))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/fo/cfobovxlpvp6vyrqqsfdz6jlgrpfwpemctlxrrgsykw3zih3en3k.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_254
# Hands the Triton source below to `async_compile.triton` and binds the result
# to `triton_poi_fused__to_copy_176`.
# Kernel: element-wise copy of 19200 values from `in_ptr0` (signature '*fp32')
# to `out_ptr0` (signature '*fp16'); `xmask` disables lanes with
# xindex >= xnumel on both the load and the store.
# The value stored (tmp1) is still tl.float32, so the narrowing to fp16 is
# presumably done implicitly by `tl.store` -- confirm against Triton's docs.
# get_args benchmarks it on (80, 240, 1, 1) tensors.
# NOTE(review): the whole body is a runtime string, so it is left untouched.
# As captured here it has no indentation and would not parse as Python;
# presumably whitespace was lost in the copy -- verify against the original.
triton_poi_fused__to_copy_176 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[32768], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_176(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 19200
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask)
tmp1 = tmp0.to(tl.float32)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
arg_0 = rand_strided((80, 240, 1, 1), (240, 1, 1, 1), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((80, 240, 1, 1), (240, 1, 1, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused__to_copy_176.run(*args, 19200, grid=grid(19200), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused__to_copy_176.benchmark_all_configs(*args, 19200, grid=grid(19200))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/66/c66wd4retmi6azmfzfxmll5pmv3btijpcst56cykpuynmmhzhq23.py
# Original ATen: aten.cat
# aten.cat => cat_27
#
# Pointwise kernel over 2007040 fp16 elements (hard-coded in the kernel body).
# The flat index is split into x0 = index % 15680 and x1 = index // 15680;
# element `index` of the input is written to output offset x0 + 31360 * x1.
# In other words a densely packed input (15680 elements per row) is copied into
# the first 15680 slots of each 31360-element output row -- presumably one half
# of a channel-wise concatenation; verify against the caller.
# `xmask` is computed but unused: every load/store passes `None` as the mask.
# NOTE(review): this relies on the launch grid covering exactly 2007040
# elements -- confirm the chosen XBLOCK divides that count.
# The embedded source also defines a benchmark harness (`get_args`, `call`,
# `benchmark_all_configs`) and a `__main__` section that prints the measured
# time and bandwidth.
triton_poi_fused_cat_177 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[2097152], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_cat_177(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 2007040
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x2 = xindex
    x0 = xindex % 15680
    x1 = (xindex // 15680)
    tmp0 = tl.load(in_ptr0 + (x2), None).to(tl.float32)
    tl.store(out_ptr0 + (x0 + (31360*x1) + tl.zeros([XBLOCK], tl.int32)), tmp0, None)
def get_args():
    arg_0 = rand_strided((128, 80, 14, 14), (15680, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 80, 14, 14), (31360, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused_cat_177.run(*args, 2007040, grid=grid(2007040), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused_cat_177.benchmark_all_configs(*args, 2007040, grid=grid(2007040))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/4x/c4xzenlwjgchxhkagftgwbnakluheaf3fbwuqeozpqdbo7czlfp4.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.add
# aten._native_batch_norm_legit_functional => add_199, add_202, convert_element_type_256, convert_element_type_257, mul_306, mul_312, rsqrt_38, sub_38, var_mean_38
# aten.add => add_203
#
# Pointwise kernel over 4014080 elements (hard-coded in the kernel body).
# With i the flat index and c = (i // 196) % 160, it computes
#     out_ptr0[i] = (in_ptr0[i] - in_ptr1[c])
#                   * rsqrt(in_ptr2[c] / 25088.0 + 1e-05)
#                   * in_ptr3[c] + in_ptr4[c]
#                   + in_ptr5[i]
# in_ptr0, in_ptr5 and out_ptr0 are fp16 pointers, in_ptr1..in_ptr4 are fp32
# (per the `signature` metadata); the arithmetic is done on fp32 values.
# Presumably in_ptr1 is the per-channel mean, in_ptr2 the per-channel sum of
# squared deviations (hence the division by 25088), and in_ptr3/in_ptr4 the
# affine weight/bias -- verify against the kernels that produce them.
# `xmask` is computed but unused: every load/store passes `None` as the mask.
# The embedded source also defines a benchmark harness (`get_args`, `call`,
# `benchmark_all_configs`) and a `__main__` section that prints the measured
# time and bandwidth.
triton_poi_fused__native_batch_norm_legit_functional_add_178 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp16', 6: '*fp16', 7: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7), equal_to_1=())]})
@triton.jit
def triton_poi_fused__native_batch_norm_legit_functional_add_178(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 4014080
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x3 = xindex
    x1 = (xindex // 196) % 160
    tmp0 = tl.load(in_ptr0 + (x3), None).to(tl.float32)
    tmp2 = tl.load(in_ptr1 + (x1), None)
    tmp4 = tl.load(in_ptr2 + (x1), None)
    tmp11 = tl.load(in_ptr3 + (x1), None)
    tmp13 = tl.load(in_ptr4 + (x1), None)
    tmp16 = tl.load(in_ptr5 + (x3), None).to(tl.float32)
    tmp1 = tmp0.to(tl.float32)
    tmp3 = tmp1 - tmp2
    tmp5 = 25088.0
    tmp6 = tmp4 / tmp5
    tmp7 = 1e-05
    tmp8 = tmp6 + tmp7
    tmp9 = tl.math.rsqrt(tmp8)
    tmp10 = tmp3 * tmp9
    tmp12 = tmp10 * tmp11
    tmp14 = tmp12 + tmp13
    tmp15 = tmp14.to(tl.float32)
    tmp17 = tmp15 + tmp16
    tl.store(out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp17, None)
def get_args():
    arg_0 = rand_strided((128, 160, 14, 14), (31360, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((1, 160, 1, 1), (160, 1, 160, 160), device='cuda:0', dtype=torch.float32)
    arg_2 = rand_strided((1, 160, 1, 1), (160, 1, 160, 160), device='cuda:0', dtype=torch.float32)
    arg_3 = rand_strided((160,), (1,), device='cuda:0', dtype=torch.float32)
    arg_4 = rand_strided((160,), (1,), device='cuda:0', dtype=torch.float32)
    arg_5 = rand_strided((128, 160, 14, 14), (31360, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_6 = rand_strided((128, 160, 14, 14), (31360, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__native_batch_norm_legit_functional_add_178.run(*args, 4014080, grid=grid(4014080), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__native_batch_norm_legit_functional_add_178.benchmark_all_configs(*args, 4014080, grid=grid(4014080))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/xt/cxt5mn4i2rohwchtz75fbvbykzhjdnjrf3qopbwsyyqyu5jqlzr6.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_306
#
# Pointwise kernel over 153600 elements (hard-coded in the kernel body). Each
# lane loads one value through the fp32 input pointer and stores it through the
# fp16 output pointer (dtypes per the `signature` metadata). The fp16
# conversion is presumably done by `tl.store` casting to the pointer's element
# type -- confirm against the Triton docs.
# `xmask` is computed but unused: the load and store pass `None` as the mask.
# NOTE(review): this relies on the launch grid covering exactly 153600
# elements -- confirm the chosen XBLOCK divides that count.
# The embedded source also defines a benchmark harness (`get_args`, `call`,
# `benchmark_all_configs`) and a `__main__` section that prints the measured
# time and bandwidth.
triton_poi_fused__to_copy_179 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[262144], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_179(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 153600
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), None)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, None)
def get_args():
    arg_0 = rand_strided((960, 160, 1, 1), (160, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((960, 160, 1, 1), (160, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_179.run(*args, 153600, grid=grid(153600), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_179.benchmark_all_configs(*args, 153600, grid=grid(153600))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/g5/cg527ugqaap3ndd26uaeekojscsqxex2xmpgnu55nr6zwyzt6pfr.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => add_237, add_238, add_239, convert_element_type_307, mul_364, mul_365, mul_366, mul_367, mul_368, rsqrt_45, squeeze_136, var_mean_45
#
# Reduction kernel: 960 rows (x0) by 25088 reduced elements each, both counts
# hard-coded in the kernel body. Reduced element r of row x0 is read from
# in_ptr0 at offset (r % 196) + 196 * x0 + 188160 * (r // 196).
# Two passes are made over the same data, RBLOCK columns at a time:
#   pass 1: s1 = sum(v); mean = s1 / 25088.0
#       in_out_ptr0[x0] = mean
#       out_ptr0[x0]    = mean * 0.1 + in_ptr1[x0] * 0.9
#   pass 2: s2 = sum((v - mean) ** 2)
#       out_ptr1[x0] = s2
#       out_ptr2[x0] = rsqrt(s2 / 25088.0 + 1e-05)
#       out_ptr3[x0] = (s2 / 25088.0) * 1.0000398612827361 * 0.1
#                      + in_ptr2[x0] * 0.9
# Masked-out lanes load 0 and are excluded from the accumulators by `tl.where`.
# Presumably 0.1/0.9 is the running-statistics momentum update and
# 1.0000398612827361 (~= 25088/25087) the unbiased-variance correction --
# verify against the BatchNorm configuration.
# `in_out_ptr0` is listed in `mutated_arg_names`; the visible kernel body only
# writes to it. The benchmark section accordingly passes `num_in_out_args=1`.
# The embedded source also defines a benchmark harness (`get_args`, `call`,
# `benchmark_all_configs`) and a `__main__` section that prints the measured
# time and bandwidth.
triton_red_fused__native_batch_norm_legit_functional_180 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@reduction(
    size_hints=[1024, 32768],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*fp32', 1: '*fp16', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: 'i32', 9: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_180(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, out_ptr0, out_ptr1, out_ptr2, out_ptr3, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
    xnumel = 960
    rnumel = 25088
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rbase = tl.arange(0, RBLOCK)[None, :]
    x0 = xindex
    _tmp2 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
    for roffset in range(0, rnumel, RBLOCK):
        rindex = roffset + rbase
        rmask = rindex < rnumel
        r1 = rindex % 196
        r2 = (rindex // 196)
        tmp0 = tl.load(in_ptr0 + (r1 + (196*x0) + (188160*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp1 = tmp0.to(tl.float32)
        _tmp2 = tl.where(rmask & xmask, _tmp2 + tmp1, _tmp2)
    tmp2 = tl.sum(_tmp2, 1)[:, None]
    tmp7 = tl.load(in_ptr1 + (x0), xmask)
    tmp3 = 25088.0
    tmp4 = tmp2 / tmp3
    tmp5 = 0.1
    tmp6 = tmp4 * tmp5
    tmp8 = 0.9
    tmp9 = tmp7 * tmp8
    tmp10 = tmp6 + tmp9
    tl.store(in_out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp4, xmask)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp10, xmask)
    _tmp15 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
    for roffset in range(0, rnumel, RBLOCK):
        rindex = roffset + rbase
        rmask = rindex < rnumel
        r1 = rindex % 196
        r2 = (rindex // 196)
        tmp11 = tl.load(in_ptr0 + (r1 + (196*x0) + (188160*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp12 = tmp11.to(tl.float32)
        tmp13 = tmp12 - tmp4
        tmp14 = tmp13 * tmp13
        _tmp15 = tl.where(rmask & xmask, _tmp15 + tmp14, _tmp15)
    tmp15 = tl.sum(_tmp15, 1)[:, None]
    tl.store(out_ptr1 + x0, tmp15, xmask)
    tmp25 = tl.load(in_ptr2 + (x0), xmask)
    tmp16 = 25088.0
    tmp17 = tmp15 / tmp16
    tmp18 = 1e-05
    tmp19 = tmp17 + tmp18
    tmp20 = tl.math.rsqrt(tmp19)
    tmp21 = 1.0000398612827361
    tmp22 = tmp17 * tmp21
    tmp23 = 0.1
    tmp24 = tmp22 * tmp23
    tmp26 = 0.9
    tmp27 = tmp25 * tmp26
    tmp28 = tmp24 + tmp27
    tl.store(out_ptr2 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp20, xmask)
    tl.store(out_ptr3 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp28, xmask)
def get_args():
    arg_0 = rand_strided((1, 960, 1, 1), (960, 1, 960, 960), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((128, 960, 14, 14), (188160, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_2 = rand_strided((960,), (1,), device='cuda:0', dtype=torch.float32)
    arg_3 = rand_strided((960,), (1,), device='cuda:0', dtype=torch.float32)
    arg_4 = rand_strided((960,), (1,), device='cuda:0', dtype=torch.float32)
    arg_5 = rand_strided((1, 960, 1, 1), (960, 1, 960, 960), device='cuda:0', dtype=torch.float32)
    arg_6 = rand_strided((960,), (1,), device='cuda:0', dtype=torch.float32)
    arg_7 = rand_strided((960,), (1,), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_red_fused__native_batch_norm_legit_functional_180.run(*args, 960, 25088, grid=grid(960), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_red_fused__native_batch_norm_legit_functional_180.benchmark_all_configs(*args, 960, 25088, grid=grid(960))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/dy/cdyo4fykv4n564s3r3eahdxt2kdz67oeovon5bt5bw2lnq6iiowx.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.add, aten.clone, aten.fill, aten.mul, aten.sigmoid, aten.sub
# aten._native_batch_norm_legit_functional => add_237, add_240, convert_element_type_307, convert_element_type_308, mul_363, mul_369, rsqrt_45, sub_45, var_mean_45
# aten.add => add_322
# aten.clone => clone_36
# aten.fill => full_like_11
# aten.mul => mul_627, mul_628
# aten.sigmoid => sigmoid_75
# aten.sub => sub_121
#
# Pointwise kernel over 24084480 elements (hard-coded in the kernel body).
# With i the flat index and c = (i // 196) % 960, it computes
#     y = (in_ptr0[i] - in_ptr1[c])
#         * rsqrt(in_ptr2[c] / 25088.0 + 1e-05)
#         * in_ptr3[c] + in_ptr4[c]
#     s = sigmoid(y)
#     out_ptr0[i] = y
#     out_ptr1[i] = s * (y * (1.0 - s) + 1.0)
# in_ptr0, out_ptr0 and out_ptr1 are fp16 pointers, in_ptr1..in_ptr4 are fp32
# (per the `signature` metadata); the arithmetic is done on fp32 values.
# The second output has the form of d/dy [y * sigmoid(y)] -- presumably saved
# for the backward pass; verify against the consumer.
# `xmask` is computed but unused: every load/store passes `None` as the mask.
# The embedded source also defines a benchmark harness (`get_args`, `call`,
# `benchmark_all_configs`) and a `__main__` section that prints the measured
# time and bandwidth.
triton_poi_fused__native_batch_norm_legit_functional_add_clone_fill_mul_sigmoid_sub_181 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[33554432], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp16', 6: '*fp16', 7: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7), equal_to_1=())]})
@triton.jit
def triton_poi_fused__native_batch_norm_legit_functional_add_clone_fill_mul_sigmoid_sub_181(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, out_ptr0, out_ptr1, xnumel, XBLOCK : tl.constexpr):
    xnumel = 24084480
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x3 = xindex
    x1 = (xindex // 196) % 960
    tmp0 = tl.load(in_ptr0 + (x3), None).to(tl.float32)
    tmp2 = tl.load(in_ptr1 + (x1), None)
    tmp4 = tl.load(in_ptr2 + (x1), None)
    tmp11 = tl.load(in_ptr3 + (x1), None)
    tmp13 = tl.load(in_ptr4 + (x1), None)
    tmp1 = tmp0.to(tl.float32)
    tmp3 = tmp1 - tmp2
    tmp5 = 25088.0
    tmp6 = tmp4 / tmp5
    tmp7 = 1e-05
    tmp8 = tmp6 + tmp7
    tmp9 = tl.math.rsqrt(tmp8)
    tmp10 = tmp3 * tmp9
    tmp12 = tmp10 * tmp11
    tmp14 = tmp12 + tmp13
    tmp15 = tmp14.to(tl.float32)
    tmp16 = tl.sigmoid(tmp15)
    tmp17 = 1.0
    tmp18 = tmp17 - tmp16
    tmp19 = tmp15 * tmp18
    tmp20 = tmp19 + tmp17
    tmp21 = tmp16 * tmp20
    tl.store(out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp15, None)
    tl.store(out_ptr1 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp21, None)
def get_args():
    arg_0 = rand_strided((128, 960, 14, 14), (188160, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((1, 960, 1, 1), (960, 1, 960, 960), device='cuda:0', dtype=torch.float32)
    arg_2 = rand_strided((1, 960, 1, 1), (960, 1, 960, 960), device='cuda:0', dtype=torch.float32)
    arg_3 = rand_strided((960,), (1,), device='cuda:0', dtype=torch.float32)
    arg_4 = rand_strided((960,), (1,), device='cuda:0', dtype=torch.float32)
    arg_5 = rand_strided((128, 960, 14, 14), (188160, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_6 = rand_strided((128, 960, 14, 14), (188160, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__native_batch_norm_legit_functional_add_clone_fill_mul_sigmoid_sub_181.run(*args, 24084480, grid=grid(24084480), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__native_batch_norm_legit_functional_add_clone_fill_mul_sigmoid_sub_181.benchmark_all_configs(*args, 24084480, grid=grid(24084480))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/3n/c3n35ckjp4qebx5kiwvfxztqxmlk7uwm2vsaphijnyencaajb4gh.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_311
#
# Pointwise kernel over 2160 elements (hard-coded in the kernel body). Each
# lane loads one value through the fp32 input pointer and stores it through the
# fp16 output pointer (dtypes per the `signature` metadata). The fp16
# conversion is presumably done by `tl.store` casting to the pointer's element
# type -- confirm against the Triton docs.
# Loads and stores are guarded by `xmask`, so lanes at or beyond 2160 are
# inactive.
# The embedded source also defines a benchmark harness (`get_args`, `call`,
# `benchmark_all_configs`) and a `__main__` section that prints the measured
# time and bandwidth.
triton_poi_fused__to_copy_182 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[4096], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_182(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 2160
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
    arg_0 = rand_strided((240, 1, 3, 3), (9, 9, 3, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((240, 1, 3, 3), (9, 9, 3, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_182.run(*args, 2160, grid=grid(2160), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_182.benchmark_all_configs(*args, 2160, grid=grid(2160))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/ow/cowqeeeuzgoqgwjjram7xczyaz465plzun33c7elszbys2k2zrzn.py
# Original ATen: aten.split_with_sizes
# aten.split_with_sizes => getitem_330
#
# Pointwise kernel over 6021120 fp16 elements (hard-coded in the kernel body).
# The flat output index is split into x0 = index % 47040 and
# x1 = index // 47040; the input is read at offset x0 + 188160 * x1, i.e. the
# first 47040 elements of every 188160-element input row. With the
# (128, 960, 14, 14) -> (128, 240, 14, 14) shapes used in `get_args` that
# corresponds to channels 0..239.
# Each value v is transformed to v * sigmoid(v) in fp32 and written to the
# densely packed output at `index`.
# `xmask` is computed but unused: the load and store pass `None` as the mask.
# The embedded source also defines a benchmark harness (`get_args`, `call`,
# `benchmark_all_configs`) and a `__main__` section that prints the measured
# time and bandwidth.
triton_poi_fused_split_with_sizes_183 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[8388608], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_split_with_sizes_183(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 6021120
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex % 47040
    x1 = (xindex // 47040)
    x2 = xindex
    tmp0 = tl.load(in_ptr0 + (x0 + (188160*x1)), None).to(tl.float32)
    tmp1 = tmp0.to(tl.float32)
    tmp2 = tl.sigmoid(tmp1)
    tmp3 = tmp1 * tmp2
    tmp4 = tmp3.to(tl.float32)
    tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp4, None)
def get_args():
    arg_0 = rand_strided((128, 960, 14, 14), (188160, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 240, 14, 14), (47040, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused_split_with_sizes_183.run(*args, 6021120, grid=grid(6021120), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused_split_with_sizes_183.benchmark_all_configs(*args, 6021120, grid=grid(6021120))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/qx/cqxoadvgndjsqu4bltg3sq7l3fij2isfiihadwqnkb3ibmloxfyz.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_312
#
# Pointwise kernel over 6000 elements (hard-coded in the kernel body). Each
# lane loads one value through the fp32 input pointer and stores it through the
# fp16 output pointer (dtypes per the `signature` metadata). The fp16
# conversion is presumably done by `tl.store` casting to the pointer's element
# type -- confirm against the Triton docs.
# Loads and stores are guarded by `xmask`, so lanes at or beyond 6000 are
# inactive.
# The embedded source also defines a benchmark harness (`get_args`, `call`,
# `benchmark_all_configs`) and a `__main__` section that prints the measured
# time and bandwidth.
triton_poi_fused__to_copy_184 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[8192], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_184(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 6000
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
    arg_0 = rand_strided((240, 1, 5, 5), (25, 25, 5, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((240, 1, 5, 5), (25, 25, 5, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_184.run(*args, 6000, grid=grid(6000), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_184.benchmark_all_configs(*args, 6000, grid=grid(6000))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/vk/cvkedu5ynhqgwwdblwjx7rst7rz42xs7fl3vaof4iorofkc5cxam.py
# Original ATen: aten.split_with_sizes
# aten.split_with_sizes => getitem_335
#
# Pointwise kernel over 6021120 fp16 elements (hard-coded in the kernel body).
# The flat output index is split into x0 = index % 47040 and
# x1 = index // 47040; the input is read at offset 47040 + x0 + 188160 * x1,
# i.e. elements 47040..94079 of every 188160-element input row. With the
# (128, 960, 14, 14) -> (128, 240, 14, 14) shapes used in `get_args` that
# corresponds to channels 240..479.
# Each value v is transformed to v * sigmoid(v) in fp32 and written to the
# densely packed output at `index`.
# `xmask` is computed but unused: the load and store pass `None` as the mask.
# The embedded source also defines a benchmark harness (`get_args`, `call`,
# `benchmark_all_configs`) and a `__main__` section that prints the measured
# time and bandwidth.
triton_poi_fused_split_with_sizes_185 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[8388608], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_split_with_sizes_185(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 6021120
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex % 47040
    x1 = (xindex // 47040)
    x2 = xindex
    tmp0 = tl.load(in_ptr0 + (47040 + x0 + (188160*x1)), None).to(tl.float32)
    tmp1 = tmp0.to(tl.float32)
    tmp2 = tl.sigmoid(tmp1)
    tmp3 = tmp1 * tmp2
    tmp4 = tmp3.to(tl.float32)
    tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp4, None)
def get_args():
    arg_0 = rand_strided((128, 960, 14, 14), (188160, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 240, 14, 14), (47040, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused_split_with_sizes_185.run(*args, 6021120, grid=grid(6021120), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused_split_with_sizes_185.benchmark_all_configs(*args, 6021120, grid=grid(6021120))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/nv/cnv3es7i7p4naxsxsc56c2w7slsaxha5zh6gcrrgizefpo7zps5s.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_313
#
# Pointwise kernel over 11760 elements (hard-coded in the kernel body). Each
# lane loads one value through the fp32 input pointer and stores it through the
# fp16 output pointer (dtypes per the `signature` metadata). The fp16
# conversion is presumably done by `tl.store` casting to the pointer's element
# type -- confirm against the Triton docs.
# Loads and stores are guarded by `xmask`, so lanes at or beyond 11760 are
# inactive.
# The embedded source also defines a benchmark harness (`get_args`, `call`,
# `benchmark_all_configs`) and a `__main__` section that prints the measured
# time and bandwidth.
triton_poi_fused__to_copy_186 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[16384], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_186(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 11760
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
    arg_0 = rand_strided((240, 1, 7, 7), (49, 49, 7, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((240, 1, 7, 7), (49, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_186.run(*args, 11760, grid=grid(11760), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_186.benchmark_all_configs(*args, 11760, grid=grid(11760))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/3q/c3qkjjd4hxw6ylyfib2kl5k6u5atp2kcpplda3gzr6acmswhzitj.py
# Original ATen: aten.split_with_sizes
# aten.split_with_sizes => getitem_340
#
# Pointwise kernel over 6021120 fp16 elements (hard-coded in the kernel body).
# The flat output index is split into x0 = index % 47040 and
# x1 = index // 47040; the input is read at offset 94080 + x0 + 188160 * x1,
# i.e. elements 94080..141119 of every 188160-element input row. With the
# (128, 960, 14, 14) -> (128, 240, 14, 14) shapes used in `get_args` that
# corresponds to channels 480..719.
# Each value v is transformed to v * sigmoid(v) in fp32 and written to the
# densely packed output at `index`.
# `xmask` is computed but unused: the load and store pass `None` as the mask.
# The embedded source also defines a benchmark harness (`get_args`, `call`,
# `benchmark_all_configs`) and a `__main__` section that prints the measured
# time and bandwidth.
triton_poi_fused_split_with_sizes_187 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[8388608], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_split_with_sizes_187(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 6021120
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex % 47040
    x1 = (xindex // 47040)
    x2 = xindex
    tmp0 = tl.load(in_ptr0 + (94080 + x0 + (188160*x1)), None).to(tl.float32)
    tmp1 = tmp0.to(tl.float32)
    tmp2 = tl.sigmoid(tmp1)
    tmp3 = tmp1 * tmp2
    tmp4 = tmp3.to(tl.float32)
    tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp4, None)
def get_args():
    arg_0 = rand_strided((128, 960, 14, 14), (188160, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 240, 14, 14), (47040, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused_split_with_sizes_187.run(*args, 6021120, grid=grid(6021120), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused_split_with_sizes_187.benchmark_all_configs(*args, 6021120, grid=grid(6021120))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/l7/cl7v5hh5jck72nsc2u3t2ieraqxarpadxdpy5nt2hkbbqrgcinn7.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_314
#
# Pointwise kernel over 19440 elements (hard-coded in the kernel body). Each
# lane loads one value through the fp32 input pointer and stores it through the
# fp16 output pointer (dtypes per the `signature` metadata). The fp16
# conversion is presumably done by `tl.store` casting to the pointer's element
# type -- confirm against the Triton docs.
# Loads and stores are guarded by `xmask`, so lanes at or beyond 19440 are
# inactive.
# The embedded source also defines a benchmark harness (`get_args`, `call`,
# `benchmark_all_configs`) and a `__main__` section that prints the measured
# time and bandwidth.
triton_poi_fused__to_copy_188 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[32768], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_188(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 19440
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
    arg_0 = rand_strided((240, 1, 9, 9), (81, 81, 9, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((240, 1, 9, 9), (81, 81, 9, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_188.run(*args, 19440, grid=grid(19440), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_188.benchmark_all_configs(*args, 19440, grid=grid(19440))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/xm/cxm7drxzaido67j4aqf4vqqzkyw7ffgx32ygj3ddi3kmzigoauhg.py
# Original ATen: aten.split_with_sizes
# aten.split_with_sizes => getitem_345
# Passes the Triton source text below to `async_compile.triton` and binds the
# result to `triton_poi_fused_split_with_sizes_189`.
#
# What the embedded kernel does (read from the kernel text itself):
#   * one-dimensional pointwise kernel; `xnumel` is overwritten with the
#     constant 6021120 (= 128 * 47040, matching the (128, 240, 14, 14) output
#     built by the embedded `get_args`);
#   * splits the flat index into `x0 = xindex % 47040` and
#     `x1 = xindex // 47040`, then reads
#     `in_ptr0[141120 + x0 + 188160 * x1]`. With the (128, 960, 14, 14) input
#     and strides (188160, 196, 14, 1) from `get_args`, the offset
#     141120 = 720 * 196 addresses channels 720..959 of each batch row;
#   * computes `x * sigmoid(x)` in float32 and stores it at flat index `x2`
#     of `out_ptr0`. Both pointers are declared `*fp16` in the `signature`
#     metadata, so the narrowing back to fp16 presumably happens implicitly in
#     `tl.store` -- TODO confirm.
#   * Although the header comment names only `aten.split_with_sizes`, the
#     kernel text also contains the sigmoid-and-multiply step.
#
# NOTE(review): `xmask` is computed but the load and the store both pass `None`
# as the mask, so every launched lane touches memory. Presumably this is safe
# because the block-size configs divide 6021120 evenly -- TODO confirm.
#
# The string also carries a small harness (`get_args`, `call`,
# `benchmark_all_configs`, and a `__main__` section that times `call` with
# `do_bench` and prints ms / GB / GB/s).
#
# NOTE(review): the string literal is consumed verbatim at runtime, so its
# contents are left untouched here. The kernel text shows no indentation in
# this copy; presumably it was lost when the file was copied -- confirm against
# the generated file named in the "kernel path" comment.
triton_poi_fused_split_with_sizes_189 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[8388608], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_split_with_sizes_189(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 6021120
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex % 47040
x1 = (xindex // 47040)
x2 = xindex
tmp0 = tl.load(in_ptr0 + (141120 + x0 + (188160*x1)), None).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
tmp2 = tl.sigmoid(tmp1)
tmp3 = tmp1 * tmp2
tmp4 = tmp3.to(tl.float32)
tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp4, None)
def get_args():
arg_0 = rand_strided((128, 960, 14, 14), (188160, 196, 14, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 240, 14, 14), (47040, 196, 14, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_split_with_sizes_189.run(*args, 6021120, grid=grid(6021120), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_split_with_sizes_189.benchmark_all_configs(*args, 6021120, grid=grid(6021120))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/ny/cnyr36xeex2s2grdckurzoliwfqkdakrsvrmgx5fd6rzjncjxedb.py
# Original ATen: aten.cat
# aten.cat => cat_34
# Passes the Triton source text below to `async_compile.triton` and binds the
# result to `triton_poi_fused_cat_190`.
#
# What the embedded kernel does (read from the kernel text itself):
#   * one-dimensional pointwise copy; `xnumel` is overwritten with the
#     constant 1505280 (= 128 * 11760, matching the (128, 240, 7, 7) tensors
#     built by the embedded `get_args`);
#   * reads `in_ptr0` at the flat index `x2`, i.e. from a densely packed
#     source (batch stride 11760 in `get_args`);
#   * writes to `out_ptr0[x0 + 47040 * x1]` with `x0 = xindex % 11760` and
#     `x1 = xindex // 11760`, so consecutive batch rows land 47040 elements
#     apart. Presumably the destination is a 240-channel window inside a wider
#     concatenation buffer (47040 = 960 * 49) -- TODO confirm with the caller.
#   * The loaded value is widened with `.to(tl.float32)`; both pointers are
#     declared `*fp16` in the `signature` metadata, so the narrowing
#     presumably happens implicitly in `tl.store` -- TODO confirm.
#
# NOTE(review): `xmask` is computed but the load and the store both pass `None`
# as the mask, so every launched lane touches memory. Presumably this is safe
# because the block-size configs divide 1505280 evenly -- TODO confirm.
#
# The string also carries a small harness (`get_args`, `call`,
# `benchmark_all_configs`, and a `__main__` section that times `call` with
# `do_bench` and prints ms / GB / GB/s).
#
# NOTE(review): the string literal is consumed verbatim at runtime, so its
# contents are left untouched here. The kernel text shows no indentation in
# this copy; presumably it was lost when the file was copied -- confirm against
# the generated file named in the "kernel path" comment.
triton_poi_fused_cat_190 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[2097152], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_cat_190(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 1505280
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x2 = xindex
x0 = xindex % 11760
x1 = (xindex // 11760)
tmp0 = tl.load(in_ptr0 + (x2), None).to(tl.float32)
tl.store(out_ptr0 + (x0 + (47040*x1) + tl.zeros([XBLOCK], tl.int32)), tmp0, None)
def get_args():
arg_0 = rand_strided((128, 240, 7, 7), (11760, 49, 7, 1), device='cuda:0', dtype=torch.float16)
arg_1 = rand_strided((128, 240, 7, 7), (47040, 49, 7, 1), device='cuda:0', dtype=torch.float16)
return arg_0, arg_1,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_poi_fused_cat_190.run(*args, 1505280, grid=grid(1505280), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_poi_fused_cat_190.benchmark_all_configs(*args, 1505280, grid=grid(1505280))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/z6/cz6ieqk3sfzltam3yeijfwdqrtao7w4zqdsqj32cpu6zoxg3irdq.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => add_242, add_243, add_244, convert_element_type_315, mul_372, mul_373, mul_374, mul_375, mul_376, rsqrt_46, squeeze_139, var_mean_46
# Passes the Triton source text below to `async_compile.triton` and binds the
# result to `triton_red_fused__native_batch_norm_legit_functional_191`.
#
# What the embedded kernel does (read from the kernel text itself):
#   * two-dimensional reduction kernel: `xnumel = 960` rows indexed by `x0`,
#     `rnumel = 6272` reduced elements per row, walked in `RBLOCK` chunks;
#   * the reduction index is split as `r1 = rindex % 49`, `r2 = rindex // 49`
#     and `in_ptr0` is read at `r1 + 49 * x0 + 47040 * r2`. With the
#     (128, 960, 7, 7) fp16 tensor from `get_args`, that is one channel `x0`
#     across all 128 batch entries and 7 * 7 positions (6272 = 128 * 49);
#   * pass 1 accumulates the sum in float32 and forms
#     `tmp4 = sum / 6272.0`, which is stored to `in_out_ptr0`;
#     `tmp10 = tmp4 * 0.1 + in_ptr1 * 0.9` is stored to `out_ptr0`
#     (looks like a running-mean update with momentum 0.1 -- TODO confirm);
#   * pass 2 re-reads the same elements, accumulates `(x - tmp4) ** 2`, and
#     stores the raw sum `tmp15` to `out_ptr1`;
#   * `tmp17 = tmp15 / 6272.0`, `tmp20 = rsqrt(tmp17 + 1e-05)` is stored to
#     `out_ptr2`;
#   * `tmp28 = tmp17 * 1.0001594642002871 * 0.1 + in_ptr2 * 0.9` is stored to
#     `out_ptr3`. The constant numerically matches 6272 / 6271, so this is
#     presumably the biased-to-unbiased variance correction feeding a
#     running-variance update -- TODO confirm.
#   * Masked-out lanes load `other=0` and are excluded from the accumulators
#     by the `tl.where(rmask & xmask, ...)` guards.
#
# Argument order in the embedded `get_args` follows the kernel signature:
# `in_out_ptr0`, `in_ptr0`, `in_ptr1`, `in_ptr2`, `out_ptr0` .. `out_ptr3`.
# The `meta` dict lists `in_out_ptr0` under `mutated_arg_names`, and the
# `__main__` section passes `num_in_out_args=1` to `get_num_bytes`.
#
# The string also carries the usual harness (`get_args`, `call`,
# `benchmark_all_configs`, and a `__main__` section that times `call` with
# `do_bench` and prints ms / GB / GB/s).
#
# NOTE(review): the string literal is consumed verbatim at runtime, so its
# contents are left untouched here. The kernel text shows no indentation in
# this copy; presumably it was lost when the file was copied -- confirm against
# the generated file named in the "kernel path" comment.
triton_red_fused__native_batch_norm_legit_functional_191 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@reduction(
size_hints=[1024, 8192],
reduction_hint=ReductionHint.INNER,
filename=__file__,
meta={'signature': {0: '*fp32', 1: '*fp16', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: 'i32', 9: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_191(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, out_ptr0, out_ptr1, out_ptr2, out_ptr3, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
xnumel = 960
rnumel = 6272
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
xmask = xindex < xnumel
rbase = tl.arange(0, RBLOCK)[None, :]
x0 = xindex
_tmp2 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
for roffset in range(0, rnumel, RBLOCK):
rindex = roffset + rbase
rmask = rindex < rnumel
r1 = rindex % 49
r2 = (rindex // 49)
tmp0 = tl.load(in_ptr0 + (r1 + (49*x0) + (47040*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp1 = tmp0.to(tl.float32)
_tmp2 = tl.where(rmask & xmask, _tmp2 + tmp1, _tmp2)
tmp2 = tl.sum(_tmp2, 1)[:, None]
tmp7 = tl.load(in_ptr1 + (x0), xmask)
tmp3 = 6272.0
tmp4 = tmp2 / tmp3
tmp5 = 0.1
tmp6 = tmp4 * tmp5
tmp8 = 0.9
tmp9 = tmp7 * tmp8
tmp10 = tmp6 + tmp9
tl.store(in_out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp4, xmask)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp10, xmask)
_tmp15 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
for roffset in range(0, rnumel, RBLOCK):
rindex = roffset + rbase
rmask = rindex < rnumel
r1 = rindex % 49
r2 = (rindex // 49)
tmp11 = tl.load(in_ptr0 + (r1 + (49*x0) + (47040*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
tmp12 = tmp11.to(tl.float32)
tmp13 = tmp12 - tmp4
tmp14 = tmp13 * tmp13
_tmp15 = tl.where(rmask & xmask, _tmp15 + tmp14, _tmp15)
tmp15 = tl.sum(_tmp15, 1)[:, None]
tl.store(out_ptr1 + x0, tmp15, xmask)
tmp25 = tl.load(in_ptr2 + (x0), xmask)
tmp16 = 6272.0
tmp17 = tmp15 / tmp16
tmp18 = 1e-05
tmp19 = tmp17 + tmp18
tmp20 = tl.math.rsqrt(tmp19)
tmp21 = 1.0001594642002871
tmp22 = tmp17 * tmp21
tmp23 = 0.1
tmp24 = tmp22 * tmp23
tmp26 = 0.9
tmp27 = tmp25 * tmp26
tmp28 = tmp24 + tmp27
tl.store(out_ptr2 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp20, xmask)
tl.store(out_ptr3 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp28, xmask)
def get_args():
arg_0 = rand_strided((1, 960, 1, 1), (960, 1, 960, 960), device='cuda:0', dtype=torch.float32)
arg_1 = rand_strided((128, 960, 7, 7), (47040, 49, 7, 1), device='cuda:0', dtype=torch.float16)
arg_2 = rand_strided((960,), (1,), device='cuda:0', dtype=torch.float32)
arg_3 = rand_strided((960,), (1,), device='cuda:0', dtype=torch.float32)
arg_4 = rand_strided((960,), (1,), device='cuda:0', dtype=torch.float32)
arg_5 = rand_strided((1, 960, 1, 1), (960, 1, 960, 960), device='cuda:0', dtype=torch.float32)
arg_6 = rand_strided((960,), (1,), device='cuda:0', dtype=torch.float32)
arg_7 = rand_strided((960,), (1,), device='cuda:0', dtype=torch.float32)
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7,
def call(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
stream0 = get_cuda_stream(0)
triton_red_fused__native_batch_norm_legit_functional_191.run(*args, 960, 6272, grid=grid(960), stream=stream0)
def benchmark_all_configs(args):
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
return triton_red_fused__native_batch_norm_legit_functional_191.benchmark_all_configs(*args, 960, 6272, grid=grid(960))
if __name__ == '__main__':
from torch._inductor.utils import get_num_bytes
from triton.testing import do_bench
args = get_args()
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9
gb_per_s = num_gb / (ms / 1e3)
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/6y/c6ybh5xdfuwq6wdolxlmurgpxtgtn6q5shvsfwt5gf4rdpz7tas7.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.mean, aten.silu
# aten._native_batch_norm_legit_functional => add_242, add_245, convert_element_type_315, convert_element_type_316, mul_371, mul_377, rsqrt_46, sub_46, var_mean_46
# aten.mean => mean_12
# aten.silu => convert_element_type_317, convert_element_type_318, mul_378, sigmoid_49
triton_per_fused__native_batch_norm_legit_functional_mean_silu_192 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import persistent_reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid