Created
April 10, 2023 20:19
-
-
Save shunting314/c2a4d8a28b00fcb5586d0e9d9bf77f9f to your computer and use it in GitHub Desktop.
This file has been truncated, but you can view the full file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Wrapper-module preamble: standard-library, PyTorch and Triton imports plus
# the module-level handles that the kernel definitions in this file rely on.
from ctypes import c_void_p, c_long
import torch
import math
import random
import os
import tempfile
from torch._inductor.utils import maybe_profile
from torch import empty_strided, as_strided, device
from torch._inductor.codecache import AsyncCompile
from torch._inductor.select_algorithm import extern_kernels

# Short alias for the ATen operator namespace.
aten = torch.ops.aten
# Bound once at module level so later code can call it by its short name.
assert_size_stride = torch._C._dynamo.guards.assert_size_stride
# Every `async_compile.triton(...)` call in this file goes through this instance.
async_compile = AsyncCompile()

import triton
import triton.language as tl
from torch._inductor.triton_heuristics import grid, start_graph, end_graph
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
# kernel path: /tmp/torchinductor_shunting/cd/ccd7accawcy3jndrehd4geosuzoh6rxkzjlqnojxb5msgrcv7udi.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type
#
# Pointwise copy kernel over 864 elements (= 32*3*3*3, the tensor shape built
# in get_args()). The kernel body overwrites its `xnumel` argument with that
# constant. Each program instance handles XBLOCK consecutive indices and masks
# loads/stores with `xindex < xnumel`.
# The signature declares in_ptr0 as '*fp32' and out_ptr0 as '*fp16'; the loaded
# value goes through `.to(tl.float32)` before being stored.
# NOTE(review): the narrowing to fp16 presumably happens implicitly inside
# tl.store because of the pointer element type - confirm against Triton docs.
#
# The embedded source also defines get_args(), call(), benchmark_all_configs()
# and a `__main__` harness that times call() with do_bench and prints the
# elapsed ms, the GB moved and the resulting GB/s.
triton_poi_fused__to_copy_0 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[1024], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_0(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 864
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)


def get_args():
    arg_0 = rand_strided((32, 3, 3, 3), (27, 9, 3, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((32, 3, 3, 3), (27, 9, 3, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_0.run(*args, 864, grid=grid(864), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_0.benchmark_all_configs(*args, 864, grid=grid(864))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/4z/c4zfgpaaduweifmtujob2ufcjjkrkqa7mb5j2qaaysmstief35xb.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_1
#
# Pointwise copy kernel over 19267584 elements (= 128*3*224*224, the tensor
# shape built in get_args()). The kernel body overwrites its `xnumel` argument
# with that constant. `xmask` is computed but the load and store pass `None`
# as the mask, i.e. they are unmasked.
# The signature declares in_ptr0 as '*fp32' and out_ptr0 as '*fp16'; the loaded
# value goes through `.to(tl.float32)` before being stored.
# NOTE(review): the narrowing to fp16 presumably happens implicitly inside
# tl.store because of the pointer element type - confirm against Triton docs.
#
# The embedded source also defines get_args(), call(), benchmark_all_configs()
# and a `__main__` harness that times call() with do_bench and prints the
# elapsed ms, the GB moved and the resulting GB/s.
triton_poi_fused__to_copy_1 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[33554432], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_1(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 19267584
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), None)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, None)


def get_args():
    arg_0 = rand_strided((128, 3, 224, 224), (150528, 50176, 224, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((128, 3, 224, 224), (150528, 50176, 224, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_1.run(*args, 19267584, grid=grid(19267584), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_1.benchmark_all_configs(*args, 19267584, grid=grid(19267584))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/4v/c4vmg7ssybcn2qvzgu3tqbkt62uopduyelvkcwxaqv2fc4glkb57.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => convert_element_type_2, var_mean
#
# Looped reduction kernel: xnumel = 448 outputs, each the sum of rnumel =
# 114688 input elements (both arguments are overwritten by these constants in
# the kernel body). The output index is split as x0 = xindex % 32 and
# x1 = xindex // 32, so there are 448 / 32 = 14 partial sums per x0.
# For a given (x0, x1) the flat offset k = r2 + 114688*x1 is decomposed into
# k // 12544, (k // 112) % 112 and k % 112 and combined with 12544*x0 to
# address the (128, 32, 112, 112) fp16 input built in get_args(); the 14
# chunks together cover 14 * 114688 = 1605632 = 128*112*112 elements per x0.
# Values are loaded as fp16, cast to fp32, accumulated in an [XBLOCK, RBLOCK]
# fp32 tile under `rmask & xmask`, then reduced along axis 1 and stored.
# NOTE(review): presumably the first stage of a two-stage per-channel mean for
# batch norm (x0 being the channel) - confirm against the calling wrapper.
#
# The embedded source also defines get_args(), call(), benchmark_all_configs()
# and a `__main__` harness that times call() with do_bench and prints the
# elapsed ms, the GB moved and the resulting GB/s.
triton_red_fused__native_batch_norm_legit_functional_2 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@reduction(
    size_hints=[512, 131072],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32', 3: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_2(in_ptr0, out_ptr0, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
    xnumel = 448
    rnumel = 114688
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rbase = tl.arange(0, RBLOCK)[None, :]
    x0 = xindex % 32
    x1 = (xindex // 32)
    _tmp2 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
    x3 = xindex
    for roffset in range(0, rnumel, RBLOCK):
        rindex = roffset + rbase
        rmask = rindex < rnumel
        r2 = rindex
        tmp0 = tl.load(in_ptr0 + ((112*(((r2 + (114688*x1)) // 112) % 112)) + (12544*x0) + (401408*((r2 + (114688*x1)) // 12544)) + (r2 % 112)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp1 = tmp0.to(tl.float32)
        _tmp2 = tl.where(rmask & xmask, _tmp2 + tmp1, _tmp2)
    tmp2 = tl.sum(_tmp2, 1)[:, None]
    tl.store(out_ptr0 + x3, tmp2, xmask)


def get_args():
    arg_0 = rand_strided((128, 32, 112, 112), (401408, 12544, 112, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((1, 32, 1, 1, 14), (448, 1, 448, 448, 32), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_red_fused__native_batch_norm_legit_functional_2.run(*args, 448, 114688, grid=grid(448), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_red_fused__native_batch_norm_legit_functional_2.benchmark_all_configs(*args, 448, 114688, grid=grid(448))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/ax/caxxuenolffrdcs4fes5eiayfgrbnuyzkml277uyriuhejqsq7k4.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => add_2, convert_element_type_2, mul_1, mul_2, var_mean
#
# Persistent reduction kernel: xnumel = 32 outputs, each reducing rnumel = 14
# values in one RBLOCK = 16 tile (lanes with rindex >= 14 are masked to 0).
# For each x0 it loads in_ptr0[x0 + 32*r1], sums over r1 and divides by
# 1605632.0 (= 128*112*112), giving tmp5. Two results are written:
#   * in_out_ptr0[x0] <- tmp5. The kernel only writes this buffer, it never
#     reads it, although it is listed in 'mutated_arg_names'.
#   * out_ptr0[x0]    <- tmp5 * 0.1 + in_ptr1[x0] * 0.9
# NOTE(review): tmp5 is presumably the per-channel batch mean and out_ptr0 the
# updated running mean with momentum 0.1 - confirm against the wrapper.
#
# The embedded source also defines get_args(), call(), benchmark_all_configs()
# and a `__main__` harness that times call() with do_bench (counting one
# in/out argument in get_num_bytes) and prints ms, GB and GB/s.
triton_per_fused__native_batch_norm_legit_functional_3 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import persistent_reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@persistent_reduction(
    size_hints=[32, 16],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*fp32', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: 'i32', 5: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4), equal_to_1=())]}
)
@triton.jit
def triton_per_fused__native_batch_norm_legit_functional_3(in_out_ptr0, in_ptr0, in_ptr1, out_ptr0, xnumel, rnumel, XBLOCK : tl.constexpr):
    xnumel = 32
    rnumel = 14
    RBLOCK: tl.constexpr = 16
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rindex = tl.arange(0, RBLOCK)[None, :]
    rmask = rindex < rnumel
    r1 = rindex
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0 + (32*r1)), rmask & xmask, other=0)
    tmp8 = tl.load(in_ptr1 + (x0), xmask)
    tmp2 = tl.where(rmask & xmask, tmp0, 0)
    tmp3 = tl.sum(tmp2, 1)[:, None]
    tmp4 = 1605632.0
    tmp5 = tmp3 / tmp4
    tmp6 = 0.1
    tmp7 = tmp5 * tmp6
    tmp9 = 0.9
    tmp10 = tmp8 * tmp9
    tmp11 = tmp7 + tmp10
    tl.store(in_out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp5, xmask)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp11, xmask)


def get_args():
    arg_0 = rand_strided((1, 32, 1, 1), (32, 1, 32, 32), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((1, 32, 1, 1, 14), (448, 1, 448, 448, 32), device='cuda:0', dtype=torch.float32)
    arg_2 = rand_strided((32,), (1,), device='cuda:0', dtype=torch.float32)
    arg_3 = rand_strided((32,), (1,), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1, arg_2, arg_3,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_per_fused__native_batch_norm_legit_functional_3.run(*args, 32, 14, grid=grid(32), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_per_fused__native_batch_norm_legit_functional_3.benchmark_all_configs(*args, 32, 14, grid=grid(32))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/4s/c4sugoluux54xogsh3dgakggor26tpyllwcenrcu3hdxyyol5rhg.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => convert_element_type_2, var_mean
#
# Looped reduction kernel: xnumel = 448 outputs, each reducing rnumel = 114688
# input elements (both arguments are overwritten by these constants in the
# kernel body). Indexing: x0 = xindex % 32, x1 = xindex // 32, and the flat
# offset k = r2 + 114688*x1 is decomposed into k // 12544, (k // 112) % 112
# and k % 112 to address the (128, 32, 112, 112) fp16 input built in
# get_args().
# For every element it computes (value - in_ptr1[x0]) ** 2 in fp32 and
# accumulates it under `rmask & xmask`; the tile is then summed along axis 1
# and stored to out_ptr0[xindex].
# NOTE(review): in_ptr1 presumably holds the per-channel mean, which would make
# this the partial sum of squared deviations for the variance - confirm against
# the wrapper.
#
# The embedded source also defines get_args(), call(), benchmark_all_configs()
# and a `__main__` harness that times call() with do_bench and prints the
# elapsed ms, the GB moved and the resulting GB/s.
triton_red_fused__native_batch_norm_legit_functional_4 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@reduction(
    size_hints=[512, 131072],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*fp16', 1: '*fp32', 2: '*fp32', 3: 'i32', 4: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_4(in_ptr0, in_ptr1, out_ptr0, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
    xnumel = 448
    rnumel = 114688
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rbase = tl.arange(0, RBLOCK)[None, :]
    x0 = xindex % 32
    x1 = (xindex // 32)
    tmp2 = tl.load(in_ptr1 + (x0), xmask)
    _tmp5 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
    x3 = xindex
    for roffset in range(0, rnumel, RBLOCK):
        rindex = roffset + rbase
        rmask = rindex < rnumel
        r2 = rindex
        tmp0 = tl.load(in_ptr0 + ((112*(((r2 + (114688*x1)) // 112) % 112)) + (12544*x0) + (401408*((r2 + (114688*x1)) // 12544)) + (r2 % 112)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp1 = tmp0.to(tl.float32)
        tmp3 = tmp1 - tmp2
        tmp4 = tmp3 * tmp3
        _tmp5 = tl.where(rmask & xmask, _tmp5 + tmp4, _tmp5)
    tmp5 = tl.sum(_tmp5, 1)[:, None]
    tl.store(out_ptr0 + x3, tmp5, xmask)


def get_args():
    arg_0 = rand_strided((128, 32, 112, 112), (401408, 12544, 112, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((1, 32, 1, 1), (32, 1, 32, 32), device='cuda:0', dtype=torch.float32)
    arg_2 = rand_strided((1, 32, 1, 1, 14), (448, 1, 448, 448, 32), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1, arg_2,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_red_fused__native_batch_norm_legit_functional_4.run(*args, 448, 114688, grid=grid(448), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_red_fused__native_batch_norm_legit_functional_4.benchmark_all_configs(*args, 448, 114688, grid=grid(448))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/yz/cyzfzxq5m7mzh26evqlnxbs2f4624mxxzlbzx5v335w7yujg6hp5.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => add_1, add_3, convert_element_type_2, mul_3, mul_4, mul_5, rsqrt, squeeze_1, var_mean
#
# Persistent reduction kernel: xnumel = 32 outputs, each reducing rnumel = 14
# values in one RBLOCK = 16 tile (lanes with rindex >= 14 are masked to 0).
# For each x0 it sums in_ptr0[x0 + 32*r1] over r1 (tmp3) and derives
# tmp5 = tmp3 / 1605632.0 (1605632 = 128*112*112). Three results are written:
#   * out_ptr0[x0] <- tmp3, the undivided sum
#   * out_ptr1[x0] <- rsqrt(tmp5 + 1e-05)
#   * out_ptr2[x0] <- tmp5 * 1.0000006228081046 * 0.1 + in_ptr1[x0] * 0.9
# The factor 1.0000006228081046 equals 1605632 / 1605631, i.e. N / (N - 1).
# NOTE(review): presumably tmp5 is the biased batch variance, out_ptr1 the
# inverse standard deviation with eps = 1e-5, and out_ptr2 the running-variance
# update (unbiased estimate, momentum 0.1) - confirm against the wrapper.
#
# The embedded source also defines get_args(), call(), benchmark_all_configs()
# and a `__main__` harness that times call() with do_bench and prints the
# elapsed ms, the GB moved and the resulting GB/s.
triton_per_fused__native_batch_norm_legit_functional_5 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import persistent_reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@persistent_reduction(
    size_hints=[32, 16],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*fp32', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: 'i32', 6: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5), equal_to_1=())]}
)
@triton.jit
def triton_per_fused__native_batch_norm_legit_functional_5(in_ptr0, in_ptr1, out_ptr0, out_ptr1, out_ptr2, xnumel, rnumel, XBLOCK : tl.constexpr):
    xnumel = 32
    rnumel = 14
    RBLOCK: tl.constexpr = 16
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rindex = tl.arange(0, RBLOCK)[None, :]
    rmask = rindex < rnumel
    r1 = rindex
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0 + (32*r1)), rmask & xmask, other=0)
    tmp13 = tl.load(in_ptr1 + (x0), xmask)
    tmp2 = tl.where(rmask & xmask, tmp0, 0)
    tmp3 = tl.sum(tmp2, 1)[:, None]
    tmp4 = 1605632.0
    tmp5 = tmp3 / tmp4
    tmp6 = 1e-05
    tmp7 = tmp5 + tmp6
    tmp8 = tl.math.rsqrt(tmp7)
    tmp9 = 1.0000006228081046
    tmp10 = tmp5 * tmp9
    tmp11 = 0.1
    tmp12 = tmp10 * tmp11
    tmp14 = 0.9
    tmp15 = tmp13 * tmp14
    tmp16 = tmp12 + tmp15
    tl.store(out_ptr1 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp8, xmask)
    tl.store(out_ptr2 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp16, xmask)
    tl.store(out_ptr0 + x0, tmp3, xmask)


def get_args():
    arg_0 = rand_strided((1, 32, 1, 1, 14), (448, 1, 448, 448, 32), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((32,), (1,), device='cuda:0', dtype=torch.float32)
    arg_2 = rand_strided((1, 32, 1, 1), (32, 1, 32, 32), device='cuda:0', dtype=torch.float32)
    arg_3 = rand_strided((32,), (1,), device='cuda:0', dtype=torch.float32)
    arg_4 = rand_strided((32,), (1,), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1, arg_2, arg_3, arg_4,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_per_fused__native_batch_norm_legit_functional_5.run(*args, 32, 14, grid=grid(32), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_per_fused__native_batch_norm_legit_functional_5.benchmark_all_configs(*args, 32, 14, grid=grid(32))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/tp/ctp3rh47jmesldsykk2bkq62s2n65nizu7jrip6s664llvntz6jk.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.relu
# aten._native_batch_norm_legit_functional => add_1, add_4, convert_element_type_2, convert_element_type_3, mul, mul_6, rsqrt, sub, var_mean
# aten.relu => relu
#
# Pointwise kernel over 51380224 elements (= 128*32*112*112, the tensor shape
# built in get_args()); the body overwrites its `xnumel` argument with that
# constant. `xmask` is computed but all loads and the store pass `None` as the
# mask, i.e. they are unmasked.
# With x1 = (xindex // 12544) % 32 selecting the per-x1 parameters, each
# element is transformed in fp32 as
#   y = (in_ptr0[x3] - in_ptr1[x1]) * rsqrt(in_ptr2[x1] / 1605632.0 + 1e-05)
#       * in_ptr3[x1] + in_ptr4[x1]
# and then clamped from below at 0 by the nested tl.where (the outer
# `0 != 0` condition is constant-false, so only the inner where has effect).
# The result is stored to out_ptr0, declared '*fp16' in the signature.
# NOTE(review): in_ptr1..in_ptr4 presumably are mean, summed squared deviation,
# weight and bias of a batch norm - confirm against the wrapper.
#
# The embedded source also defines get_args(), call(), benchmark_all_configs()
# and a `__main__` harness that times call() with do_bench and prints the
# elapsed ms, the GB moved and the resulting GB/s.
triton_poi_fused__native_batch_norm_legit_functional_relu_6 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[67108864], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp16', 6: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6), equal_to_1=())]})
@triton.jit
def triton_poi_fused__native_batch_norm_legit_functional_relu_6(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 51380224
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x3 = xindex
    x1 = (xindex // 12544) % 32
    tmp0 = tl.load(in_ptr0 + (x3), None).to(tl.float32)
    tmp2 = tl.load(in_ptr1 + (x1), None)
    tmp4 = tl.load(in_ptr2 + (x1), None)
    tmp11 = tl.load(in_ptr3 + (x1), None)
    tmp13 = tl.load(in_ptr4 + (x1), None)
    tmp1 = tmp0.to(tl.float32)
    tmp3 = tmp1 - tmp2
    tmp5 = 1605632.0
    tmp6 = tmp4 / tmp5
    tmp7 = 1e-05
    tmp8 = tmp6 + tmp7
    tmp9 = tl.math.rsqrt(tmp8)
    tmp10 = tmp3 * tmp9
    tmp12 = tmp10 * tmp11
    tmp14 = tmp12 + tmp13
    tmp15 = tmp14.to(tl.float32)
    tmp16 = tl.where(0 != 0, 0, tl.where(0 > tmp15, 0, tmp15))
    tl.store(out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp16, None)


def get_args():
    arg_0 = rand_strided((128, 32, 112, 112), (401408, 12544, 112, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((1, 32, 1, 1), (32, 1, 32, 32), device='cuda:0', dtype=torch.float32)
    arg_2 = rand_strided((1, 32, 1, 1), (32, 1, 32, 32), device='cuda:0', dtype=torch.float32)
    arg_3 = rand_strided((32,), (1,), device='cuda:0', dtype=torch.float32)
    arg_4 = rand_strided((32,), (1,), device='cuda:0', dtype=torch.float32)
    arg_5 = rand_strided((128, 32, 112, 112), (401408, 12544, 112, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__native_batch_norm_legit_functional_relu_6.run(*args, 51380224, grid=grid(51380224), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__native_batch_norm_legit_functional_relu_6.benchmark_all_configs(*args, 51380224, grid=grid(51380224))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/bt/cbt6gouhluogn74n5bu4s6rx7uqiiw6lwhnqaggedwz4abkhco2b.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_4
#
# Pointwise copy kernel over 288 elements (= 32*1*3*3, the tensor shape built
# in get_args()). The kernel body overwrites its `xnumel` argument with that
# constant. Each program instance handles XBLOCK consecutive indices and masks
# loads/stores with `xindex < xnumel`.
# The signature declares in_ptr0 as '*fp32' and out_ptr0 as '*fp16'; the loaded
# value goes through `.to(tl.float32)` before being stored.
# NOTE(review): the narrowing to fp16 presumably happens implicitly inside
# tl.store because of the pointer element type - confirm against Triton docs.
#
# The embedded source also defines get_args(), call(), benchmark_all_configs()
# and a `__main__` harness that times call() with do_bench and prints the
# elapsed ms, the GB moved and the resulting GB/s.
triton_poi_fused__to_copy_7 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[512], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_7(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 288
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)


def get_args():
    arg_0 = rand_strided((32, 1, 3, 3), (9, 9, 3, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((32, 1, 3, 3), (9, 9, 3, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_7.run(*args, 288, grid=grid(288), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_7.benchmark_all_configs(*args, 288, grid=grid(288))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/tq/ctqzpnsuriwiat75pazkds4p5xe54pkhlftlkhx4o3jngunwgzub.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_7
#
# Pointwise copy kernel over 1024 elements (= 32*32*1*1, the tensor shape built
# in get_args()). The kernel body overwrites its `xnumel` argument with that
# constant. Each program instance handles XBLOCK consecutive indices and masks
# loads/stores with `xindex < xnumel`.
# The signature declares in_ptr0 as '*fp32' and out_ptr0 as '*fp16'; the loaded
# value goes through `.to(tl.float32)` before being stored.
# NOTE(review): the narrowing to fp16 presumably happens implicitly inside
# tl.store because of the pointer element type - confirm against Triton docs.
#
# The embedded source also defines get_args(), call(), benchmark_all_configs()
# and a `__main__` harness that times call() with do_bench and prints the
# elapsed ms, the GB moved and the resulting GB/s.
triton_poi_fused__to_copy_8 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[1024], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_8(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 1024
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)


def get_args():
    arg_0 = rand_strided((32, 32, 1, 1), (32, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((32, 32, 1, 1), (32, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_8.run(*args, 1024, grid=grid(1024), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_8.benchmark_all_configs(*args, 1024, grid=grid(1024))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/ma/cma4u56mzuidifjz56xk54ie6sgj6png7qekm2tx4cs7sle553zr.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.add
# aten._native_batch_norm_legit_functional => add_11, add_14, convert_element_type_8, convert_element_type_9, mul_14, mul_20, rsqrt_2, sub_2, var_mean_2
# aten.add => add_15
#
# Pointwise kernel over 51380224 elements (= 128*32*112*112, the tensor shape
# built in get_args()); the body overwrites its `xnumel` argument with that
# constant. `xmask` is computed but all loads and the store pass `None` as the
# mask, i.e. they are unmasked.
# With x1 = (xindex // 12544) % 32 selecting the per-x1 parameters, each
# element is computed in fp32 as
#   y = (in_ptr0[x3] - in_ptr1[x1]) * rsqrt(in_ptr2[x1] / 1605632.0 + 1e-05)
#       * in_ptr3[x1] + in_ptr4[x1]
#   out = y + in_ptr5[x3]
# where in_ptr0 and in_ptr5 are fp16 inputs cast to fp32 on load. The result
# is stored to out_ptr0, declared '*fp16' in the signature.
# NOTE(review): in_ptr1..in_ptr4 presumably are mean, summed squared deviation,
# weight and bias of a batch norm, and in_ptr5 a residual input - confirm
# against the wrapper.
#
# The embedded source also defines get_args(), call(), benchmark_all_configs()
# and a `__main__` harness that times call() with do_bench and prints the
# elapsed ms, the GB moved and the resulting GB/s.
triton_poi_fused__native_batch_norm_legit_functional_add_9 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[67108864], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp16', 6: '*fp16', 7: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7), equal_to_1=())]})
@triton.jit
def triton_poi_fused__native_batch_norm_legit_functional_add_9(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 51380224
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x3 = xindex
    x1 = (xindex // 12544) % 32
    tmp0 = tl.load(in_ptr0 + (x3), None).to(tl.float32)
    tmp2 = tl.load(in_ptr1 + (x1), None)
    tmp4 = tl.load(in_ptr2 + (x1), None)
    tmp11 = tl.load(in_ptr3 + (x1), None)
    tmp13 = tl.load(in_ptr4 + (x1), None)
    tmp16 = tl.load(in_ptr5 + (x3), None).to(tl.float32)
    tmp1 = tmp0.to(tl.float32)
    tmp3 = tmp1 - tmp2
    tmp5 = 1605632.0
    tmp6 = tmp4 / tmp5
    tmp7 = 1e-05
    tmp8 = tmp6 + tmp7
    tmp9 = tl.math.rsqrt(tmp8)
    tmp10 = tmp3 * tmp9
    tmp12 = tmp10 * tmp11
    tmp14 = tmp12 + tmp13
    tmp15 = tmp14.to(tl.float32)
    tmp17 = tmp15 + tmp16
    tl.store(out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp17, None)


def get_args():
    arg_0 = rand_strided((128, 32, 112, 112), (401408, 12544, 112, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((1, 32, 1, 1), (32, 1, 32, 32), device='cuda:0', dtype=torch.float32)
    arg_2 = rand_strided((1, 32, 1, 1), (32, 1, 32, 32), device='cuda:0', dtype=torch.float32)
    arg_3 = rand_strided((32,), (1,), device='cuda:0', dtype=torch.float32)
    arg_4 = rand_strided((32,), (1,), device='cuda:0', dtype=torch.float32)
    arg_5 = rand_strided((128, 32, 112, 112), (401408, 12544, 112, 1), device='cuda:0', dtype=torch.float16)
    arg_6 = rand_strided((128, 32, 112, 112), (401408, 12544, 112, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__native_batch_norm_legit_functional_add_9.run(*args, 51380224, grid=grid(51380224), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__native_batch_norm_legit_functional_add_9.benchmark_all_configs(*args, 51380224, grid=grid(51380224))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/mf/cmfq2hg7iduys7x4z6irkzuujylw3e7gj4ysdrcbo5cemukrg3t5.py | |
# Original ATen: aten.split_with_sizes | |
# aten.split_with_sizes => getitem_6 | |
# Triton source (passed as text to async_compile.triton) for a pointwise
# strided copy. For every flat output index x2 in [0, 25690112) the kernel
# computes x0 = x2 % 200704 and x1 = x2 // 200704, loads
# in_ptr0[x0 + 401408*x1] and stores it to out_ptr0[x2].
# With the buffers built by get_args() -- input (128, 32, 112, 112) with batch
# stride 401408, output (128, 16, 112, 112) with batch stride
# 200704 = 16*112*112 -- this copies the first 200704 elements of every batch
# row (channels 0-15) into a dense output.
# The xnumel argument is overwritten by the constant on the kernel's first line.
# NOTE(review): xmask is computed but every load/store passes None as the
# mask; presumably xnumel is a multiple of every XBLOCK the heuristics can
# choose -- confirm before reusing this kernel with another size.
# The embedded script also defines get_args/call/benchmark_all_configs and a
# __main__ section that times call() with do_bench and prints ms, GB and GB/s.
triton_poi_fused_split_with_sizes_10 = async_compile.triton(''' | |
import triton | |
import triton.language as tl | |
from torch._inductor.ir import ReductionHint | |
from torch._inductor.ir import TileHint | |
from torch._inductor.triton_heuristics import pointwise | |
from torch._inductor.utils import instance_descriptor | |
from torch._dynamo.testing import rand_strided | |
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
import torch | |
from torch._inductor.triton_heuristics import grid | |
@pointwise(size_hints=[33554432], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]}) | |
@triton.jit | |
def triton_poi_fused_split_with_sizes_10(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr): | |
xnumel = 25690112 | |
xoffset = tl.program_id(0) * XBLOCK | |
xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
xmask = xindex < xnumel | |
x0 = xindex % 200704 | |
x1 = (xindex // 200704) | |
x2 = xindex | |
tmp0 = tl.load(in_ptr0 + (x0 + (401408*x1)), None).to(tl.float32) | |
tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp0, None) | |
def get_args(): | |
arg_0 = rand_strided((128, 32, 112, 112), (401408, 12544, 112, 1), device='cuda:0', dtype=torch.float16) | |
arg_1 = rand_strided((128, 16, 112, 112), (200704, 12544, 112, 1), device='cuda:0', dtype=torch.float16) | |
return arg_0, arg_1, | |
def call(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
stream0 = get_cuda_stream(0) | |
triton_poi_fused_split_with_sizes_10.run(*args, 25690112, grid=grid(25690112), stream=stream0) | |
def benchmark_all_configs(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
return triton_poi_fused_split_with_sizes_10.benchmark_all_configs(*args, 25690112, grid=grid(25690112)) | |
if __name__ == '__main__': | |
from torch._inductor.utils import get_num_bytes | |
from triton.testing import do_bench | |
args = get_args() | |
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9 | |
gb_per_s = num_gb / (ms / 1e3) | |
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
''') | |
# kernel path: /tmp/torchinductor_shunting/lr/clrdqz4vueql2no5zrrsw4bgmmsgoov4oautycbtlc4q5ypfrpxe.py | |
# Original ATen: aten.split_with_sizes | |
# aten.split_with_sizes => getitem_7 | |
# Triton source (passed as text to async_compile.triton) for a pointwise
# strided copy with a constant source offset. For every flat output index x2
# in [0, 25690112) the kernel computes x0 = x2 % 200704 and
# x1 = x2 // 200704, loads in_ptr0[200704 + x0 + 401408*x1] and stores it to
# out_ptr0[x2].
# With the buffers built by get_args() -- input (128, 32, 112, 112) with batch
# stride 401408, output (128, 16, 112, 112) with batch stride 200704 -- the
# +200704 offset selects the second 200704-element half of every batch row
# (channels 16-31).
# The xnumel argument is overwritten by the constant on the kernel's first line.
# NOTE(review): xmask is computed but every load/store passes None as the
# mask; presumably xnumel is a multiple of every XBLOCK the heuristics can
# choose -- confirm before reusing this kernel with another size.
# The embedded script also defines get_args/call/benchmark_all_configs and a
# __main__ section that times call() with do_bench and prints ms, GB and GB/s.
triton_poi_fused_split_with_sizes_11 = async_compile.triton(''' | |
import triton | |
import triton.language as tl | |
from torch._inductor.ir import ReductionHint | |
from torch._inductor.ir import TileHint | |
from torch._inductor.triton_heuristics import pointwise | |
from torch._inductor.utils import instance_descriptor | |
from torch._dynamo.testing import rand_strided | |
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
import torch | |
from torch._inductor.triton_heuristics import grid | |
@pointwise(size_hints=[33554432], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]}) | |
@triton.jit | |
def triton_poi_fused_split_with_sizes_11(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr): | |
xnumel = 25690112 | |
xoffset = tl.program_id(0) * XBLOCK | |
xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
xmask = xindex < xnumel | |
x0 = xindex % 200704 | |
x1 = (xindex // 200704) | |
x2 = xindex | |
tmp0 = tl.load(in_ptr0 + (200704 + x0 + (401408*x1)), None).to(tl.float32) | |
tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp0, None) | |
def get_args(): | |
arg_0 = rand_strided((128, 32, 112, 112), (401408, 12544, 112, 1), device='cuda:0', dtype=torch.float16) | |
arg_1 = rand_strided((128, 16, 112, 112), (200704, 12544, 112, 1), device='cuda:0', dtype=torch.float16) | |
return arg_0, arg_1, | |
def call(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
stream0 = get_cuda_stream(0) | |
triton_poi_fused_split_with_sizes_11.run(*args, 25690112, grid=grid(25690112), stream=stream0) | |
def benchmark_all_configs(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
return triton_poi_fused_split_with_sizes_11.benchmark_all_configs(*args, 25690112, grid=grid(25690112)) | |
if __name__ == '__main__': | |
from torch._inductor.utils import get_num_bytes | |
from triton.testing import do_bench | |
args = get_args() | |
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9 | |
gb_per_s = num_gb / (ms / 1e3) | |
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
''') | |
# kernel path: /tmp/torchinductor_shunting/wk/cwkoecpvylkd2zd4p2qzrsxvdjpmlhjy2vqltsena3lm7tbpxnqv.py | |
# Original ATen: aten._to_copy | |
# aten._to_copy => convert_element_type_10 | |
# Triton source (passed as text to async_compile.triton) for an elementwise
# copy of 1536 elements. Each lane loads in_ptr0[x0] under xmask, applies
# .to(tl.float32) and stores the value to out_ptr0[x0] under the same mask.
# The decorator signature declares argument 0 as '*fp32' and argument 1 as
# '*fp16'; get_args() builds matching (96, 16, 1, 1) float32 / float16
# buffers (96*16 = 1536 elements).
# NOTE(review): the narrowing to fp16 is not spelled out in the kernel body;
# presumably tl.store converts to the pointer's element type -- confirm.
# The xnumel argument is overwritten by the constant on the kernel's first line.
# The embedded script also defines get_args/call/benchmark_all_configs and a
# __main__ section that times call() with do_bench and prints ms, GB and GB/s.
triton_poi_fused__to_copy_12 = async_compile.triton(''' | |
import triton | |
import triton.language as tl | |
from torch._inductor.ir import ReductionHint | |
from torch._inductor.ir import TileHint | |
from torch._inductor.triton_heuristics import pointwise | |
from torch._inductor.utils import instance_descriptor | |
from torch._dynamo.testing import rand_strided | |
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
import torch | |
from torch._inductor.triton_heuristics import grid | |
@pointwise(size_hints=[2048], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]}) | |
@triton.jit | |
def triton_poi_fused__to_copy_12(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr): | |
xnumel = 1536 | |
xoffset = tl.program_id(0) * XBLOCK | |
xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
xmask = xindex < xnumel | |
x0 = xindex | |
tmp0 = tl.load(in_ptr0 + (x0), xmask) | |
tmp1 = tmp0.to(tl.float32) | |
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask) | |
def get_args(): | |
arg_0 = rand_strided((96, 16, 1, 1), (16, 1, 1, 1), device='cuda:0', dtype=torch.float32) | |
arg_1 = rand_strided((96, 16, 1, 1), (16, 1, 1, 1), device='cuda:0', dtype=torch.float16) | |
return arg_0, arg_1, | |
def call(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
stream0 = get_cuda_stream(0) | |
triton_poi_fused__to_copy_12.run(*args, 1536, grid=grid(1536), stream=stream0) | |
def benchmark_all_configs(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
return triton_poi_fused__to_copy_12.benchmark_all_configs(*args, 1536, grid=grid(1536)) | |
if __name__ == '__main__': | |
from torch._inductor.utils import get_num_bytes | |
from triton.testing import do_bench | |
args = get_args() | |
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9 | |
gb_per_s = num_gb / (ms / 1e3) | |
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
''') | |
# kernel path: /tmp/torchinductor_shunting/yj/cyjbfwxbzmq4snbyqnu5ehgb3xenlxtbixoczmyjqmdjbuu22kqa.py | |
# Original ATen: aten.cat | |
# aten.cat => cat | |
# Triton source (passed as text to async_compile.triton) for a pointwise copy
# from a dense buffer into a buffer with a larger batch stride. For every flat
# input index x2 in [0, 154140672) the kernel computes x0 = x2 % 1204224 and
# x1 = x2 // 1204224, loads in_ptr0[x2] and stores it to
# out_ptr0[x0 + 2408448*x1].
# With the buffers built by get_args() both tensors are (128, 96, 112, 112);
# the input batch stride is 1204224 = 96*112*112 and the output batch stride
# is 2408448 = 192*112*112, i.e. each batch row is written into the first
# 1204224 slots of a row twice as long.
# NOTE(review): the output is presumably a 96-channel view of a 192-channel
# concatenation buffer -- the base offset of that view is not visible here.
# NOTE(review): xmask is computed but every load/store passes None as the
# mask; presumably xnumel is a multiple of every XBLOCK the heuristics can
# choose -- confirm before reusing this kernel with another size.
# The embedded script also defines get_args/call/benchmark_all_configs and a
# __main__ section that times call() with do_bench and prints ms, GB and GB/s.
triton_poi_fused_cat_13 = async_compile.triton(''' | |
import triton | |
import triton.language as tl | |
from torch._inductor.ir import ReductionHint | |
from torch._inductor.ir import TileHint | |
from torch._inductor.triton_heuristics import pointwise | |
from torch._inductor.utils import instance_descriptor | |
from torch._dynamo.testing import rand_strided | |
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
import torch | |
from torch._inductor.triton_heuristics import grid | |
@pointwise(size_hints=[268435456], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]}) | |
@triton.jit | |
def triton_poi_fused_cat_13(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr): | |
xnumel = 154140672 | |
xoffset = tl.program_id(0) * XBLOCK | |
xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
xmask = xindex < xnumel | |
x2 = xindex | |
x0 = xindex % 1204224 | |
x1 = (xindex // 1204224) | |
tmp0 = tl.load(in_ptr0 + (x2), None).to(tl.float32) | |
tl.store(out_ptr0 + (x0 + (2408448*x1) + tl.zeros([XBLOCK], tl.int32)), tmp0, None) | |
def get_args(): | |
arg_0 = rand_strided((128, 96, 112, 112), (1204224, 12544, 112, 1), device='cuda:0', dtype=torch.float16) | |
arg_1 = rand_strided((128, 96, 112, 112), (2408448, 12544, 112, 1), device='cuda:0', dtype=torch.float16) | |
return arg_0, arg_1, | |
def call(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
stream0 = get_cuda_stream(0) | |
triton_poi_fused_cat_13.run(*args, 154140672, grid=grid(154140672), stream=stream0) | |
def benchmark_all_configs(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
return triton_poi_fused_cat_13.benchmark_all_configs(*args, 154140672, grid=grid(154140672)) | |
if __name__ == '__main__': | |
from torch._inductor.utils import get_num_bytes | |
from triton.testing import do_bench | |
args = get_args() | |
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9 | |
gb_per_s = num_gb / (ms / 1e3) | |
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
''') | |
# kernel path: /tmp/torchinductor_shunting/nh/cnh3nbcj2c5nraypyukh7gkg7ocvsbjrgj7ok7wc472nt34wy65s.py | |
# Original ATen: aten._native_batch_norm_legit_functional | |
# aten._native_batch_norm_legit_functional => convert_element_type_12, var_mean_3 | |
# Triton source (passed as text to async_compile.triton) for a looped
# reduction that produces partial sums. The x axis has 2496 = 192*13 entries,
# decomposed as x0 = xindex % 192 and x1 = xindex // 192; the reduction axis
# has 123511 entries and is walked in steps of RBLOCK.
# For each (x, r) the kernel forms the flat position p = r2 + 123511*x1 and
# treats it as valid only when p < 1605632 (= 128*112*112). 13*123511 exceeds
# 1605632 slightly, so positions past that bound are masked out of the load
# and replaced by 0 via tl.where.
# Valid positions load in_ptr0[12544*x0 + 2408448*((p // 12544) % 128) +
# p % 12544], which for the (128, 192, 112, 112) input from get_args() is
# element (batch (p // 12544) % 128, channel x0, spatial p % 12544).
# The accumulated sum per x is stored to out_ptr0[x3]; get_args() gives that
# buffer shape (1, 192, 1, 1, 13) with strides (2496, 1, 2496, 2496, 192),
# so x3 = x0 + 192*x1 addresses (channel x0, chunk x1).
# NOTE(review): these look like the first stage of a two-stage per-channel
# mean computation -- confirm against the kernel that consumes out_ptr0.
# The embedded script also defines get_args/call/benchmark_all_configs and a
# __main__ section that times call() with do_bench and prints ms, GB and GB/s.
triton_red_fused__native_batch_norm_legit_functional_14 = async_compile.triton(''' | |
import triton | |
import triton.language as tl | |
from torch._inductor.ir import ReductionHint | |
from torch._inductor.ir import TileHint | |
from torch._inductor.triton_heuristics import reduction | |
from torch._inductor.utils import instance_descriptor | |
from torch._dynamo.testing import rand_strided | |
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
import torch | |
from torch._inductor.triton_heuristics import grid | |
@reduction( | |
size_hints=[4096, 131072], | |
reduction_hint=ReductionHint.INNER, | |
filename=__file__, | |
meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32', 3: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]} | |
) | |
@triton.jit | |
def triton_red_fused__native_batch_norm_legit_functional_14(in_ptr0, out_ptr0, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr): | |
xnumel = 2496 | |
rnumel = 123511 | |
xoffset = tl.program_id(0) * XBLOCK | |
xindex = xoffset + tl.arange(0, XBLOCK)[:, None] | |
xmask = xindex < xnumel | |
rbase = tl.arange(0, RBLOCK)[None, :] | |
x1 = (xindex // 192) | |
x0 = xindex % 192 | |
_tmp6 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0 | |
x3 = xindex | |
for roffset in range(0, rnumel, RBLOCK): | |
rindex = roffset + rbase | |
rmask = rindex < rnumel | |
r2 = rindex | |
tmp0 = r2 + (123511*x1) | |
tmp1 = 1605632 | |
tmp2 = tmp0 < tmp1 | |
tmp3 = tl.load(in_ptr0 + ((12544*x0) + (2408448*(((r2 + (123511*x1)) // 12544) % 128)) + ((r2 + (123511*x1)) % 12544) + tl.zeros([XBLOCK, RBLOCK], tl.int32)), rmask & tmp2 & xmask, eviction_policy='evict_last', other=0).to(tl.float32) | |
tmp4 = tmp3.to(tl.float32) | |
tmp5 = tl.where(tmp2, tmp4, 0) | |
_tmp6 = tl.where(rmask & xmask, _tmp6 + tmp5, _tmp6) | |
tmp6 = tl.sum(_tmp6, 1)[:, None] | |
tl.store(out_ptr0 + x3, tmp6, xmask) | |
def get_args(): | |
arg_0 = rand_strided((128, 192, 112, 112), (2408448, 12544, 112, 1), device='cuda:0', dtype=torch.float16) | |
arg_1 = rand_strided((1, 192, 1, 1, 13), (2496, 1, 2496, 2496, 192), device='cuda:0', dtype=torch.float32) | |
return arg_0, arg_1, | |
def call(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
stream0 = get_cuda_stream(0) | |
triton_red_fused__native_batch_norm_legit_functional_14.run(*args, 2496, 123511, grid=grid(2496), stream=stream0) | |
def benchmark_all_configs(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
return triton_red_fused__native_batch_norm_legit_functional_14.benchmark_all_configs(*args, 2496, 123511, grid=grid(2496)) | |
if __name__ == '__main__': | |
from torch._inductor.utils import get_num_bytes | |
from triton.testing import do_bench | |
args = get_args() | |
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9 | |
gb_per_s = num_gb / (ms / 1e3) | |
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
''') | |
# kernel path: /tmp/torchinductor_shunting/ed/cedbp3dhcc6xvctcve537pwhmiwi2ebf5oi5f5bppm5whslbp34w.py | |
# Original ATen: aten._native_batch_norm_legit_functional | |
# aten._native_batch_norm_legit_functional => add_18, convert_element_type_12, mul_22, mul_23, var_mean_3 | |
# Triton source (passed as text to async_compile.triton) for a persistent
# reduction over 13 values for each of 192 x entries (RBLOCK is fixed at 16,
# with rmask hiding the 3 padding lanes).
# Per x0 the kernel:
#   * loads in_ptr0[x0 + 192*r1] for r1 in [0, 13) and sums them (tmp3);
#   * divides the sum by 1605632.0 (tmp5) and stores it to in_out_ptr0[x0];
#   * stores tmp5*0.1 + in_ptr1[x0]*0.9 (tmp11) to out_ptr0[x0].
# in_out_ptr0 is listed in mutated_arg_names and is only written, never read,
# inside this kernel.
# NOTE(review): tmp5 is presumably the per-channel mean built from 13 partial
# sums, and tmp11 a running-mean update with momentum 0.1 -- confirm against
# the producer of in_ptr0 and the batch-norm configuration.
# The embedded script also defines get_args/call/benchmark_all_configs and a
# __main__ section that times call() with do_bench (get_num_bytes is called
# with num_in_out_args=1 here) and prints ms, GB and GB/s.
triton_per_fused__native_batch_norm_legit_functional_15 = async_compile.triton(''' | |
import triton | |
import triton.language as tl | |
from torch._inductor.ir import ReductionHint | |
from torch._inductor.ir import TileHint | |
from torch._inductor.triton_heuristics import persistent_reduction | |
from torch._inductor.utils import instance_descriptor | |
from torch._dynamo.testing import rand_strided | |
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
import torch | |
from torch._inductor.triton_heuristics import grid | |
@persistent_reduction( | |
size_hints=[256, 16], | |
reduction_hint=ReductionHint.INNER, | |
filename=__file__, | |
meta={'signature': {0: '*fp32', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: 'i32', 5: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4), equal_to_1=())]} | |
) | |
@triton.jit | |
def triton_per_fused__native_batch_norm_legit_functional_15(in_out_ptr0, in_ptr0, in_ptr1, out_ptr0, xnumel, rnumel, XBLOCK : tl.constexpr): | |
xnumel = 192 | |
rnumel = 13 | |
RBLOCK: tl.constexpr = 16 | |
xoffset = tl.program_id(0) * XBLOCK | |
xindex = xoffset + tl.arange(0, XBLOCK)[:, None] | |
xmask = xindex < xnumel | |
rindex = tl.arange(0, RBLOCK)[None, :] | |
rmask = rindex < rnumel | |
r1 = rindex | |
x0 = xindex | |
tmp0 = tl.load(in_ptr0 + (x0 + (192*r1)), rmask & xmask, other=0) | |
tmp8 = tl.load(in_ptr1 + (x0), xmask) | |
tmp2 = tl.where(rmask & xmask, tmp0, 0) | |
tmp3 = tl.sum(tmp2, 1)[:, None] | |
tmp4 = 1605632.0 | |
tmp5 = tmp3 / tmp4 | |
tmp6 = 0.1 | |
tmp7 = tmp5 * tmp6 | |
tmp9 = 0.9 | |
tmp10 = tmp8 * tmp9 | |
tmp11 = tmp7 + tmp10 | |
tl.store(in_out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp5, xmask) | |
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp11, xmask) | |
def get_args(): | |
arg_0 = rand_strided((1, 192, 1, 1), (192, 1, 192, 192), device='cuda:0', dtype=torch.float32) | |
arg_1 = rand_strided((1, 192, 1, 1, 13), (2496, 1, 2496, 2496, 192), device='cuda:0', dtype=torch.float32) | |
arg_2 = rand_strided((192,), (1,), device='cuda:0', dtype=torch.float32) | |
arg_3 = rand_strided((192,), (1,), device='cuda:0', dtype=torch.float32) | |
return arg_0, arg_1, arg_2, arg_3, | |
def call(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
stream0 = get_cuda_stream(0) | |
triton_per_fused__native_batch_norm_legit_functional_15.run(*args, 192, 13, grid=grid(192), stream=stream0) | |
def benchmark_all_configs(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
return triton_per_fused__native_batch_norm_legit_functional_15.benchmark_all_configs(*args, 192, 13, grid=grid(192)) | |
if __name__ == '__main__': | |
from torch._inductor.utils import get_num_bytes | |
from triton.testing import do_bench | |
args = get_args() | |
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9 | |
gb_per_s = num_gb / (ms / 1e3) | |
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
''') | |
# kernel path: /tmp/torchinductor_shunting/jk/cjk2vm3446xrk7rth7hr6pun7xxo3dnzubwcn6ydrpifal4eykrz.py | |
# Original ATen: aten._native_batch_norm_legit_functional | |
# aten._native_batch_norm_legit_functional => convert_element_type_12, var_mean_3 | |
# Triton source (passed as text to async_compile.triton) for a looped
# reduction that produces partial sums of squared differences. The x axis has
# 2496 = 192*13 entries, decomposed as x0 = xindex % 192 and
# x1 = xindex // 192; the reduction axis has 123511 entries walked in steps
# of RBLOCK.
# For each (x, r) the flat position p = r2 + 123511*x1 is valid only when
# p < 1605632 (= 128*112*112); invalid positions are masked out of both loads
# and contribute 0 through tl.where.
# Valid positions load in_ptr0[12544*x0 + 2408448*((p // 12544) % 128) +
# p % 12544] and in_ptr1[x0], then accumulate (value - in_ptr1[x0])**2.
# The per-x total is stored to out_ptr0[x3]; get_args() gives that buffer
# shape (1, 192, 1, 1, 13) with strides (2496, 1, 2496, 2496, 192).
# NOTE(review): in_ptr1 is presumably the per-channel mean, making these the
# partial sums for a two-stage variance computation -- confirm against the
# caller.
# The embedded script also defines get_args/call/benchmark_all_configs and a
# __main__ section that times call() with do_bench and prints ms, GB and GB/s.
triton_red_fused__native_batch_norm_legit_functional_16 = async_compile.triton(''' | |
import triton | |
import triton.language as tl | |
from torch._inductor.ir import ReductionHint | |
from torch._inductor.ir import TileHint | |
from torch._inductor.triton_heuristics import reduction | |
from torch._inductor.utils import instance_descriptor | |
from torch._dynamo.testing import rand_strided | |
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
import torch | |
from torch._inductor.triton_heuristics import grid | |
@reduction( | |
size_hints=[4096, 131072], | |
reduction_hint=ReductionHint.INNER, | |
filename=__file__, | |
meta={'signature': {0: '*fp16', 1: '*fp32', 2: '*fp32', 3: 'i32', 4: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3), equal_to_1=())]} | |
) | |
@triton.jit | |
def triton_red_fused__native_batch_norm_legit_functional_16(in_ptr0, in_ptr1, out_ptr0, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr): | |
xnumel = 2496 | |
rnumel = 123511 | |
xoffset = tl.program_id(0) * XBLOCK | |
xindex = xoffset + tl.arange(0, XBLOCK)[:, None] | |
xmask = xindex < xnumel | |
rbase = tl.arange(0, RBLOCK)[None, :] | |
x1 = (xindex // 192) | |
x0 = xindex % 192 | |
_tmp9 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0 | |
x3 = xindex | |
for roffset in range(0, rnumel, RBLOCK): | |
rindex = roffset + rbase | |
rmask = rindex < rnumel | |
r2 = rindex | |
tmp0 = r2 + (123511*x1) | |
tmp1 = 1605632 | |
tmp2 = tmp0 < tmp1 | |
tmp3 = tl.load(in_ptr0 + ((12544*x0) + (2408448*(((r2 + (123511*x1)) // 12544) % 128)) + ((r2 + (123511*x1)) % 12544) + tl.zeros([XBLOCK, RBLOCK], tl.int32)), rmask & tmp2 & xmask, eviction_policy='evict_last', other=0).to(tl.float32) | |
tmp4 = tmp3.to(tl.float32) | |
tmp5 = tl.load(in_ptr1 + (x0 + tl.zeros([XBLOCK, RBLOCK], tl.int32)), rmask & tmp2 & xmask, eviction_policy='evict_last', other=0) | |
tmp6 = tmp4 - tmp5 | |
tmp7 = tmp6 * tmp6 | |
tmp8 = tl.where(tmp2, tmp7, 0) | |
_tmp9 = tl.where(rmask & xmask, _tmp9 + tmp8, _tmp9) | |
tmp9 = tl.sum(_tmp9, 1)[:, None] | |
tl.store(out_ptr0 + x3, tmp9, xmask) | |
def get_args(): | |
arg_0 = rand_strided((128, 192, 112, 112), (2408448, 12544, 112, 1), device='cuda:0', dtype=torch.float16) | |
arg_1 = rand_strided((1, 192, 1, 1), (192, 1, 192, 192), device='cuda:0', dtype=torch.float32) | |
arg_2 = rand_strided((1, 192, 1, 1, 13), (2496, 1, 2496, 2496, 192), device='cuda:0', dtype=torch.float32) | |
return arg_0, arg_1, arg_2, | |
def call(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
stream0 = get_cuda_stream(0) | |
triton_red_fused__native_batch_norm_legit_functional_16.run(*args, 2496, 123511, grid=grid(2496), stream=stream0) | |
def benchmark_all_configs(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
return triton_red_fused__native_batch_norm_legit_functional_16.benchmark_all_configs(*args, 2496, 123511, grid=grid(2496)) | |
if __name__ == '__main__': | |
from torch._inductor.utils import get_num_bytes | |
from triton.testing import do_bench | |
args = get_args() | |
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9 | |
gb_per_s = num_gb / (ms / 1e3) | |
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
''') | |
# kernel path: /tmp/torchinductor_shunting/r2/cr2l4iahxzbm4xda53lhc7nels7ttppa4wfso67ohv6lwu4ut2ei.py | |
# Original ATen: aten._native_batch_norm_legit_functional | |
# aten._native_batch_norm_legit_functional => add_17, add_19, convert_element_type_12, mul_24, mul_25, mul_26, rsqrt_3, squeeze_10, var_mean_3 | |
# Triton source (passed as text to async_compile.triton) for a persistent
# reduction over 13 values for each of 192 x entries (RBLOCK is fixed at 16,
# with rmask hiding the 3 padding lanes).
# Per x0 the kernel:
#   * loads in_ptr0[x0 + 192*r1] for r1 in [0, 13) and sums them (tmp3),
#     storing the raw sum to out_ptr0[x0];
#   * computes tmp5 = tmp3 / 1605632.0 and stores rsqrt(tmp5 + 1e-05) to
#     out_ptr1[x0];
#   * stores tmp5*1.0000006228081046*0.1 + in_ptr1[x0]*0.9 to out_ptr2[x0].
# The constant 1.0000006228081046 is numerically 1605632/1605631.
# NOTE(review): presumably tmp5 is the biased per-channel variance, the
# 1605632/1605631 factor is the unbiased (N/(N-1)) correction, and the last
# store is a running-variance update with momentum 0.1 -- confirm against the
# batch-norm configuration.
# The embedded script also defines get_args/call/benchmark_all_configs and a
# __main__ section that times call() with do_bench and prints ms, GB and GB/s.
triton_per_fused__native_batch_norm_legit_functional_17 = async_compile.triton(''' | |
import triton | |
import triton.language as tl | |
from torch._inductor.ir import ReductionHint | |
from torch._inductor.ir import TileHint | |
from torch._inductor.triton_heuristics import persistent_reduction | |
from torch._inductor.utils import instance_descriptor | |
from torch._dynamo.testing import rand_strided | |
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
import torch | |
from torch._inductor.triton_heuristics import grid | |
@persistent_reduction( | |
size_hints=[256, 16], | |
reduction_hint=ReductionHint.INNER, | |
filename=__file__, | |
meta={'signature': {0: '*fp32', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: 'i32', 6: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5), equal_to_1=())]} | |
) | |
@triton.jit | |
def triton_per_fused__native_batch_norm_legit_functional_17(in_ptr0, in_ptr1, out_ptr0, out_ptr1, out_ptr2, xnumel, rnumel, XBLOCK : tl.constexpr): | |
xnumel = 192 | |
rnumel = 13 | |
RBLOCK: tl.constexpr = 16 | |
xoffset = tl.program_id(0) * XBLOCK | |
xindex = xoffset + tl.arange(0, XBLOCK)[:, None] | |
xmask = xindex < xnumel | |
rindex = tl.arange(0, RBLOCK)[None, :] | |
rmask = rindex < rnumel | |
r1 = rindex | |
x0 = xindex | |
tmp0 = tl.load(in_ptr0 + (x0 + (192*r1)), rmask & xmask, other=0) | |
tmp13 = tl.load(in_ptr1 + (x0), xmask) | |
tmp2 = tl.where(rmask & xmask, tmp0, 0) | |
tmp3 = tl.sum(tmp2, 1)[:, None] | |
tmp4 = 1605632.0 | |
tmp5 = tmp3 / tmp4 | |
tmp6 = 1e-05 | |
tmp7 = tmp5 + tmp6 | |
tmp8 = tl.math.rsqrt(tmp7) | |
tmp9 = 1.0000006228081046 | |
tmp10 = tmp5 * tmp9 | |
tmp11 = 0.1 | |
tmp12 = tmp10 * tmp11 | |
tmp14 = 0.9 | |
tmp15 = tmp13 * tmp14 | |
tmp16 = tmp12 + tmp15 | |
tl.store(out_ptr1 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp8, xmask) | |
tl.store(out_ptr2 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp16, xmask) | |
tl.store(out_ptr0 + x0, tmp3, xmask) | |
def get_args(): | |
arg_0 = rand_strided((1, 192, 1, 1, 13), (2496, 1, 2496, 2496, 192), device='cuda:0', dtype=torch.float32) | |
arg_1 = rand_strided((192,), (1,), device='cuda:0', dtype=torch.float32) | |
arg_2 = rand_strided((1, 192, 1, 1), (192, 1, 192, 192), device='cuda:0', dtype=torch.float32) | |
arg_3 = rand_strided((192,), (1,), device='cuda:0', dtype=torch.float32) | |
arg_4 = rand_strided((192,), (1,), device='cuda:0', dtype=torch.float32) | |
return arg_0, arg_1, arg_2, arg_3, arg_4, | |
def call(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
stream0 = get_cuda_stream(0) | |
triton_per_fused__native_batch_norm_legit_functional_17.run(*args, 192, 13, grid=grid(192), stream=stream0) | |
def benchmark_all_configs(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
return triton_per_fused__native_batch_norm_legit_functional_17.benchmark_all_configs(*args, 192, 13, grid=grid(192)) | |
if __name__ == '__main__': | |
from torch._inductor.utils import get_num_bytes | |
from triton.testing import do_bench | |
args = get_args() | |
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9 | |
gb_per_s = num_gb / (ms / 1e3) | |
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
''') | |
# kernel path: /tmp/torchinductor_shunting/7k/c7k35ubu25j3ryiaxnuo2fjojvhqvy67tox3wtaa4knc5l4vzqby.py | |
# Original ATen: aten._native_batch_norm_legit_functional, aten.relu, aten.threshold_backward | |
# aten._native_batch_norm_legit_functional => add_17, add_20, convert_element_type_12, convert_element_type_13, mul_21, mul_27, rsqrt_3, sub_3, var_mean_3 | |
# aten.relu => relu_2 | |
# aten.threshold_backward => le_4 | |
# Triton source (passed as text to async_compile.triton) for a pointwise
# normalize + affine + ReLU kernel over 308281344 (= 128*192*112*112)
# elements. For flat index x3 the per-channel index is
# x1 = (x3 // 12544) % 192, and the kernel computes
#   y = (in_ptr0[x3] - in_ptr1[x1])
#       * rsqrt(in_ptr2[x1] / 1605632.0 + 1e-05)
#       * in_ptr3[x1] + in_ptr4[x1]
# then clamps negatives to 0 via the nested tl.where (the outer `0 != 0`
# condition is constant-false, so only the inner clamp takes effect).
# Outputs: out_ptr0[x3] receives the clamped value (declared '*fp16' in the
# signature) and out_ptr1[x3] receives the boolean `clamped <= 0.0`
# (declared '*i1').
# NOTE(review): in_ptr1 is presumably the channel mean, in_ptr2 the channel
# sum of squared deviations, and in_ptr3/in_ptr4 the batch-norm weight and
# bias; the mask presumably feeds the ReLU backward pass -- confirm against
# the caller.
# NOTE(review): xmask is computed but every load/store passes None as the
# mask; presumably xnumel is a multiple of every XBLOCK the heuristics can
# choose -- confirm before reusing this kernel with another size.
# The embedded script also defines get_args/call/benchmark_all_configs and a
# __main__ section that times call() with do_bench and prints ms, GB and GB/s.
triton_poi_fused__native_batch_norm_legit_functional_relu_threshold_backward_18 = async_compile.triton(''' | |
import triton | |
import triton.language as tl | |
from torch._inductor.ir import ReductionHint | |
from torch._inductor.ir import TileHint | |
from torch._inductor.triton_heuristics import pointwise | |
from torch._inductor.utils import instance_descriptor | |
from torch._dynamo.testing import rand_strided | |
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
import torch | |
from torch._inductor.triton_heuristics import grid | |
@pointwise(size_hints=[536870912], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp16', 6: '*i1', 7: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7), equal_to_1=())]}) | |
@triton.jit | |
def triton_poi_fused__native_batch_norm_legit_functional_relu_threshold_backward_18(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, out_ptr0, out_ptr1, xnumel, XBLOCK : tl.constexpr): | |
xnumel = 308281344 | |
xoffset = tl.program_id(0) * XBLOCK | |
xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
xmask = xindex < xnumel | |
x3 = xindex | |
x1 = (xindex // 12544) % 192 | |
tmp0 = tl.load(in_ptr0 + (x3), None).to(tl.float32) | |
tmp2 = tl.load(in_ptr1 + (x1), None) | |
tmp4 = tl.load(in_ptr2 + (x1), None) | |
tmp11 = tl.load(in_ptr3 + (x1), None) | |
tmp13 = tl.load(in_ptr4 + (x1), None) | |
tmp1 = tmp0.to(tl.float32) | |
tmp3 = tmp1 - tmp2 | |
tmp5 = 1605632.0 | |
tmp6 = tmp4 / tmp5 | |
tmp7 = 1e-05 | |
tmp8 = tmp6 + tmp7 | |
tmp9 = tl.math.rsqrt(tmp8) | |
tmp10 = tmp3 * tmp9 | |
tmp12 = tmp10 * tmp11 | |
tmp14 = tmp12 + tmp13 | |
tmp15 = tmp14.to(tl.float32) | |
tmp16 = tl.where(0 != 0, 0, tl.where(0 > tmp15, 0, tmp15)) | |
tmp17 = 0.0 | |
tmp18 = tmp16 <= tmp17 | |
tl.store(out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp16, None) | |
tl.store(out_ptr1 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp18, None) | |
def get_args(): | |
arg_0 = rand_strided((128, 192, 112, 112), (2408448, 12544, 112, 1), device='cuda:0', dtype=torch.float16) | |
arg_1 = rand_strided((1, 192, 1, 1), (192, 1, 192, 192), device='cuda:0', dtype=torch.float32) | |
arg_2 = rand_strided((1, 192, 1, 1), (192, 1, 192, 192), device='cuda:0', dtype=torch.float32) | |
arg_3 = rand_strided((192,), (1,), device='cuda:0', dtype=torch.float32) | |
arg_4 = rand_strided((192,), (1,), device='cuda:0', dtype=torch.float32) | |
arg_5 = rand_strided((128, 192, 112, 112), (2408448, 12544, 112, 1), device='cuda:0', dtype=torch.float16) | |
arg_6 = rand_strided((128, 192, 112, 112), (2408448, 12544, 112, 1), device='cuda:0', dtype=torch.bool) | |
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, | |
def call(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
stream0 = get_cuda_stream(0) | |
triton_poi_fused__native_batch_norm_legit_functional_relu_threshold_backward_18.run(*args, 308281344, grid=grid(308281344), stream=stream0) | |
def benchmark_all_configs(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
return triton_poi_fused__native_batch_norm_legit_functional_relu_threshold_backward_18.benchmark_all_configs(*args, 308281344, grid=grid(308281344)) | |
if __name__ == '__main__': | |
from torch._inductor.utils import get_num_bytes | |
from triton.testing import do_bench | |
args = get_args() | |
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9 | |
gb_per_s = num_gb / (ms / 1e3) | |
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
''') | |
# kernel path: /tmp/torchinductor_shunting/db/cdbk64vwqs4pzrtrb762fxgfklb7q573zm3neisxuhpcneczshvk.py | |
# Original ATen: aten._to_copy | |
# aten._to_copy => convert_element_type_14 | |
# Triton source (passed as text to async_compile.triton) for an elementwise
# copy of 576 elements. Each lane loads in_ptr0[x0] under xmask, applies
# .to(tl.float32) and stores the value to out_ptr0[x0] under the same mask.
# The decorator signature declares argument 0 as '*fp32' and argument 1 as
# '*fp16'; get_args() builds matching (64, 1, 3, 3) float32 / float16 buffers
# (64*9 = 576 elements).
# NOTE(review): the narrowing to fp16 is not spelled out in the kernel body;
# presumably tl.store converts to the pointer's element type -- confirm.
# The xnumel argument is overwritten by the constant on the kernel's first line.
# The embedded script also defines get_args/call/benchmark_all_configs and a
# __main__ section that times call() with do_bench and prints ms, GB and GB/s.
triton_poi_fused__to_copy_19 = async_compile.triton(''' | |
import triton | |
import triton.language as tl | |
from torch._inductor.ir import ReductionHint | |
from torch._inductor.ir import TileHint | |
from torch._inductor.triton_heuristics import pointwise | |
from torch._inductor.utils import instance_descriptor | |
from torch._dynamo.testing import rand_strided | |
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
import torch | |
from torch._inductor.triton_heuristics import grid | |
@pointwise(size_hints=[1024], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]}) | |
@triton.jit | |
def triton_poi_fused__to_copy_19(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr): | |
xnumel = 576 | |
xoffset = tl.program_id(0) * XBLOCK | |
xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
xmask = xindex < xnumel | |
x0 = xindex | |
tmp0 = tl.load(in_ptr0 + (x0), xmask) | |
tmp1 = tmp0.to(tl.float32) | |
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask) | |
def get_args(): | |
arg_0 = rand_strided((64, 1, 3, 3), (9, 9, 3, 1), device='cuda:0', dtype=torch.float32) | |
arg_1 = rand_strided((64, 1, 3, 3), (9, 9, 3, 1), device='cuda:0', dtype=torch.float16) | |
return arg_0, arg_1, | |
def call(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
stream0 = get_cuda_stream(0) | |
triton_poi_fused__to_copy_19.run(*args, 576, grid=grid(576), stream=stream0) | |
def benchmark_all_configs(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
return triton_poi_fused__to_copy_19.benchmark_all_configs(*args, 576, grid=grid(576)) | |
if __name__ == '__main__': | |
from torch._inductor.utils import get_num_bytes | |
from triton.testing import do_bench | |
args = get_args() | |
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9 | |
gb_per_s = num_gb / (ms / 1e3) | |
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
''') | |
# kernel path: /tmp/torchinductor_shunting/fq/cfq5oxsb4jr7dbzocvxxr77nzkg2jk4k3rsmhwdpzkoisxiavav5.py | |
# Original ATen: aten._to_copy | |
# aten._to_copy => convert_element_type_15 | |
# Triton source (passed as text to async_compile.triton) for an elementwise
# copy of 1600 elements. Each lane loads in_ptr0[x0] under xmask, applies
# .to(tl.float32) and stores the value to out_ptr0[x0] under the same mask.
# The decorator signature declares argument 0 as '*fp32' and argument 1 as
# '*fp16'; get_args() builds matching (64, 1, 5, 5) float32 / float16 buffers
# (64*25 = 1600 elements).
# NOTE(review): the narrowing to fp16 is not spelled out in the kernel body;
# presumably tl.store converts to the pointer's element type -- confirm.
# The xnumel argument is overwritten by the constant on the kernel's first line.
# The embedded script also defines get_args/call/benchmark_all_configs and a
# __main__ section that times call() with do_bench and prints ms, GB and GB/s.
triton_poi_fused__to_copy_20 = async_compile.triton(''' | |
import triton | |
import triton.language as tl | |
from torch._inductor.ir import ReductionHint | |
from torch._inductor.ir import TileHint | |
from torch._inductor.triton_heuristics import pointwise | |
from torch._inductor.utils import instance_descriptor | |
from torch._dynamo.testing import rand_strided | |
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
import torch | |
from torch._inductor.triton_heuristics import grid | |
@pointwise(size_hints=[2048], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]}) | |
@triton.jit | |
def triton_poi_fused__to_copy_20(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr): | |
xnumel = 1600 | |
xoffset = tl.program_id(0) * XBLOCK | |
xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
xmask = xindex < xnumel | |
x0 = xindex | |
tmp0 = tl.load(in_ptr0 + (x0), xmask) | |
tmp1 = tmp0.to(tl.float32) | |
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask) | |
def get_args(): | |
arg_0 = rand_strided((64, 1, 5, 5), (25, 25, 5, 1), device='cuda:0', dtype=torch.float32) | |
arg_1 = rand_strided((64, 1, 5, 5), (25, 25, 5, 1), device='cuda:0', dtype=torch.float16) | |
return arg_0, arg_1, | |
def call(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
stream0 = get_cuda_stream(0) | |
triton_poi_fused__to_copy_20.run(*args, 1600, grid=grid(1600), stream=stream0) | |
def benchmark_all_configs(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
return triton_poi_fused__to_copy_20.benchmark_all_configs(*args, 1600, grid=grid(1600)) | |
if __name__ == '__main__': | |
from torch._inductor.utils import get_num_bytes | |
from triton.testing import do_bench | |
args = get_args() | |
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9 | |
gb_per_s = num_gb / (ms / 1e3) | |
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
''') | |
# kernel path: /tmp/torchinductor_shunting/fo/cfoj5fwz53k7xkfcu6z676snb5mum4wtda3diyiuxkkhrnlissnf.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_16
#
# Pointwise copy of 3136 elements (64*1*7*7, the shape built by get_args) from
# an fp32 buffer to an fp16 buffer (pointer types per the `signature` meta).
# - Inside the kernel the value is only cast to tl.float32; the narrowing to
#   fp16 is left to tl.store writing through the fp16 `out_ptr0`.
# - `xnumel` is overwritten with the literal 3136 on entry, so the launch
#   argument of the same name does not affect the bounds mask.
# - `+ tl.zeros([XBLOCK], tl.int32)` adds zero to the store offset; the address
#   is unchanged.
# The embedded source doubles as a standalone benchmark script (get_args /
# call / benchmark_all_configs / __main__).
# NOTE(review): indentation/blank lines inside the string were restored from a
# flattened copy -- confirm against the generated file at the kernel path.
triton_poi_fused__to_copy_21 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor

from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[4096], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_21(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 3136
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)


def get_args():
    arg_0 = rand_strided((64, 1, 7, 7), (49, 49, 7, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((64, 1, 7, 7), (49, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_21.run(*args, 3136, grid=grid(3136), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_21.benchmark_all_configs(*args, 3136, grid=grid(3136))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/ye/cyesscl4aqq7rzyjhll5ppohr4h424gcitducugsmauai5wnvgy5.py
# Original ATen: aten.cat
# aten.cat => cat_1
#
# Pointwise copy of 25690112 fp16 elements (128 * 200704) from a densely
# indexed source into a destination with a larger outer stride:
#   source offset      = xindex
#   destination offset = (xindex % 200704) + 602112 * (xindex // 200704)
# get_args models this as a (128, 64, 56, 56) tensor with batch stride 200704
# written into a (128, 64, 56, 56) view with batch stride 602112.
# - `xmask` is computed but unused: both the load and the store pass `None` as
#   the mask. NOTE(review): presumably safe because every XBLOCK the heuristic
#   may pick divides xnumel -- confirm.
# - The loaded value is upcast to tl.float32; the fp16 `out_ptr0` (per the
#   `signature` meta) receives it through tl.store.
# The embedded source doubles as a standalone benchmark script (get_args /
# call / benchmark_all_configs / __main__).
# NOTE(review): indentation/blank lines inside the string were restored from a
# flattened copy -- confirm against the generated file at the kernel path.
triton_poi_fused_cat_22 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor

from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[33554432], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_cat_22(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 25690112
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x2 = xindex
    x0 = xindex % 200704
    x1 = (xindex // 200704)
    tmp0 = tl.load(in_ptr0 + (x2), None).to(tl.float32)
    tl.store(out_ptr0 + (x0 + (602112*x1) + tl.zeros([XBLOCK], tl.int32)), tmp0, None)


def get_args():
    arg_0 = rand_strided((128, 64, 56, 56), (200704, 3136, 56, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 64, 56, 56), (602112, 3136, 56, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused_cat_22.run(*args, 25690112, grid=grid(25690112), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused_cat_22.benchmark_all_configs(*args, 25690112, grid=grid(25690112))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/zm/czmjlzsknic276jj2ulhzekdx7suavkcesp75xqpnfbntqprzsrx.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => convert_element_type_17, var_mean_4
#
# Looped reduction producing 768 partial sums of an fp16 input, accumulated in
# fp32. The x index is split as x0 = xindex % 192 and x1 = xindex // 192
# (768 = 192 * 4), and each x position sums rnumel = 100352 elements read at
#   3136*x0 + 602112*(r // 3136) + 19267584*x1 + (r % 3136)
# With the (128, 192, 56, 56) / (602112, 3136, 56, 1) input built by get_args,
# x0 addresses dim 1 and each x1 covers a run of 32 entries of dim 0
# (19267584 = 32 * 602112, 100352 = 32 * 3136).
# - The `tl.where(rmask & xmask, ...)` keeps out-of-range lanes from changing
#   the accumulator; the masked load also substitutes 0 for them (`other=0`).
# - The per-lane accumulator `_tmp2` is collapsed with tl.sum only after the
#   loop, and the result is stored at `out_ptr0 + xindex`.
# NOTE(review): the op name suggests this is the first stage of a batch-norm
# mean computation -- confirm against the caller.
# NOTE(review): indentation/blank lines inside the string (including the extent
# of the `for` body) were restored from a flattened copy -- confirm against the
# generated file at the kernel path.
triton_red_fused__native_batch_norm_legit_functional_23 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor

from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@reduction(
    size_hints=[1024, 131072],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32', 3: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_23(in_ptr0, out_ptr0, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
    xnumel = 768
    rnumel = 100352
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rbase = tl.arange(0, RBLOCK)[None, :]
    x0 = xindex % 192
    x1 = (xindex // 192)
    _tmp2 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
    x3 = xindex
    for roffset in range(0, rnumel, RBLOCK):
        rindex = roffset + rbase
        rmask = rindex < rnumel
        r2 = rindex
        tmp0 = tl.load(in_ptr0 + ((3136*x0) + (602112*(r2 // 3136)) + (19267584*x1) + (r2 % 3136)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp1 = tmp0.to(tl.float32)
        _tmp2 = tl.where(rmask & xmask, _tmp2 + tmp1, _tmp2)
    tmp2 = tl.sum(_tmp2, 1)[:, None]
    tl.store(out_ptr0 + x3, tmp2, xmask)


def get_args():
    arg_0 = rand_strided((128, 192, 56, 56), (602112, 3136, 56, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((1, 192, 1, 1, 4), (768, 1, 768, 768, 192), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_red_fused__native_batch_norm_legit_functional_23.run(*args, 768, 100352, grid=grid(768), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_red_fused__native_batch_norm_legit_functional_23.benchmark_all_configs(*args, 768, 100352, grid=grid(768))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/zp/czpkjhwvqvlhuxsldhiwsf77gmyfxqbtysyyhp7lberdnzehm42l.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => add_23, convert_element_type_17, mul_29, mul_30, var_mean_4
#
# Persistent reduction over 192 x positions with 4 values each (RBLOCK = 4, so
# the whole reduction axis is handled in one tile without a loop).
# For every x0 it:
#   1. sums in_ptr0[x0 + 192*r] for r in 0..3,
#   2. divides the sum by 401408.0 (tmp5) and stores it to in_out_ptr0[x0],
#   3. stores 0.1 * tmp5 + 0.9 * in_ptr1[x0] to out_ptr0[x0].
# - `in_out_ptr0` is listed in `mutated_arg_names` but is only written here,
#   never loaded.
# - 401408.0 equals 128*56*56. NOTE(review): presumably the number of elements
#   reduced per channel, making tmp5 the batch mean and step 3 a running-mean
#   update with momentum 0.1 -- confirm against the caller.
# - The benchmark harness passes num_in_out_args=1 to get_num_bytes for the
#   in/out argument.
# NOTE(review): indentation/blank lines inside the string were restored from a
# flattened copy -- confirm against the generated file at the kernel path.
triton_per_fused__native_batch_norm_legit_functional_24 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import persistent_reduction
from torch._inductor.utils import instance_descriptor

from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@persistent_reduction(
    size_hints=[256, 4],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*fp32', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: 'i32', 5: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4), equal_to_1=())]}
)
@triton.jit
def triton_per_fused__native_batch_norm_legit_functional_24(in_out_ptr0, in_ptr0, in_ptr1, out_ptr0, xnumel, rnumel, XBLOCK : tl.constexpr):
    xnumel = 192
    rnumel = 4
    RBLOCK: tl.constexpr = 4
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rindex = tl.arange(0, RBLOCK)[None, :]
    rmask = rindex < rnumel
    r1 = rindex
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0 + (192*r1)), rmask & xmask, other=0)
    tmp8 = tl.load(in_ptr1 + (x0), xmask)
    tmp2 = tl.where(rmask & xmask, tmp0, 0)
    tmp3 = tl.sum(tmp2, 1)[:, None]
    tmp4 = 401408.0
    tmp5 = tmp3 / tmp4
    tmp6 = 0.1
    tmp7 = tmp5 * tmp6
    tmp9 = 0.9
    tmp10 = tmp8 * tmp9
    tmp11 = tmp7 + tmp10
    tl.store(in_out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp5, xmask)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp11, xmask)


def get_args():
    arg_0 = rand_strided((1, 192, 1, 1), (192, 1, 192, 192), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((1, 192, 1, 1, 4), (768, 1, 768, 768, 192), device='cuda:0', dtype=torch.float32)
    arg_2 = rand_strided((192,), (1,), device='cuda:0', dtype=torch.float32)
    arg_3 = rand_strided((192,), (1,), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1, arg_2, arg_3,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_per_fused__native_batch_norm_legit_functional_24.run(*args, 192, 4, grid=grid(192), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_per_fused__native_batch_norm_legit_functional_24.benchmark_all_configs(*args, 192, 4, grid=grid(192))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/4o/c4ohubk3q7bkigrpxxijrzzlhs5bqi6qcn3fxomltgipjmowqgd2.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => convert_element_type_17, var_mean_4
#
# Looped reduction producing 768 partial sums of squared deviations.
# The x index is split as x0 = xindex % 192 and x1 = xindex // 192; for each x
# position the kernel loads one fp32 value in_ptr1[x0] once, then accumulates
# (in_ptr0[...] - in_ptr1[x0]) ** 2 in fp32 over rnumel = 100352 elements read
# at  3136*x0 + 602112*(r // 3136) + 19267584*x1 + (r % 3136).
# - The `tl.where(rmask & xmask, ...)` keeps out-of-range lanes from changing
#   the accumulator (needed here because a masked-out load yields 0, and
#   (0 - in_ptr1[x0]) ** 2 is generally non-zero).
# - The per-lane accumulator `_tmp5` is collapsed with tl.sum only after the
#   loop, and the result is stored at `out_ptr0 + xindex`.
# NOTE(review): in_ptr1 is presumably the per-channel batch mean, making this
# the first stage of the variance computation -- confirm against the caller.
# NOTE(review): indentation/blank lines inside the string (including the extent
# of the `for` body) were restored from a flattened copy -- confirm against the
# generated file at the kernel path.
triton_red_fused__native_batch_norm_legit_functional_25 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor

from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@reduction(
    size_hints=[1024, 131072],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*fp16', 1: '*fp32', 2: '*fp32', 3: 'i32', 4: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_25(in_ptr0, in_ptr1, out_ptr0, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
    xnumel = 768
    rnumel = 100352
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rbase = tl.arange(0, RBLOCK)[None, :]
    x0 = xindex % 192
    x1 = (xindex // 192)
    tmp2 = tl.load(in_ptr1 + (x0), xmask)
    _tmp5 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
    x3 = xindex
    for roffset in range(0, rnumel, RBLOCK):
        rindex = roffset + rbase
        rmask = rindex < rnumel
        r2 = rindex
        tmp0 = tl.load(in_ptr0 + ((3136*x0) + (602112*(r2 // 3136)) + (19267584*x1) + (r2 % 3136)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp1 = tmp0.to(tl.float32)
        tmp3 = tmp1 - tmp2
        tmp4 = tmp3 * tmp3
        _tmp5 = tl.where(rmask & xmask, _tmp5 + tmp4, _tmp5)
    tmp5 = tl.sum(_tmp5, 1)[:, None]
    tl.store(out_ptr0 + x3, tmp5, xmask)


def get_args():
    arg_0 = rand_strided((128, 192, 56, 56), (602112, 3136, 56, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((1, 192, 1, 1), (192, 1, 192, 192), device='cuda:0', dtype=torch.float32)
    arg_2 = rand_strided((1, 192, 1, 1, 4), (768, 1, 768, 768, 192), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1, arg_2,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_red_fused__native_batch_norm_legit_functional_25.run(*args, 768, 100352, grid=grid(768), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_red_fused__native_batch_norm_legit_functional_25.benchmark_all_configs(*args, 768, 100352, grid=grid(768))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/kw/ckworjll2g6m5sbalty5hh27jhwuiyj3iqglt67oatdxz54d3igy.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => add_22, add_24, convert_element_type_17, mul_31, mul_32, mul_33, rsqrt_4, squeeze_13, var_mean_4
#
# Persistent reduction over 192 x positions with 4 values each (RBLOCK = 4).
# For every x0 it sums in_ptr0[x0 + 192*r] for r in 0..3 (tmp3) and writes
# three outputs:
#   out_ptr0[x0] = tmp3                                   (the raw sum)
#   out_ptr1[x0] = rsqrt(tmp3 / 401408.0 + 1e-05)
#   out_ptr2[x0] = 0.1 * (tmp3 / 401408.0) * 1.0000024912370735
#                  + 0.9 * in_ptr1[x0]
# - Note that out_ptr0 receives the undivided sum, not tmp3 / 401408.0.
# - 1.0000024912370735 is numerically 401408 / 401407. NOTE(review):
#   presumably the n / (n - 1) unbiased-variance correction applied before a
#   momentum-0.1 running-variance update, with 1e-05 as the batch-norm eps --
#   confirm against the caller.
# NOTE(review): indentation/blank lines inside the string were restored from a
# flattened copy -- confirm against the generated file at the kernel path.
triton_per_fused__native_batch_norm_legit_functional_26 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import persistent_reduction
from torch._inductor.utils import instance_descriptor

from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@persistent_reduction(
    size_hints=[256, 4],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*fp32', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: 'i32', 6: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5), equal_to_1=())]}
)
@triton.jit
def triton_per_fused__native_batch_norm_legit_functional_26(in_ptr0, in_ptr1, out_ptr0, out_ptr1, out_ptr2, xnumel, rnumel, XBLOCK : tl.constexpr):
    xnumel = 192
    rnumel = 4
    RBLOCK: tl.constexpr = 4
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rindex = tl.arange(0, RBLOCK)[None, :]
    rmask = rindex < rnumel
    r1 = rindex
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0 + (192*r1)), rmask & xmask, other=0)
    tmp13 = tl.load(in_ptr1 + (x0), xmask)
    tmp2 = tl.where(rmask & xmask, tmp0, 0)
    tmp3 = tl.sum(tmp2, 1)[:, None]
    tmp4 = 401408.0
    tmp5 = tmp3 / tmp4
    tmp6 = 1e-05
    tmp7 = tmp5 + tmp6
    tmp8 = tl.math.rsqrt(tmp7)
    tmp9 = 1.0000024912370735
    tmp10 = tmp5 * tmp9
    tmp11 = 0.1
    tmp12 = tmp10 * tmp11
    tmp14 = 0.9
    tmp15 = tmp13 * tmp14
    tmp16 = tmp12 + tmp15
    tl.store(out_ptr1 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp8, xmask)
    tl.store(out_ptr2 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp16, xmask)
    tl.store(out_ptr0 + x0, tmp3, xmask)


def get_args():
    arg_0 = rand_strided((1, 192, 1, 1, 4), (768, 1, 768, 768, 192), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((192,), (1,), device='cuda:0', dtype=torch.float32)
    arg_2 = rand_strided((1, 192, 1, 1), (192, 1, 192, 192), device='cuda:0', dtype=torch.float32)
    arg_3 = rand_strided((192,), (1,), device='cuda:0', dtype=torch.float32)
    arg_4 = rand_strided((192,), (1,), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1, arg_2, arg_3, arg_4,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_per_fused__native_batch_norm_legit_functional_26.run(*args, 192, 4, grid=grid(192), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_per_fused__native_batch_norm_legit_functional_26.benchmark_all_configs(*args, 192, 4, grid=grid(192))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/rb/crbx6irdjcatfhmyhdhxdmxoun65x4lirw3usotlxxknzb3rnzo3.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.relu, aten.threshold_backward
# aten._native_batch_norm_legit_functional => add_22, add_25, convert_element_type_17, convert_element_type_18, mul_28, mul_34, rsqrt_4, sub_4, var_mean_4
# aten.relu => relu_3
# aten.threshold_backward => le_3
#
# Pointwise kernel over 77070336 elements (128*192*56*56 per get_args). For
# each element, with x1 = (xindex // 3136) % 192 selecting the per-channel
# parameters, it computes in fp32:
#   y = (in_ptr0[x] - in_ptr1[x1]) * rsqrt(in_ptr2[x1] / 401408.0 + 1e-05)
#       * in_ptr3[x1] + in_ptr4[x1]
#   out_ptr0[x] = 0 where y < 0, else y
#   out_ptr1[x] = (out_ptr0 value <= 0.0)
# - in_ptr2 is divided by 401408.0 here, i.e. it is consumed as a sum rather
#   than an already-averaged value.
# - In `tl.where(0 != 0, 0, tl.where(0 > tmp15, 0, tmp15))` the outer
#   condition is the constant False, so the result is always the inner where.
# - `xmask` is computed but unused: every load and store passes `None` as the
#   mask. NOTE(review): presumably safe because every XBLOCK the heuristic may
#   pick divides xnumel -- confirm.
# - Per the `signature` meta, out_ptr0 is fp16 and out_ptr1 is i1; the
#   conversions from the fp32 / boolean values are left to tl.store.
# NOTE(review): the op names suggest batch-norm normalisation + affine + ReLU,
# with out_ptr1 saved as the mask for ReLU's backward -- confirm against the
# caller.
# NOTE(review): indentation/blank lines inside the string were restored from a
# flattened copy -- confirm against the generated file at the kernel path.
triton_poi_fused__native_batch_norm_legit_functional_relu_threshold_backward_27 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor

from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[134217728], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp16', 6: '*i1', 7: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7), equal_to_1=())]})
@triton.jit
def triton_poi_fused__native_batch_norm_legit_functional_relu_threshold_backward_27(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, out_ptr0, out_ptr1, xnumel, XBLOCK : tl.constexpr):
    xnumel = 77070336
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x3 = xindex
    x1 = (xindex // 3136) % 192
    tmp0 = tl.load(in_ptr0 + (x3), None).to(tl.float32)
    tmp2 = tl.load(in_ptr1 + (x1), None)
    tmp4 = tl.load(in_ptr2 + (x1), None)
    tmp11 = tl.load(in_ptr3 + (x1), None)
    tmp13 = tl.load(in_ptr4 + (x1), None)
    tmp1 = tmp0.to(tl.float32)
    tmp3 = tmp1 - tmp2
    tmp5 = 401408.0
    tmp6 = tmp4 / tmp5
    tmp7 = 1e-05
    tmp8 = tmp6 + tmp7
    tmp9 = tl.math.rsqrt(tmp8)
    tmp10 = tmp3 * tmp9
    tmp12 = tmp10 * tmp11
    tmp14 = tmp12 + tmp13
    tmp15 = tmp14.to(tl.float32)
    tmp16 = tl.where(0 != 0, 0, tl.where(0 > tmp15, 0, tmp15))
    tmp17 = 0.0
    tmp18 = tmp16 <= tmp17
    tl.store(out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp16, None)
    tl.store(out_ptr1 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp18, None)


def get_args():
    arg_0 = rand_strided((128, 192, 56, 56), (602112, 3136, 56, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((1, 192, 1, 1), (192, 1, 192, 192), device='cuda:0', dtype=torch.float32)
    arg_2 = rand_strided((1, 192, 1, 1), (192, 1, 192, 192), device='cuda:0', dtype=torch.float32)
    arg_3 = rand_strided((192,), (1,), device='cuda:0', dtype=torch.float32)
    arg_4 = rand_strided((192,), (1,), device='cuda:0', dtype=torch.float32)
    arg_5 = rand_strided((128, 192, 56, 56), (602112, 3136, 56, 1), device='cuda:0', dtype=torch.float16)
    arg_6 = rand_strided((128, 192, 56, 56), (602112, 3136, 56, 1), device='cuda:0', dtype=torch.bool)
    return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__native_batch_norm_legit_functional_relu_threshold_backward_27.run(*args, 77070336, grid=grid(77070336), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__native_batch_norm_legit_functional_relu_threshold_backward_27.benchmark_all_configs(*args, 77070336, grid=grid(77070336))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/yr/cyrrsvkikbwrkovp7p3kp54xrx3kv5k3cuej452rtgigaetgnkzf.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_19
#
# Pointwise copy of 1920 elements (20*96*1*1, the shape built by get_args) from
# an fp32 buffer to an fp16 buffer (pointer types per the `signature` meta).
# - Inside the kernel the value is only cast to tl.float32; the narrowing to
#   fp16 is left to tl.store writing through the fp16 `out_ptr0`.
# - `xnumel` is overwritten with the literal 1920 on entry, so the launch
#   argument of the same name does not affect the bounds mask.
# - `+ tl.zeros([XBLOCK], tl.int32)` adds zero to the store offset; the address
#   is unchanged.
# The embedded source doubles as a standalone benchmark script (get_args /
# call / benchmark_all_configs / __main__).
# NOTE(review): indentation/blank lines inside the string were restored from a
# flattened copy -- confirm against the generated file at the kernel path.
triton_poi_fused__to_copy_28 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor

from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[2048], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_28(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 1920
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)


def get_args():
    arg_0 = rand_strided((20, 96, 1, 1), (96, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((20, 96, 1, 1), (96, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_28.run(*args, 1920, grid=grid(1920), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_28.benchmark_all_configs(*args, 1920, grid=grid(1920))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/lx/clxgqhjnos6ucboracnh3kmrmn5mwuc3sgkffihrftfhuwl64wg2.py
# Original ATen: aten.cat
# aten.cat => cat_2
#
# Pointwise copy of 8028160 fp16 elements (128 * 62720) from a densely indexed
# source into a destination with a larger outer stride:
#   source offset      = xindex
#   destination offset = (xindex % 62720) + 125440 * (xindex // 62720)
# get_args models this as a (128, 20, 56, 56) tensor with batch stride 62720
# written into a (128, 20, 56, 56) view with batch stride 125440.
# - `xmask` is computed but unused: both the load and the store pass `None` as
#   the mask. NOTE(review): presumably safe because every XBLOCK the heuristic
#   may pick divides xnumel -- confirm.
# - The loaded value is upcast to tl.float32; the fp16 `out_ptr0` (per the
#   `signature` meta) receives it through tl.store.
# The embedded source doubles as a standalone benchmark script (get_args /
# call / benchmark_all_configs / __main__).
# NOTE(review): indentation/blank lines inside the string were restored from a
# flattened copy -- confirm against the generated file at the kernel path.
triton_poi_fused_cat_29 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor

from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[8388608], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_cat_29(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 8028160
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x2 = xindex
    x0 = xindex % 62720
    x1 = (xindex // 62720)
    tmp0 = tl.load(in_ptr0 + (x2), None).to(tl.float32)
    tl.store(out_ptr0 + (x0 + (125440*x1) + tl.zeros([XBLOCK], tl.int32)), tmp0, None)


def get_args():
    arg_0 = rand_strided((128, 20, 56, 56), (62720, 3136, 56, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 20, 56, 56), (125440, 3136, 56, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused_cat_29.run(*args, 8028160, grid=grid(8028160), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused_cat_29.benchmark_all_configs(*args, 8028160, grid=grid(8028160))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/e3/ce3edr4giy7ygyubb445dsh6wavenlh7yhdxr5odchwzb5blm45m.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => convert_element_type_21, var_mean_5
#
# Looped reduction producing 520 partial sums of an fp16 input, accumulated in
# fp32. The x index is split as x0 = xindex % 40 and x1 = xindex // 40
# (520 = 40 * 13); each x position covers rnumel = 30878 consecutive values of
# a flattened index  f = r + 30878*x1, read at
#   3136*x0 + 125440*((f // 3136) % 128) + (f % 3136)
# - 13 * 30878 = 401414 exceeds 401408, so the chunks overshoot the range of
#   f; the `f < 401408` predicate (tmp2) is folded into the load mask and the
#   following tl.where so that the overshoot contributes 0.
# - The `tl.where(rmask & xmask, ...)` keeps out-of-range lanes from changing
#   the accumulator; `_tmp6` is collapsed with tl.sum only after the loop and
#   stored at `out_ptr0 + xindex`.
# - Only arguments 0 and 1 are declared divisible by 16 in the config
#   (xnumel = 520 and rnumel = 30878 are not multiples of 16).
# NOTE(review): the op name suggests this is the first stage of a batch-norm
# mean computation over a (128, 40, 56, 56) tensor -- confirm against the
# caller.
# NOTE(review): indentation/blank lines inside the string (including the extent
# of the `for` body) were restored from a flattened copy -- confirm against the
# generated file at the kernel path.
triton_red_fused__native_batch_norm_legit_functional_30 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor

from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@reduction(
    size_hints=[1024, 32768],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32', 3: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_30(in_ptr0, out_ptr0, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
    xnumel = 520
    rnumel = 30878
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rbase = tl.arange(0, RBLOCK)[None, :]
    x1 = (xindex // 40)
    x0 = xindex % 40
    _tmp6 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
    x3 = xindex
    for roffset in range(0, rnumel, RBLOCK):
        rindex = roffset + rbase
        rmask = rindex < rnumel
        r2 = rindex
        tmp0 = r2 + (30878*x1)
        tmp1 = 401408
        tmp2 = tmp0 < tmp1
        tmp3 = tl.load(in_ptr0 + ((3136*x0) + (125440*(((r2 + (30878*x1)) // 3136) % 128)) + ((r2 + (30878*x1)) % 3136) + tl.zeros([XBLOCK, RBLOCK], tl.int32)), rmask & tmp2 & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp4 = tmp3.to(tl.float32)
        tmp5 = tl.where(tmp2, tmp4, 0)
        _tmp6 = tl.where(rmask & xmask, _tmp6 + tmp5, _tmp6)
    tmp6 = tl.sum(_tmp6, 1)[:, None]
    tl.store(out_ptr0 + x3, tmp6, xmask)


def get_args():
    arg_0 = rand_strided((128, 40, 56, 56), (125440, 3136, 56, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((1, 40, 1, 1, 13), (520, 1, 520, 520, 40), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_red_fused__native_batch_norm_legit_functional_30.run(*args, 520, 30878, grid=grid(520), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_red_fused__native_batch_norm_legit_functional_30.benchmark_all_configs(*args, 520, 30878, grid=grid(520))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/5l/c5ldagjb25qlt4w6hnm2s5f7x7ddcw2evdosdg3pvhtpoduxelhs.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => add_28, convert_element_type_21, mul_36, mul_37, var_mean_5
#
# Persistent reduction over 40 rows of 13 values each (RBLOCK = 16, masked
# down to rnumel = 13).  For every row x the kernel:
#   * sums in_ptr0[x + 40*r] for r in [0, 13) and divides the sum by the
#     constant 401408.0, storing the quotient to in_out_ptr0[x];
#   * stores quotient * 0.1 + in_ptr1[x] * 0.9 to out_ptr0[x].
# in_out_ptr0 is listed as mutated but is only written, never read, here.
# NOTE(review): the 13 inputs per row are presumably partial sums produced by
# the preceding looped reduction, and the 0.1 / 0.9 blend looks like a
# running-mean momentum update -- confirm against the graph.
# The embedded source also defines get_args / call / benchmark_all_configs and
# a __main__ section that times the kernel with do_bench and prints GB/s.
triton_per_fused__native_batch_norm_legit_functional_31 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import persistent_reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@persistent_reduction(
    size_hints=[64, 16],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*fp32', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: 'i32', 5: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3), equal_to_1=())]}
)
@triton.jit
def triton_per_fused__native_batch_norm_legit_functional_31(in_out_ptr0, in_ptr0, in_ptr1, out_ptr0, xnumel, rnumel, XBLOCK : tl.constexpr):
    xnumel = 40
    rnumel = 13
    RBLOCK: tl.constexpr = 16
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rindex = tl.arange(0, RBLOCK)[None, :]
    rmask = rindex < rnumel
    r1 = rindex
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0 + (40*r1)), rmask & xmask, other=0)
    tmp8 = tl.load(in_ptr1 + (x0), xmask)
    tmp2 = tl.where(rmask & xmask, tmp0, 0)
    tmp3 = tl.sum(tmp2, 1)[:, None]
    tmp4 = 401408.0
    tmp5 = tmp3 / tmp4
    tmp6 = 0.1
    tmp7 = tmp5 * tmp6
    tmp9 = 0.9
    tmp10 = tmp8 * tmp9
    tmp11 = tmp7 + tmp10
    tl.store(in_out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp5, xmask)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp11, xmask)


def get_args():
    arg_0 = rand_strided((1, 40, 1, 1), (40, 1, 40, 40), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((1, 40, 1, 1, 13), (520, 1, 520, 520, 40), device='cuda:0', dtype=torch.float32)
    arg_2 = rand_strided((40,), (1,), device='cuda:0', dtype=torch.float32)
    arg_3 = rand_strided((40,), (1,), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1, arg_2, arg_3,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_per_fused__native_batch_norm_legit_functional_31.run(*args, 40, 13, grid=grid(40), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_per_fused__native_batch_norm_legit_functional_31.benchmark_all_configs(*args, 40, 13, grid=grid(40))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/ko/ckomyuegfrpkii72qwrquhfy6eh53az7jeqam3ghl4ajcmcw2h3l.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => convert_element_type_21, var_mean_5
#
# Looped reduction with xnumel = 520 output rows and rnumel = 30878 elements
# per row.  Each row index is split into x0 = xindex % 40 and
# x1 = xindex // 40.  For the flat position p = r + 30878*x1 the kernel reads
# in_ptr0[3136*x0 + 125440*((p // 3136) % 128) + p % 3136] (fp16 per the
# signature, used as fp32), subtracts in_ptr1[x0], squares the difference and
# accumulates it; the row total goes to out_ptr0[xindex].
# Positions with p >= 401408 are masked out (tmp2): 13 * 30878 = 401414 is
# slightly more than 401408, so the last chunk is only partially filled.
# NOTE(review): this appears to be the first stage of a split
# sum-of-squared-deviations (variance) reduction with in_ptr1 holding the
# per-channel mean -- confirm against the graph.
# The embedded source also defines get_args / call / benchmark_all_configs and
# a __main__ section that times the kernel with do_bench and prints GB/s.
triton_red_fused__native_batch_norm_legit_functional_32 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@reduction(
    size_hints=[1024, 32768],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*fp16', 1: '*fp32', 2: '*fp32', 3: 'i32', 4: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_32(in_ptr0, in_ptr1, out_ptr0, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
    xnumel = 520
    rnumel = 30878
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rbase = tl.arange(0, RBLOCK)[None, :]
    x1 = (xindex // 40)
    x0 = xindex % 40
    _tmp9 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
    x3 = xindex
    for roffset in range(0, rnumel, RBLOCK):
        rindex = roffset + rbase
        rmask = rindex < rnumel
        r2 = rindex
        tmp0 = r2 + (30878*x1)
        tmp1 = 401408
        tmp2 = tmp0 < tmp1
        tmp3 = tl.load(in_ptr0 + ((3136*x0) + (125440*(((r2 + (30878*x1)) // 3136) % 128)) + ((r2 + (30878*x1)) % 3136) + tl.zeros([XBLOCK, RBLOCK], tl.int32)), rmask & tmp2 & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp4 = tmp3.to(tl.float32)
        tmp5 = tl.load(in_ptr1 + (x0 + tl.zeros([XBLOCK, RBLOCK], tl.int32)), rmask & tmp2 & xmask, eviction_policy='evict_last', other=0)
        tmp6 = tmp4 - tmp5
        tmp7 = tmp6 * tmp6
        tmp8 = tl.where(tmp2, tmp7, 0)
        _tmp9 = tl.where(rmask & xmask, _tmp9 + tmp8, _tmp9)
    tmp9 = tl.sum(_tmp9, 1)[:, None]
    tl.store(out_ptr0 + x3, tmp9, xmask)


def get_args():
    arg_0 = rand_strided((128, 40, 56, 56), (125440, 3136, 56, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((1, 40, 1, 1), (40, 1, 40, 40), device='cuda:0', dtype=torch.float32)
    arg_2 = rand_strided((1, 40, 1, 1, 13), (520, 1, 520, 520, 40), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1, arg_2,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_red_fused__native_batch_norm_legit_functional_32.run(*args, 520, 30878, grid=grid(520), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_red_fused__native_batch_norm_legit_functional_32.benchmark_all_configs(*args, 520, 30878, grid=grid(520))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/5p/c5pcmnlklk6a6ee2vm3ta2iounjyiftuynwfzbts7sayeuto4jvn.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => add_27, add_29, convert_element_type_21, mul_38, mul_39, mul_40, rsqrt_5, squeeze_16, var_mean_5
#
# Persistent reduction over 40 rows of 13 values each (RBLOCK = 16, masked
# down to rnumel = 13).  For every row x the kernel:
#   * sums in_ptr0[x + 40*r] for r in [0, 13) into s and stores the raw sum
#     s to out_ptr0[x];
#   * computes v = s / 401408.0 and stores rsqrt(v + 1e-05) to out_ptr1[x];
#   * stores v * 1.0000024912370735 * 0.1 + in_ptr1[x] * 0.9 to out_ptr2[x].
# NOTE(review): 1.0000024912370735 is numerically 401408 / 401407, which looks
# like an unbiased-variance correction feeding a running-variance momentum
# update -- confirm against the graph.
# The embedded source also defines get_args / call / benchmark_all_configs and
# a __main__ section that times the kernel with do_bench and prints GB/s.
triton_per_fused__native_batch_norm_legit_functional_33 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import persistent_reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@persistent_reduction(
    size_hints=[64, 16],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*fp32', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: 'i32', 6: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4), equal_to_1=())]}
)
@triton.jit
def triton_per_fused__native_batch_norm_legit_functional_33(in_ptr0, in_ptr1, out_ptr0, out_ptr1, out_ptr2, xnumel, rnumel, XBLOCK : tl.constexpr):
    xnumel = 40
    rnumel = 13
    RBLOCK: tl.constexpr = 16
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rindex = tl.arange(0, RBLOCK)[None, :]
    rmask = rindex < rnumel
    r1 = rindex
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0 + (40*r1)), rmask & xmask, other=0)
    tmp13 = tl.load(in_ptr1 + (x0), xmask)
    tmp2 = tl.where(rmask & xmask, tmp0, 0)
    tmp3 = tl.sum(tmp2, 1)[:, None]
    tmp4 = 401408.0
    tmp5 = tmp3 / tmp4
    tmp6 = 1e-05
    tmp7 = tmp5 + tmp6
    tmp8 = tl.math.rsqrt(tmp7)
    tmp9 = 1.0000024912370735
    tmp10 = tmp5 * tmp9
    tmp11 = 0.1
    tmp12 = tmp10 * tmp11
    tmp14 = 0.9
    tmp15 = tmp13 * tmp14
    tmp16 = tmp12 + tmp15
    tl.store(out_ptr1 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp8, xmask)
    tl.store(out_ptr2 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp16, xmask)
    tl.store(out_ptr0 + x0, tmp3, xmask)


def get_args():
    arg_0 = rand_strided((1, 40, 1, 1, 13), (520, 1, 520, 520, 40), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((40,), (1,), device='cuda:0', dtype=torch.float32)
    arg_2 = rand_strided((1, 40, 1, 1), (40, 1, 40, 40), device='cuda:0', dtype=torch.float32)
    arg_3 = rand_strided((40,), (1,), device='cuda:0', dtype=torch.float32)
    arg_4 = rand_strided((40,), (1,), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1, arg_2, arg_3, arg_4,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_per_fused__native_batch_norm_legit_functional_33.run(*args, 40, 13, grid=grid(40), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_per_fused__native_batch_norm_legit_functional_33.benchmark_all_configs(*args, 40, 13, grid=grid(40))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/pf/cpfbdly4ax5zztymk42pnezqf7ys2im4sb437kjrqjpf4x2jbotn.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => add_27, add_30, convert_element_type_21, convert_element_type_22, mul_35, mul_41, rsqrt_5, sub_5, var_mean_5
#
# Pointwise kernel over xnumel = 16056320 elements.  With
# c = (xindex // 3136) % 40 it computes, per element,
#   (in_ptr0[i] - in_ptr1[c]) * rsqrt(in_ptr2[c] / 401408.0 + 1e-05)
#       * in_ptr3[c] + in_ptr4[c]
# and stores the result to out_ptr0[i].  in_ptr0 and out_ptr0 are fp16
# pointers per the signature; the arithmetic is done on fp32 values.
# All loads and the store pass mask=None: xmask is computed but unused.
# NOTE(review): this reads as the batch-norm normalize + affine step with
# in_ptr1 = mean, in_ptr2 = summed squared deviations, in_ptr3 / in_ptr4 =
# weight / bias -- confirm against the graph.
# The embedded source also defines get_args / call / benchmark_all_configs and
# a __main__ section that times the kernel with do_bench and prints GB/s.
triton_poi_fused__native_batch_norm_legit_functional_34 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[16777216], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp16', 6: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6), equal_to_1=())]})
@triton.jit
def triton_poi_fused__native_batch_norm_legit_functional_34(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 16056320
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x3 = xindex
    x1 = (xindex // 3136) % 40
    tmp0 = tl.load(in_ptr0 + (x3), None).to(tl.float32)
    tmp2 = tl.load(in_ptr1 + (x1), None)
    tmp4 = tl.load(in_ptr2 + (x1), None)
    tmp11 = tl.load(in_ptr3 + (x1), None)
    tmp13 = tl.load(in_ptr4 + (x1), None)
    tmp1 = tmp0.to(tl.float32)
    tmp3 = tmp1 - tmp2
    tmp5 = 401408.0
    tmp6 = tmp4 / tmp5
    tmp7 = 1e-05
    tmp8 = tmp6 + tmp7
    tmp9 = tl.math.rsqrt(tmp8)
    tmp10 = tmp3 * tmp9
    tmp12 = tmp10 * tmp11
    tmp14 = tmp12 + tmp13
    tmp15 = tmp14.to(tl.float32)
    tl.store(out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp15, None)


def get_args():
    arg_0 = rand_strided((128, 40, 56, 56), (125440, 3136, 56, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((1, 40, 1, 1), (40, 1, 40, 40), device='cuda:0', dtype=torch.float32)
    arg_2 = rand_strided((1, 40, 1, 1), (40, 1, 40, 40), device='cuda:0', dtype=torch.float32)
    arg_3 = rand_strided((40,), (1,), device='cuda:0', dtype=torch.float32)
    arg_4 = rand_strided((40,), (1,), device='cuda:0', dtype=torch.float32)
    arg_5 = rand_strided((128, 40, 56, 56), (125440, 3136, 56, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__native_batch_norm_legit_functional_34.run(*args, 16056320, grid=grid(16056320), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__native_batch_norm_legit_functional_34.benchmark_all_configs(*args, 16056320, grid=grid(16056320))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/jt/cjtdu5nq2b37l5fyi5c7vdk3m4aqhqeqbsw3e5ifo4u74lo5bgaq.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_23
#
# Pointwise copy of xnumel = 1200 elements from in_ptr0 (an fp32 pointer per
# the signature) to out_ptr0 (an fp16 pointer per the signature).  Loads and
# the store are masked with xindex < xnumel.
# NOTE(review): the value is only cast with .to(tl.float32) before the store,
# so the narrowing to fp16 presumably happens implicitly in tl.store --
# confirm against Triton's store semantics.
# The embedded source also defines get_args / call / benchmark_all_configs and
# a __main__ section that times the kernel with do_bench and prints GB/s.
triton_poi_fused__to_copy_35 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[2048], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_35(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 1200
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)


def get_args():
    arg_0 = rand_strided((60, 20, 1, 1), (20, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((60, 20, 1, 1), (20, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_35.run(*args, 1200, grid=grid(1200), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_35.benchmark_all_configs(*args, 1200, grid=grid(1200))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/wj/cwjg3r37ehsowju7qerrsqlothtrltxyyuq5mhnulbp7zor3ocw5.py
# Original ATen: aten.cat
# aten.cat => cat_3
#
# Pointwise strided copy of xnumel = 24084480 fp16 elements.  The flat input
# index is split into x0 = xindex % 188160 and x1 = xindex // 188160; element
# in_ptr0[xindex] is written to out_ptr0[x0 + 376320*x1], i.e. the source is
# read densely while the destination advances 376320 elements per x1 step.
# The load and the store pass mask=None: xmask is computed but unused.
# NOTE(review): get_args models the destination as a (128, 60, 56, 56) view
# with batch stride 376320, so this presumably fills one half of a
# 120-channel concatenation buffer -- confirm against the caller.
# The embedded source also defines get_args / call / benchmark_all_configs and
# a __main__ section that times the kernel with do_bench and prints GB/s.
triton_poi_fused_cat_36 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[33554432], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_cat_36(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 24084480
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x2 = xindex
    x0 = xindex % 188160
    x1 = (xindex // 188160)
    tmp0 = tl.load(in_ptr0 + (x2), None).to(tl.float32)
    tl.store(out_ptr0 + (x0 + (376320*x1) + tl.zeros([XBLOCK], tl.int32)), tmp0, None)


def get_args():
    arg_0 = rand_strided((128, 60, 56, 56), (188160, 3136, 56, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 60, 56, 56), (376320, 3136, 56, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused_cat_36.run(*args, 24084480, grid=grid(24084480), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused_cat_36.benchmark_all_configs(*args, 24084480, grid=grid(24084480))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/2c/c2cmgfhoarotsw2klaplncsqdxdxrbc3je4bm4idg4672cm2hbzh.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => convert_element_type_25, var_mean_6
#
# Looped reduction with xnumel = 480 output rows and rnumel = 100352 elements
# per row.  Each row index is split into x0 = xindex % 120 and
# x1 = xindex // 120.  For each r the kernel reads
# in_ptr0[3136*x0 + 376320*(r // 3136) + 12042240*x1 + r % 3136] (fp16 per the
# signature, used as fp32) and accumulates it; the row total is stored to
# out_ptr0[xindex].
# Note 100352 = 32 * 3136 and 12042240 = 32 * 376320, so each x1 value covers
# 32 consecutive steps of the 376320-element stride.
# NOTE(review): this appears to be the first stage of a split per-channel sum
# (for the batch-norm mean), one partial sum per (chunk, channel) -- confirm
# against the graph.
# The embedded source also defines get_args / call / benchmark_all_configs and
# a __main__ section that times the kernel with do_bench and prints GB/s.
triton_red_fused__native_batch_norm_legit_functional_37 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@reduction(
    size_hints=[512, 131072],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32', 3: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_37(in_ptr0, out_ptr0, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
    xnumel = 480
    rnumel = 100352
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rbase = tl.arange(0, RBLOCK)[None, :]
    x0 = xindex % 120
    x1 = (xindex // 120)
    _tmp2 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
    x3 = xindex
    for roffset in range(0, rnumel, RBLOCK):
        rindex = roffset + rbase
        rmask = rindex < rnumel
        r2 = rindex
        tmp0 = tl.load(in_ptr0 + ((3136*x0) + (376320*(r2 // 3136)) + (12042240*x1) + (r2 % 3136)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp1 = tmp0.to(tl.float32)
        _tmp2 = tl.where(rmask & xmask, _tmp2 + tmp1, _tmp2)
    tmp2 = tl.sum(_tmp2, 1)[:, None]
    tl.store(out_ptr0 + x3, tmp2, xmask)


def get_args():
    arg_0 = rand_strided((128, 120, 56, 56), (376320, 3136, 56, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((1, 120, 1, 1, 4), (480, 1, 480, 480, 120), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_red_fused__native_batch_norm_legit_functional_37.run(*args, 480, 100352, grid=grid(480), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_red_fused__native_batch_norm_legit_functional_37.benchmark_all_configs(*args, 480, 100352, grid=grid(480))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/xq/cxq23fscob3jbjdcsao7sicyvado5nsvykscbw7syq3dh73swjgu.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => add_33, convert_element_type_25, mul_43, mul_44, var_mean_6
#
# Persistent reduction over 120 rows of 4 values each (RBLOCK = 4 = rnumel).
# For every row x the kernel:
#   * sums in_ptr0[x + 120*r] for r in [0, 4) and divides the sum by the
#     constant 401408.0, storing the quotient to in_out_ptr0[x];
#   * stores quotient * 0.1 + in_ptr1[x] * 0.9 to out_ptr0[x].
# in_out_ptr0 is listed as mutated but is only written, never read, here.
# NOTE(review): the 4 inputs per row are presumably partial sums produced by
# the preceding looped reduction, and the 0.1 / 0.9 blend looks like a
# running-mean momentum update -- confirm against the graph.
# The embedded source also defines get_args / call / benchmark_all_configs and
# a __main__ section that times the kernel with do_bench and prints GB/s.
triton_per_fused__native_batch_norm_legit_functional_38 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import persistent_reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@persistent_reduction(
    size_hints=[128, 4],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*fp32', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: 'i32', 5: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3), equal_to_1=())]}
)
@triton.jit
def triton_per_fused__native_batch_norm_legit_functional_38(in_out_ptr0, in_ptr0, in_ptr1, out_ptr0, xnumel, rnumel, XBLOCK : tl.constexpr):
    xnumel = 120
    rnumel = 4
    RBLOCK: tl.constexpr = 4
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rindex = tl.arange(0, RBLOCK)[None, :]
    rmask = rindex < rnumel
    r1 = rindex
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0 + (120*r1)), rmask & xmask, other=0)
    tmp8 = tl.load(in_ptr1 + (x0), xmask)
    tmp2 = tl.where(rmask & xmask, tmp0, 0)
    tmp3 = tl.sum(tmp2, 1)[:, None]
    tmp4 = 401408.0
    tmp5 = tmp3 / tmp4
    tmp6 = 0.1
    tmp7 = tmp5 * tmp6
    tmp9 = 0.9
    tmp10 = tmp8 * tmp9
    tmp11 = tmp7 + tmp10
    tl.store(in_out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp5, xmask)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp11, xmask)


def get_args():
    arg_0 = rand_strided((1, 120, 1, 1), (120, 1, 120, 120), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((1, 120, 1, 1, 4), (480, 1, 480, 480, 120), device='cuda:0', dtype=torch.float32)
    arg_2 = rand_strided((120,), (1,), device='cuda:0', dtype=torch.float32)
    arg_3 = rand_strided((120,), (1,), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1, arg_2, arg_3,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_per_fused__native_batch_norm_legit_functional_38.run(*args, 120, 4, grid=grid(120), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_per_fused__native_batch_norm_legit_functional_38.benchmark_all_configs(*args, 120, 4, grid=grid(120))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/wg/cwgir4s67j2t35f66rtqguw4skzrnzaazh3rvukzvjln6bwfquwz.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => convert_element_type_25, var_mean_6
#
# Looped reduction with xnumel = 480 output rows and rnumel = 100352 elements
# per row.  Each row index is split into x0 = xindex % 120 and
# x1 = xindex // 120.  in_ptr1[x0] is loaded once before the loop; for each r
# the kernel reads
# in_ptr0[3136*x0 + 376320*(r // 3136) + 12042240*x1 + r % 3136] (fp16 per the
# signature, used as fp32), subtracts in_ptr1[x0], squares the difference and
# accumulates it.  The row total is stored to out_ptr0[xindex].
# NOTE(review): this appears to be the first stage of a split
# sum-of-squared-deviations (variance) reduction with in_ptr1 holding the
# per-channel mean -- confirm against the graph.
# The embedded source also defines get_args / call / benchmark_all_configs and
# a __main__ section that times the kernel with do_bench and prints GB/s.
triton_red_fused__native_batch_norm_legit_functional_39 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@reduction(
    size_hints=[512, 131072],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*fp16', 1: '*fp32', 2: '*fp32', 3: 'i32', 4: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_39(in_ptr0, in_ptr1, out_ptr0, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
    xnumel = 480
    rnumel = 100352
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rbase = tl.arange(0, RBLOCK)[None, :]
    x0 = xindex % 120
    x1 = (xindex // 120)
    tmp2 = tl.load(in_ptr1 + (x0), xmask)
    _tmp5 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
    x3 = xindex
    for roffset in range(0, rnumel, RBLOCK):
        rindex = roffset + rbase
        rmask = rindex < rnumel
        r2 = rindex
        tmp0 = tl.load(in_ptr0 + ((3136*x0) + (376320*(r2 // 3136)) + (12042240*x1) + (r2 % 3136)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp1 = tmp0.to(tl.float32)
        tmp3 = tmp1 - tmp2
        tmp4 = tmp3 * tmp3
        _tmp5 = tl.where(rmask & xmask, _tmp5 + tmp4, _tmp5)
    tmp5 = tl.sum(_tmp5, 1)[:, None]
    tl.store(out_ptr0 + x3, tmp5, xmask)


def get_args():
    arg_0 = rand_strided((128, 120, 56, 56), (376320, 3136, 56, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((1, 120, 1, 1), (120, 1, 120, 120), device='cuda:0', dtype=torch.float32)
    arg_2 = rand_strided((1, 120, 1, 1, 4), (480, 1, 480, 480, 120), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1, arg_2,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_red_fused__native_batch_norm_legit_functional_39.run(*args, 480, 100352, grid=grid(480), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_red_fused__native_batch_norm_legit_functional_39.benchmark_all_configs(*args, 480, 100352, grid=grid(480))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/bd/cbdjjrt2eltvde4ay5q7ftrif7dzdrcgxsonoef3wv6a6n446fg5.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => add_32, add_34, convert_element_type_25, mul_45, mul_46, mul_47, rsqrt_6, squeeze_19, var_mean_6
#
# Persistent reduction over 120 rows of 4 values each (RBLOCK = 4 = rnumel).
# For every row x the kernel:
#   * sums in_ptr0[x + 120*r] for r in [0, 4) into s and stores the raw sum
#     s to out_ptr0[x];
#   * computes v = s / 401408.0 and stores rsqrt(v + 1e-05) to out_ptr1[x];
#   * stores v * 1.0000024912370735 * 0.1 + in_ptr1[x] * 0.9 to out_ptr2[x].
# NOTE(review): 1.0000024912370735 is numerically 401408 / 401407, which looks
# like an unbiased-variance correction feeding a running-variance momentum
# update -- confirm against the graph.
# The embedded source also defines get_args / call / benchmark_all_configs and
# a __main__ section that times the kernel with do_bench and prints GB/s.
triton_per_fused__native_batch_norm_legit_functional_40 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import persistent_reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@persistent_reduction(
    size_hints=[128, 4],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*fp32', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: 'i32', 6: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4), equal_to_1=())]}
)
@triton.jit
def triton_per_fused__native_batch_norm_legit_functional_40(in_ptr0, in_ptr1, out_ptr0, out_ptr1, out_ptr2, xnumel, rnumel, XBLOCK : tl.constexpr):
    xnumel = 120
    rnumel = 4
    RBLOCK: tl.constexpr = 4
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rindex = tl.arange(0, RBLOCK)[None, :]
    rmask = rindex < rnumel
    r1 = rindex
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0 + (120*r1)), rmask & xmask, other=0)
    tmp13 = tl.load(in_ptr1 + (x0), xmask)
    tmp2 = tl.where(rmask & xmask, tmp0, 0)
    tmp3 = tl.sum(tmp2, 1)[:, None]
    tmp4 = 401408.0
    tmp5 = tmp3 / tmp4
    tmp6 = 1e-05
    tmp7 = tmp5 + tmp6
    tmp8 = tl.math.rsqrt(tmp7)
    tmp9 = 1.0000024912370735
    tmp10 = tmp5 * tmp9
    tmp11 = 0.1
    tmp12 = tmp10 * tmp11
    tmp14 = 0.9
    tmp15 = tmp13 * tmp14
    tmp16 = tmp12 + tmp15
    tl.store(out_ptr1 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp8, xmask)
    tl.store(out_ptr2 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp16, xmask)
    tl.store(out_ptr0 + x0, tmp3, xmask)


def get_args():
    arg_0 = rand_strided((1, 120, 1, 1, 4), (480, 1, 480, 480, 120), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((120,), (1,), device='cuda:0', dtype=torch.float32)
    arg_2 = rand_strided((1, 120, 1, 1), (120, 1, 120, 120), device='cuda:0', dtype=torch.float32)
    arg_3 = rand_strided((120,), (1,), device='cuda:0', dtype=torch.float32)
    arg_4 = rand_strided((120,), (1,), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1, arg_2, arg_3, arg_4,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_per_fused__native_batch_norm_legit_functional_40.run(*args, 120, 4, grid=grid(120), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_per_fused__native_batch_norm_legit_functional_40.benchmark_all_configs(*args, 120, 4, grid=grid(120))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/ff/cffbiktpj5b36houxciovjfe3g7ji5w5ibbvgjpjdr6m6dxnqiod.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.relu
# aten._native_batch_norm_legit_functional => add_32, add_35, convert_element_type_25, convert_element_type_26, mul_42, mul_48, rsqrt_6, sub_6, var_mean_6
# aten.relu => relu_4
#
# Pointwise kernel over 48168960 elements. For flat index i and per-channel
# index c = (i // 3136) % 120 it computes
#     y = (in_ptr0[i] - in_ptr1[c]) * rsqrt(in_ptr2[c] / 401408.0 + 1e-05) * in_ptr3[c] + in_ptr4[c]
# and stores y with negative values replaced by 0 to out_ptr0[i].
# xmask is computed but unused: every load/store passes mask=None
# (presumably because xnumel divides evenly into the launch blocks -- TODO confirm).
# The embedded source also carries a stand-alone benchmark harness
# (get_args / call / benchmark_all_configs / __main__).
triton_poi_fused__native_batch_norm_legit_functional_relu_41 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[67108864], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp16', 6: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6), equal_to_1=())]})
@triton.jit
def triton_poi_fused__native_batch_norm_legit_functional_relu_41(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 48168960
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x3 = xindex
    x1 = (xindex // 3136) % 120
    tmp0 = tl.load(in_ptr0 + (x3), None).to(tl.float32)
    tmp2 = tl.load(in_ptr1 + (x1), None)
    tmp4 = tl.load(in_ptr2 + (x1), None)
    tmp11 = tl.load(in_ptr3 + (x1), None)
    tmp13 = tl.load(in_ptr4 + (x1), None)
    tmp1 = tmp0.to(tl.float32)
    tmp3 = tmp1 - tmp2
    tmp5 = 401408.0
    tmp6 = tmp4 / tmp5
    tmp7 = 1e-05
    tmp8 = tmp6 + tmp7
    tmp9 = tl.math.rsqrt(tmp8)
    tmp10 = tmp3 * tmp9
    tmp12 = tmp10 * tmp11
    tmp14 = tmp12 + tmp13
    tmp15 = tmp14.to(tl.float32)
    tmp16 = tl.where(0 != 0, 0, tl.where(0 > tmp15, 0, tmp15))
    tl.store(out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp16, None)


def get_args():
    arg_0 = rand_strided((128, 120, 56, 56), (376320, 3136, 56, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((1, 120, 1, 1), (120, 1, 120, 120), device='cuda:0', dtype=torch.float32)
    arg_2 = rand_strided((1, 120, 1, 1), (120, 1, 120, 120), device='cuda:0', dtype=torch.float32)
    arg_3 = rand_strided((120,), (1,), device='cuda:0', dtype=torch.float32)
    arg_4 = rand_strided((120,), (1,), device='cuda:0', dtype=torch.float32)
    arg_5 = rand_strided((128, 120, 56, 56), (376320, 3136, 56, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__native_batch_norm_legit_functional_relu_41.run(*args, 48168960, grid=grid(48168960), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__native_batch_norm_legit_functional_relu_41.benchmark_all_configs(*args, 48168960, grid=grid(48168960))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/6y/c6ynio5pu6llppeohanybeuevixt5rgpdgxw24ylgggmbgzjacrt.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_27
#
# Pointwise copy of 1080 elements from in_ptr0 ('*fp32' in the signature) to
# out_ptr0 ('*fp16' in the signature). Loads and stores are masked with
# xindex < xnumel. The in-kernel .to(tl.float32) leaves the value in fp32;
# the narrowing to fp16 presumably happens at tl.store via the pointer
# type -- TODO confirm against the Triton version in use.
# The embedded source also carries a stand-alone benchmark harness.
triton_poi_fused__to_copy_42 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[2048], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_42(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 1080
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)


def get_args():
    arg_0 = rand_strided((120, 1, 3, 3), (9, 9, 3, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((120, 1, 3, 3), (9, 9, 3, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_42.run(*args, 1080, grid=grid(1080), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_42.benchmark_all_configs(*args, 1080, grid=grid(1080))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/p3/cp3ysrijswutvs72wkv4ypuaradvydgclkx6swvsdiolmqtydjck.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.relu, aten.threshold_backward
# aten._native_batch_norm_legit_functional => add_37, add_40, convert_element_type_28, convert_element_type_29, mul_49, mul_55, rsqrt_7, sub_7, var_mean_7
# aten.relu => relu_5
# aten.threshold_backward => le_1
#
# Pointwise kernel over 48168960 elements. For flat index i and per-channel
# index c = (i // 3136) % 120 it computes
#     y = (in_ptr0[i] - in_ptr1[c]) * rsqrt(in_ptr2[c] / 401408.0 + 1e-05) * in_ptr3[c] + in_ptr4[c]
# then r = y with negative values replaced by 0. It writes r to out_ptr0[i]
# and the boolean (r <= 0.0) to out_ptr1[i] ('*i1' in the signature).
# xmask is computed but unused: every load/store passes mask=None.
# The embedded source also carries a stand-alone benchmark harness.
triton_poi_fused__native_batch_norm_legit_functional_relu_threshold_backward_43 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[67108864], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp16', 6: '*i1', 7: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7), equal_to_1=())]})
@triton.jit
def triton_poi_fused__native_batch_norm_legit_functional_relu_threshold_backward_43(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, out_ptr0, out_ptr1, xnumel, XBLOCK : tl.constexpr):
    xnumel = 48168960
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x3 = xindex
    x1 = (xindex // 3136) % 120
    tmp0 = tl.load(in_ptr0 + (x3), None).to(tl.float32)
    tmp2 = tl.load(in_ptr1 + (x1), None)
    tmp4 = tl.load(in_ptr2 + (x1), None)
    tmp11 = tl.load(in_ptr3 + (x1), None)
    tmp13 = tl.load(in_ptr4 + (x1), None)
    tmp1 = tmp0.to(tl.float32)
    tmp3 = tmp1 - tmp2
    tmp5 = 401408.0
    tmp6 = tmp4 / tmp5
    tmp7 = 1e-05
    tmp8 = tmp6 + tmp7
    tmp9 = tl.math.rsqrt(tmp8)
    tmp10 = tmp3 * tmp9
    tmp12 = tmp10 * tmp11
    tmp14 = tmp12 + tmp13
    tmp15 = tmp14.to(tl.float32)
    tmp16 = tl.where(0 != 0, 0, tl.where(0 > tmp15, 0, tmp15))
    tmp17 = 0.0
    tmp18 = tmp16 <= tmp17
    tl.store(out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp16, None)
    tl.store(out_ptr1 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp18, None)


def get_args():
    arg_0 = rand_strided((128, 120, 56, 56), (376320, 3136, 56, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((1, 120, 1, 1), (120, 1, 120, 120), device='cuda:0', dtype=torch.float32)
    arg_2 = rand_strided((1, 120, 1, 1), (120, 1, 120, 120), device='cuda:0', dtype=torch.float32)
    arg_3 = rand_strided((120,), (1,), device='cuda:0', dtype=torch.float32)
    arg_4 = rand_strided((120,), (1,), device='cuda:0', dtype=torch.float32)
    arg_5 = rand_strided((128, 120, 56, 56), (376320, 3136, 56, 1), device='cuda:0', dtype=torch.float16)
    arg_6 = rand_strided((128, 120, 56, 56), (376320, 3136, 56, 1), device='cuda:0', dtype=torch.bool)
    return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__native_batch_norm_legit_functional_relu_threshold_backward_43.run(*args, 48168960, grid=grid(48168960), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__native_batch_norm_legit_functional_relu_threshold_backward_43.benchmark_all_configs(*args, 48168960, grid=grid(48168960))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/pe/cpea2gtzlc2kbhqh3fj4j3lqcnuxq36vpr46nhhxpvtecppidlkn.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_30
#
# Pointwise copy of 1200 elements from in_ptr0 ('*fp32' in the signature) to
# out_ptr0 ('*fp16' in the signature), masked with xindex < xnumel.
# The narrowing to fp16 presumably happens at tl.store via the pointer
# type -- TODO confirm against the Triton version in use.
# The embedded source also carries a stand-alone benchmark harness.
triton_poi_fused__to_copy_44 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[2048], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_44(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 1200
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)


def get_args():
    arg_0 = rand_strided((20, 60, 1, 1), (60, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((20, 60, 1, 1), (60, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_44.run(*args, 1200, grid=grid(1200), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_44.benchmark_all_configs(*args, 1200, grid=grid(1200))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/nm/cnmhyqb22hbicwkw553bytawf6yat3hnc5u573hgyyuw5vi2ohc2.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.add
# aten._native_batch_norm_legit_functional => add_42, add_45, convert_element_type_32, convert_element_type_33, mul_56, mul_62, rsqrt_8, sub_8, var_mean_8
# aten.add => add_46
#
# Pointwise kernel over 16056320 elements. For flat index i and per-channel
# index c = (i // 3136) % 40 it computes
#     y = (in_ptr0[i] - in_ptr1[c]) * rsqrt(in_ptr2[c] / 401408.0 + 1e-05) * in_ptr3[c] + in_ptr4[c]
# and stores y + in_ptr5[i] to out_ptr0[i].
# xmask is computed but unused: every load/store passes mask=None.
# The embedded source also carries a stand-alone benchmark harness.
triton_poi_fused__native_batch_norm_legit_functional_add_45 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[16777216], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp16', 6: '*fp16', 7: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7), equal_to_1=())]})
@triton.jit
def triton_poi_fused__native_batch_norm_legit_functional_add_45(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 16056320
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x3 = xindex
    x1 = (xindex // 3136) % 40
    tmp0 = tl.load(in_ptr0 + (x3), None).to(tl.float32)
    tmp2 = tl.load(in_ptr1 + (x1), None)
    tmp4 = tl.load(in_ptr2 + (x1), None)
    tmp11 = tl.load(in_ptr3 + (x1), None)
    tmp13 = tl.load(in_ptr4 + (x1), None)
    tmp16 = tl.load(in_ptr5 + (x3), None).to(tl.float32)
    tmp1 = tmp0.to(tl.float32)
    tmp3 = tmp1 - tmp2
    tmp5 = 401408.0
    tmp6 = tmp4 / tmp5
    tmp7 = 1e-05
    tmp8 = tmp6 + tmp7
    tmp9 = tl.math.rsqrt(tmp8)
    tmp10 = tmp3 * tmp9
    tmp12 = tmp10 * tmp11
    tmp14 = tmp12 + tmp13
    tmp15 = tmp14.to(tl.float32)
    tmp17 = tmp15 + tmp16
    tl.store(out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp17, None)


def get_args():
    arg_0 = rand_strided((128, 40, 56, 56), (125440, 3136, 56, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((1, 40, 1, 1), (40, 1, 40, 40), device='cuda:0', dtype=torch.float32)
    arg_2 = rand_strided((1, 40, 1, 1), (40, 1, 40, 40), device='cuda:0', dtype=torch.float32)
    arg_3 = rand_strided((40,), (1,), device='cuda:0', dtype=torch.float32)
    arg_4 = rand_strided((40,), (1,), device='cuda:0', dtype=torch.float32)
    arg_5 = rand_strided((128, 40, 56, 56), (125440, 3136, 56, 1), device='cuda:0', dtype=torch.float16)
    arg_6 = rand_strided((128, 40, 56, 56), (125440, 3136, 56, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__native_batch_norm_legit_functional_add_45.run(*args, 16056320, grid=grid(16056320), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__native_batch_norm_legit_functional_add_45.benchmark_all_configs(*args, 16056320, grid=grid(16056320))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/d6/cd6uk6iyi5u7h7amewzycxkyxriny4cknptmukvtlpxi4jwlerzf.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_34
#
# Pointwise copy of 9600 elements from in_ptr0 ('*fp32' in the signature) to
# out_ptr0 ('*fp16' in the signature), masked with xindex < xnumel.
# The narrowing to fp16 presumably happens at tl.store via the pointer
# type -- TODO confirm against the Triton version in use.
# The embedded source also carries a stand-alone benchmark harness.
triton_poi_fused__to_copy_46 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[16384], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_46(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 9600
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)


def get_args():
    arg_0 = rand_strided((240, 40, 1, 1), (40, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((240, 40, 1, 1), (40, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_46.run(*args, 9600, grid=grid(9600), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_46.benchmark_all_configs(*args, 9600, grid=grid(9600))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/p3/cp3dwfqdvlpfdvfvxrfm2lik34mpytuijuqsswm7efzikdczro7d.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => add_48, add_49, add_50, convert_element_type_35, mul_64, mul_65, mul_66, mul_67, mul_68, rsqrt_9, squeeze_28, var_mean_9
#
# Two-pass reduction kernel: xnumel = 240 output rows, rnumel = 401408
# reduced elements per row, read from in_ptr0 at
# r1 + 3136*x0 + 752640*r2 with r1 = r % 3136 and r2 = r // 3136.
#   Pass 1: mean = sum(in_ptr0) / 401408.0
#           in_out_ptr0[x0] = mean   (written only, never read in the kernel)
#           out_ptr0[x0]    = mean * 0.1 + in_ptr1[x0] * 0.9
#   Pass 2: ssd = sum((in_ptr0 - mean)^2)
#           out_ptr1[x0] = ssd
#           out_ptr2[x0] = rsqrt(ssd / 401408.0 + 1e-05)
#           out_ptr3[x0] = (ssd / 401408.0) * 1.0000024912370735 * 0.1 + in_ptr2[x0] * 0.9
# The constant 1.0000024912370735 is numerically 401408 / 401407; it looks
# like an n / (n - 1) correction -- TODO confirm against the ATen decomposition.
# The embedded source also carries a stand-alone benchmark harness; it is the
# only one in this group that passes num_in_out_args=1.
triton_red_fused__native_batch_norm_legit_functional_47 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@reduction(
    size_hints=[256, 524288],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*fp32', 1: '*fp16', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: 'i32', 9: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_47(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, out_ptr0, out_ptr1, out_ptr2, out_ptr3, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
    xnumel = 240
    rnumel = 401408
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rbase = tl.arange(0, RBLOCK)[None, :]
    x0 = xindex
    _tmp2 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
    for roffset in range(0, rnumel, RBLOCK):
        rindex = roffset + rbase
        rmask = rindex < rnumel
        r1 = rindex % 3136
        r2 = (rindex // 3136)
        tmp0 = tl.load(in_ptr0 + (r1 + (3136*x0) + (752640*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp1 = tmp0.to(tl.float32)
        _tmp2 = tl.where(rmask & xmask, _tmp2 + tmp1, _tmp2)
    tmp2 = tl.sum(_tmp2, 1)[:, None]
    tmp7 = tl.load(in_ptr1 + (x0), xmask)
    tmp3 = 401408.0
    tmp4 = tmp2 / tmp3
    tmp5 = 0.1
    tmp6 = tmp4 * tmp5
    tmp8 = 0.9
    tmp9 = tmp7 * tmp8
    tmp10 = tmp6 + tmp9
    tl.store(in_out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp4, xmask)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp10, xmask)
    _tmp15 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
    for roffset in range(0, rnumel, RBLOCK):
        rindex = roffset + rbase
        rmask = rindex < rnumel
        r1 = rindex % 3136
        r2 = (rindex // 3136)
        tmp11 = tl.load(in_ptr0 + (r1 + (3136*x0) + (752640*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp12 = tmp11.to(tl.float32)
        tmp13 = tmp12 - tmp4
        tmp14 = tmp13 * tmp13
        _tmp15 = tl.where(rmask & xmask, _tmp15 + tmp14, _tmp15)
    tmp15 = tl.sum(_tmp15, 1)[:, None]
    tl.store(out_ptr1 + x0, tmp15, xmask)
    tmp25 = tl.load(in_ptr2 + (x0), xmask)
    tmp16 = 401408.0
    tmp17 = tmp15 / tmp16
    tmp18 = 1e-05
    tmp19 = tmp17 + tmp18
    tmp20 = tl.math.rsqrt(tmp19)
    tmp21 = 1.0000024912370735
    tmp22 = tmp17 * tmp21
    tmp23 = 0.1
    tmp24 = tmp22 * tmp23
    tmp26 = 0.9
    tmp27 = tmp25 * tmp26
    tmp28 = tmp24 + tmp27
    tl.store(out_ptr2 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp20, xmask)
    tl.store(out_ptr3 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp28, xmask)


def get_args():
    arg_0 = rand_strided((1, 240, 1, 1), (240, 1, 240, 240), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((128, 240, 56, 56), (752640, 3136, 56, 1), device='cuda:0', dtype=torch.float16)
    arg_2 = rand_strided((240,), (1,), device='cuda:0', dtype=torch.float32)
    arg_3 = rand_strided((240,), (1,), device='cuda:0', dtype=torch.float32)
    arg_4 = rand_strided((240,), (1,), device='cuda:0', dtype=torch.float32)
    arg_5 = rand_strided((1, 240, 1, 1), (240, 1, 240, 240), device='cuda:0', dtype=torch.float32)
    arg_6 = rand_strided((240,), (1,), device='cuda:0', dtype=torch.float32)
    arg_7 = rand_strided((240,), (1,), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_red_fused__native_batch_norm_legit_functional_47.run(*args, 240, 401408, grid=grid(240), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_red_fused__native_batch_norm_legit_functional_47.benchmark_all_configs(*args, 240, 401408, grid=grid(240))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/bn/cbnwrcxmnryz6qd2vzr5qnxb2q3vepfvhupqz2awsu4zkworteie.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.add, aten.clone, aten.fill, aten.mul, aten.sigmoid, aten.sub
# aten._native_batch_norm_legit_functional => add_48, add_51, convert_element_type_35, convert_element_type_36, mul_63, mul_69, rsqrt_9, sub_9, var_mean_9
# aten.add => add_379
# aten.clone => clone
# aten.fill => full_like_47
# aten.mul => mul_1107, mul_1108
# aten.sigmoid => sigmoid_111
# aten.sub => sub_313
#
# Pointwise kernel over 96337920 elements. For flat index i and per-channel
# index c = (i // 3136) % 240 it computes
#     y = (in_ptr0[i] - in_ptr1[c]) * rsqrt(in_ptr2[c] / 401408.0 + 1e-05) * in_ptr3[c] + in_ptr4[c]
#     s = sigmoid(y)
# and writes y to out_ptr0[i] and s * (y * (1.0 - s) + 1.0) to out_ptr1[i].
# xmask is computed but unused: every load/store passes mask=None.
# The embedded source also carries a stand-alone benchmark harness.
triton_poi_fused__native_batch_norm_legit_functional_add_clone_fill_mul_sigmoid_sub_48 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[134217728], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp16', 6: '*fp16', 7: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7), equal_to_1=())]})
@triton.jit
def triton_poi_fused__native_batch_norm_legit_functional_add_clone_fill_mul_sigmoid_sub_48(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, out_ptr0, out_ptr1, xnumel, XBLOCK : tl.constexpr):
    xnumel = 96337920
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x3 = xindex
    x1 = (xindex // 3136) % 240
    tmp0 = tl.load(in_ptr0 + (x3), None).to(tl.float32)
    tmp2 = tl.load(in_ptr1 + (x1), None)
    tmp4 = tl.load(in_ptr2 + (x1), None)
    tmp11 = tl.load(in_ptr3 + (x1), None)
    tmp13 = tl.load(in_ptr4 + (x1), None)
    tmp1 = tmp0.to(tl.float32)
    tmp3 = tmp1 - tmp2
    tmp5 = 401408.0
    tmp6 = tmp4 / tmp5
    tmp7 = 1e-05
    tmp8 = tmp6 + tmp7
    tmp9 = tl.math.rsqrt(tmp8)
    tmp10 = tmp3 * tmp9
    tmp12 = tmp10 * tmp11
    tmp14 = tmp12 + tmp13
    tmp15 = tmp14.to(tl.float32)
    tmp16 = tl.sigmoid(tmp15)
    tmp17 = 1.0
    tmp18 = tmp17 - tmp16
    tmp19 = tmp15 * tmp18
    tmp20 = tmp19 + tmp17
    tmp21 = tmp16 * tmp20
    tl.store(out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp15, None)
    tl.store(out_ptr1 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp21, None)


def get_args():
    arg_0 = rand_strided((128, 240, 56, 56), (752640, 3136, 56, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((1, 240, 1, 1), (240, 1, 240, 240), device='cuda:0', dtype=torch.float32)
    arg_2 = rand_strided((1, 240, 1, 1), (240, 1, 240, 240), device='cuda:0', dtype=torch.float32)
    arg_3 = rand_strided((240,), (1,), device='cuda:0', dtype=torch.float32)
    arg_4 = rand_strided((240,), (1,), device='cuda:0', dtype=torch.float32)
    arg_5 = rand_strided((128, 240, 56, 56), (752640, 3136, 56, 1), device='cuda:0', dtype=torch.float16)
    arg_6 = rand_strided((128, 240, 56, 56), (752640, 3136, 56, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__native_batch_norm_legit_functional_add_clone_fill_mul_sigmoid_sub_48.run(*args, 96337920, grid=grid(96337920), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__native_batch_norm_legit_functional_add_clone_fill_mul_sigmoid_sub_48.benchmark_all_configs(*args, 96337920, grid=grid(96337920))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/tr/ctrlm7wxwgyqu6igydzh2qy3dgfyvkbxyf6ed5dmhs5ac43nkywg.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_39
#
# Pointwise copy of 540 elements from in_ptr0 ('*fp32' in the signature) to
# out_ptr0 ('*fp16' in the signature), masked with xindex < xnumel.
# The narrowing to fp16 presumably happens at tl.store via the pointer
# type -- TODO confirm against the Triton version in use.
# The embedded source also carries a stand-alone benchmark harness.
triton_poi_fused__to_copy_49 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[1024], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_49(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 540
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)


def get_args():
    arg_0 = rand_strided((60, 1, 3, 3), (9, 9, 3, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((60, 1, 3, 3), (9, 9, 3, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_49.run(*args, 540, grid=grid(540), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_49.benchmark_all_configs(*args, 540, grid=grid(540))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/62/c62vp5bgz7fqdu2x3oxuvrnwwvzztezlji4zzzxqegy77wuqckqo.py
# Original ATen: aten.split_with_sizes
# aten.split_with_sizes => getitem_52
#
# Pointwise kernel over 24084480 elements. For flat output index i it reads
# v = in_ptr0[(i % 188160) + 752640 * (i // 188160)], i.e. the first 188160
# elements of each 752640-element stride of the input, and stores
# v * sigmoid(v) to out_ptr0[i].
# xmask is computed but unused: the load and store pass mask=None.
# The embedded source also carries a stand-alone benchmark harness.
triton_poi_fused_split_with_sizes_50 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[33554432], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_split_with_sizes_50(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 24084480
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex % 188160
    x1 = (xindex // 188160)
    x2 = xindex
    tmp0 = tl.load(in_ptr0 + (x0 + (752640*x1)), None).to(tl.float32)
    tmp1 = tmp0.to(tl.float32)
    tmp2 = tl.sigmoid(tmp1)
    tmp3 = tmp1 * tmp2
    tmp4 = tmp3.to(tl.float32)
    tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp4, None)


def get_args():
    arg_0 = rand_strided((128, 240, 56, 56), (752640, 3136, 56, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 60, 56, 56), (188160, 3136, 56, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused_split_with_sizes_50.run(*args, 24084480, grid=grid(24084480), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused_split_with_sizes_50.benchmark_all_configs(*args, 24084480, grid=grid(24084480))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/5k/c5k2puyq6twbirobmys3yfadgomf23ufkkjsshzn37upfh5g7erl.py | |
# Original ATen: aten._to_copy | |
# aten._to_copy => convert_element_type_40 | |
# Passes the source below (one string) to async_compile.triton and binds the
# result to triton_poi_fused__to_copy_51.
#
# Kernel: pointwise copy of xnumel = 1500 elements, out_ptr0[i] = in_ptr0[i],
# with both the load and the store masked by xindex < xnumel.
# The meta signature declares in_ptr0 as *fp32 and out_ptr0 as *fp16, while
# the value is passed through .to(tl.float32) before the store.
# NOTE(review): the narrowing to fp16 presumably happens implicitly inside
# tl.store -- confirm.
# get_args uses (60, 1, 5, 5) tensors (60*5*5 = 1500): fp32 in, fp16 out.
#
# The string also carries get_args/call/benchmark_all_configs and a __main__
# section that times call() with do_bench and prints ms, GB and GB/s.
triton_poi_fused__to_copy_51 = async_compile.triton(''' | |
import triton | |
import triton.language as tl | |
from torch._inductor.ir import ReductionHint | |
from torch._inductor.ir import TileHint | |
from torch._inductor.triton_heuristics import pointwise | |
from torch._inductor.utils import instance_descriptor | |
from torch._dynamo.testing import rand_strided | |
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
import torch | |
from torch._inductor.triton_heuristics import grid | |
@pointwise(size_hints=[2048], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1), equal_to_1=())]}) | |
@triton.jit | |
def triton_poi_fused__to_copy_51(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr): | |
xnumel = 1500 | |
xoffset = tl.program_id(0) * XBLOCK | |
xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
xmask = xindex < xnumel | |
x0 = xindex | |
tmp0 = tl.load(in_ptr0 + (x0), xmask) | |
tmp1 = tmp0.to(tl.float32) | |
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask) | |
def get_args(): | |
arg_0 = rand_strided((60, 1, 5, 5), (25, 25, 5, 1), device='cuda:0', dtype=torch.float32) | |
arg_1 = rand_strided((60, 1, 5, 5), (25, 25, 5, 1), device='cuda:0', dtype=torch.float16) | |
return arg_0, arg_1, | |
def call(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
stream0 = get_cuda_stream(0) | |
triton_poi_fused__to_copy_51.run(*args, 1500, grid=grid(1500), stream=stream0) | |
def benchmark_all_configs(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
return triton_poi_fused__to_copy_51.benchmark_all_configs(*args, 1500, grid=grid(1500)) | |
if __name__ == '__main__': | |
from torch._inductor.utils import get_num_bytes | |
from triton.testing import do_bench | |
args = get_args() | |
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9 | |
gb_per_s = num_gb / (ms / 1e3) | |
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
''') | |
# kernel path: /tmp/torchinductor_shunting/qq/cqqmootvvozbchj4wyxl2cmg4iebq6hs2v2baytcez4tdix7pywu.py | |
# Original ATen: aten.split_with_sizes | |
# aten.split_with_sizes => getitem_57 | |
# Passes the source below (one string) to async_compile.triton and binds the
# result to triton_poi_fused_split_with_sizes_52.
#
# Kernel: pointwise over xnumel = 24084480 output indices. For each index i,
#   x0 = i % 188160, x1 = i // 188160
#   v  = in_ptr0[188160 + x0 + 752640*x1]  (cast to float32)
#   out_ptr0[i] = v * sigmoid(v)
# With the shapes used in get_args (input (128, 240, 56, 56) with batch stride
# 752640, output (128, 60, 56, 56) with batch stride 188160, both fp16) the
# constant 188160 = 60*56*56 makes the load cover elements [188160, 376320) of
# every batch row, i.e. planes 60..119.
#
# The load and the store pass None as the mask; xmask is computed but unused.
# NOTE(review): presumably safe because xnumel is a multiple of every XBLOCK
# the pointwise heuristic may pick -- confirm.
#
# The string also carries get_args/call/benchmark_all_configs and a __main__
# section that times call() with do_bench and prints ms, GB and GB/s.
triton_poi_fused_split_with_sizes_52 = async_compile.triton(''' | |
import triton | |
import triton.language as tl | |
from torch._inductor.ir import ReductionHint | |
from torch._inductor.ir import TileHint | |
from torch._inductor.triton_heuristics import pointwise | |
from torch._inductor.utils import instance_descriptor | |
from torch._dynamo.testing import rand_strided | |
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
import torch | |
from torch._inductor.triton_heuristics import grid | |
@pointwise(size_hints=[33554432], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]}) | |
@triton.jit | |
def triton_poi_fused_split_with_sizes_52(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr): | |
xnumel = 24084480 | |
xoffset = tl.program_id(0) * XBLOCK | |
xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
xmask = xindex < xnumel | |
x0 = xindex % 188160 | |
x1 = (xindex // 188160) | |
x2 = xindex | |
tmp0 = tl.load(in_ptr0 + (188160 + x0 + (752640*x1)), None).to(tl.float32) | |
tmp1 = tmp0.to(tl.float32) | |
tmp2 = tl.sigmoid(tmp1) | |
tmp3 = tmp1 * tmp2 | |
tmp4 = tmp3.to(tl.float32) | |
tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp4, None) | |
def get_args(): | |
arg_0 = rand_strided((128, 240, 56, 56), (752640, 3136, 56, 1), device='cuda:0', dtype=torch.float16) | |
arg_1 = rand_strided((128, 60, 56, 56), (188160, 3136, 56, 1), device='cuda:0', dtype=torch.float16) | |
return arg_0, arg_1, | |
def call(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
stream0 = get_cuda_stream(0) | |
triton_poi_fused_split_with_sizes_52.run(*args, 24084480, grid=grid(24084480), stream=stream0) | |
def benchmark_all_configs(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
return triton_poi_fused_split_with_sizes_52.benchmark_all_configs(*args, 24084480, grid=grid(24084480)) | |
if __name__ == '__main__': | |
from torch._inductor.utils import get_num_bytes | |
from triton.testing import do_bench | |
args = get_args() | |
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9 | |
gb_per_s = num_gb / (ms / 1e3) | |
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
''') | |
# kernel path: /tmp/torchinductor_shunting/hn/chn7fi5d6pb7lt74jnnjikqswlxeu3nbv3mpu3aidnxkwdaw4fmq.py | |
# Original ATen: aten._to_copy | |
# aten._to_copy => convert_element_type_41 | |
# Passes the source below (one string) to async_compile.triton and binds the
# result to triton_poi_fused__to_copy_53.
#
# Kernel: pointwise copy of xnumel = 2940 elements, out_ptr0[i] = in_ptr0[i],
# with both the load and the store masked by xindex < xnumel.
# The meta signature declares in_ptr0 as *fp32 and out_ptr0 as *fp16, while
# the value is passed through .to(tl.float32) before the store.
# NOTE(review): the narrowing to fp16 presumably happens implicitly inside
# tl.store -- confirm.
# get_args uses (60, 1, 7, 7) tensors (60*7*7 = 2940): fp32 in, fp16 out.
#
# The string also carries get_args/call/benchmark_all_configs and a __main__
# section that times call() with do_bench and prints ms, GB and GB/s.
triton_poi_fused__to_copy_53 = async_compile.triton(''' | |
import triton | |
import triton.language as tl | |
from torch._inductor.ir import ReductionHint | |
from torch._inductor.ir import TileHint | |
from torch._inductor.triton_heuristics import pointwise | |
from torch._inductor.utils import instance_descriptor | |
from torch._dynamo.testing import rand_strided | |
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
import torch | |
from torch._inductor.triton_heuristics import grid | |
@pointwise(size_hints=[4096], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1), equal_to_1=())]}) | |
@triton.jit | |
def triton_poi_fused__to_copy_53(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr): | |
xnumel = 2940 | |
xoffset = tl.program_id(0) * XBLOCK | |
xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
xmask = xindex < xnumel | |
x0 = xindex | |
tmp0 = tl.load(in_ptr0 + (x0), xmask) | |
tmp1 = tmp0.to(tl.float32) | |
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask) | |
def get_args(): | |
arg_0 = rand_strided((60, 1, 7, 7), (49, 49, 7, 1), device='cuda:0', dtype=torch.float32) | |
arg_1 = rand_strided((60, 1, 7, 7), (49, 49, 7, 1), device='cuda:0', dtype=torch.float16) | |
return arg_0, arg_1, | |
def call(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
stream0 = get_cuda_stream(0) | |
triton_poi_fused__to_copy_53.run(*args, 2940, grid=grid(2940), stream=stream0) | |
def benchmark_all_configs(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
return triton_poi_fused__to_copy_53.benchmark_all_configs(*args, 2940, grid=grid(2940)) | |
if __name__ == '__main__': | |
from torch._inductor.utils import get_num_bytes | |
from triton.testing import do_bench | |
args = get_args() | |
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9 | |
gb_per_s = num_gb / (ms / 1e3) | |
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
''') | |
# kernel path: /tmp/torchinductor_shunting/7d/c7d6lruapdg7jujcavuay77hawcbzn2rk3uqz6opu54tj6rxd3dc.py | |
# Original ATen: aten.split_with_sizes | |
# aten.split_with_sizes => getitem_62 | |
# Passes the source below (one string) to async_compile.triton and binds the
# result to triton_poi_fused_split_with_sizes_54.
#
# Kernel: pointwise over xnumel = 24084480 output indices. For each index i,
#   x0 = i % 188160, x1 = i // 188160
#   v  = in_ptr0[376320 + x0 + 752640*x1]  (cast to float32)
#   out_ptr0[i] = v * sigmoid(v)
# With the shapes used in get_args (input (128, 240, 56, 56) with batch stride
# 752640, output (128, 60, 56, 56) with batch stride 188160, both fp16) the
# constant 376320 = 120*56*56 makes the load cover elements [376320, 564480)
# of every batch row, i.e. planes 120..179.
#
# The load and the store pass None as the mask; xmask is computed but unused.
# NOTE(review): presumably safe because xnumel is a multiple of every XBLOCK
# the pointwise heuristic may pick -- confirm.
#
# The string also carries get_args/call/benchmark_all_configs and a __main__
# section that times call() with do_bench and prints ms, GB and GB/s.
triton_poi_fused_split_with_sizes_54 = async_compile.triton(''' | |
import triton | |
import triton.language as tl | |
from torch._inductor.ir import ReductionHint | |
from torch._inductor.ir import TileHint | |
from torch._inductor.triton_heuristics import pointwise | |
from torch._inductor.utils import instance_descriptor | |
from torch._dynamo.testing import rand_strided | |
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
import torch | |
from torch._inductor.triton_heuristics import grid | |
@pointwise(size_hints=[33554432], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]}) | |
@triton.jit | |
def triton_poi_fused_split_with_sizes_54(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr): | |
xnumel = 24084480 | |
xoffset = tl.program_id(0) * XBLOCK | |
xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
xmask = xindex < xnumel | |
x0 = xindex % 188160 | |
x1 = (xindex // 188160) | |
x2 = xindex | |
tmp0 = tl.load(in_ptr0 + (376320 + x0 + (752640*x1)), None).to(tl.float32) | |
tmp1 = tmp0.to(tl.float32) | |
tmp2 = tl.sigmoid(tmp1) | |
tmp3 = tmp1 * tmp2 | |
tmp4 = tmp3.to(tl.float32) | |
tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp4, None) | |
def get_args(): | |
arg_0 = rand_strided((128, 240, 56, 56), (752640, 3136, 56, 1), device='cuda:0', dtype=torch.float16) | |
arg_1 = rand_strided((128, 60, 56, 56), (188160, 3136, 56, 1), device='cuda:0', dtype=torch.float16) | |
return arg_0, arg_1, | |
def call(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
stream0 = get_cuda_stream(0) | |
triton_poi_fused_split_with_sizes_54.run(*args, 24084480, grid=grid(24084480), stream=stream0) | |
def benchmark_all_configs(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
return triton_poi_fused_split_with_sizes_54.benchmark_all_configs(*args, 24084480, grid=grid(24084480)) | |
if __name__ == '__main__': | |
from torch._inductor.utils import get_num_bytes | |
from triton.testing import do_bench | |
args = get_args() | |
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9 | |
gb_per_s = num_gb / (ms / 1e3) | |
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
''') | |
# kernel path: /tmp/torchinductor_shunting/ih/cihdtphelycdnabls3bvxcjqdwyigw5444zlobgvmbwdwzxq7zdy.py | |
# Original ATen: aten._to_copy | |
# aten._to_copy => convert_element_type_42 | |
# Passes the source below (one string) to async_compile.triton and binds the
# result to triton_poi_fused__to_copy_55.
#
# Kernel: pointwise copy of xnumel = 4860 elements, out_ptr0[i] = in_ptr0[i],
# with both the load and the store masked by xindex < xnumel.
# The meta signature declares in_ptr0 as *fp32 and out_ptr0 as *fp16, while
# the value is passed through .to(tl.float32) before the store.
# NOTE(review): the narrowing to fp16 presumably happens implicitly inside
# tl.store -- confirm.
# get_args uses (60, 1, 9, 9) tensors (60*9*9 = 4860): fp32 in, fp16 out.
#
# The string also carries get_args/call/benchmark_all_configs and a __main__
# section that times call() with do_bench and prints ms, GB and GB/s.
triton_poi_fused__to_copy_55 = async_compile.triton(''' | |
import triton | |
import triton.language as tl | |
from torch._inductor.ir import ReductionHint | |
from torch._inductor.ir import TileHint | |
from torch._inductor.triton_heuristics import pointwise | |
from torch._inductor.utils import instance_descriptor | |
from torch._dynamo.testing import rand_strided | |
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
import torch | |
from torch._inductor.triton_heuristics import grid | |
@pointwise(size_hints=[8192], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1), equal_to_1=())]}) | |
@triton.jit | |
def triton_poi_fused__to_copy_55(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr): | |
xnumel = 4860 | |
xoffset = tl.program_id(0) * XBLOCK | |
xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
xmask = xindex < xnumel | |
x0 = xindex | |
tmp0 = tl.load(in_ptr0 + (x0), xmask) | |
tmp1 = tmp0.to(tl.float32) | |
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask) | |
def get_args(): | |
arg_0 = rand_strided((60, 1, 9, 9), (81, 81, 9, 1), device='cuda:0', dtype=torch.float32) | |
arg_1 = rand_strided((60, 1, 9, 9), (81, 81, 9, 1), device='cuda:0', dtype=torch.float16) | |
return arg_0, arg_1, | |
def call(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
stream0 = get_cuda_stream(0) | |
triton_poi_fused__to_copy_55.run(*args, 4860, grid=grid(4860), stream=stream0) | |
def benchmark_all_configs(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
return triton_poi_fused__to_copy_55.benchmark_all_configs(*args, 4860, grid=grid(4860)) | |
if __name__ == '__main__': | |
from torch._inductor.utils import get_num_bytes | |
from triton.testing import do_bench | |
args = get_args() | |
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9 | |
gb_per_s = num_gb / (ms / 1e3) | |
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
''') | |
# kernel path: /tmp/torchinductor_shunting/oq/coqx7tfdhgzsd3kzw3ueczpktm7gxo5z67ciilgh24ffay7herao.py | |
# Original ATen: aten.split_with_sizes | |
# aten.split_with_sizes => getitem_67 | |
# Passes the source below (one string) to async_compile.triton and binds the
# result to triton_poi_fused_split_with_sizes_56.
#
# Kernel: pointwise over xnumel = 24084480 output indices. For each index i,
#   x0 = i % 188160, x1 = i // 188160
#   v  = in_ptr0[564480 + x0 + 752640*x1]  (cast to float32)
#   out_ptr0[i] = v * sigmoid(v)
# With the shapes used in get_args (input (128, 240, 56, 56) with batch stride
# 752640, output (128, 60, 56, 56) with batch stride 188160, both fp16) the
# constant 564480 = 180*56*56 makes the load cover elements [564480, 752640)
# of every batch row, i.e. planes 180..239.
#
# The load and the store pass None as the mask; xmask is computed but unused.
# NOTE(review): presumably safe because xnumel is a multiple of every XBLOCK
# the pointwise heuristic may pick -- confirm.
#
# The string also carries get_args/call/benchmark_all_configs and a __main__
# section that times call() with do_bench and prints ms, GB and GB/s.
triton_poi_fused_split_with_sizes_56 = async_compile.triton(''' | |
import triton | |
import triton.language as tl | |
from torch._inductor.ir import ReductionHint | |
from torch._inductor.ir import TileHint | |
from torch._inductor.triton_heuristics import pointwise | |
from torch._inductor.utils import instance_descriptor | |
from torch._dynamo.testing import rand_strided | |
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
import torch | |
from torch._inductor.triton_heuristics import grid | |
@pointwise(size_hints=[33554432], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]}) | |
@triton.jit | |
def triton_poi_fused_split_with_sizes_56(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr): | |
xnumel = 24084480 | |
xoffset = tl.program_id(0) * XBLOCK | |
xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
xmask = xindex < xnumel | |
x0 = xindex % 188160 | |
x1 = (xindex // 188160) | |
x2 = xindex | |
tmp0 = tl.load(in_ptr0 + (564480 + x0 + (752640*x1)), None).to(tl.float32) | |
tmp1 = tmp0.to(tl.float32) | |
tmp2 = tl.sigmoid(tmp1) | |
tmp3 = tmp1 * tmp2 | |
tmp4 = tmp3.to(tl.float32) | |
tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp4, None) | |
def get_args(): | |
arg_0 = rand_strided((128, 240, 56, 56), (752640, 3136, 56, 1), device='cuda:0', dtype=torch.float16) | |
arg_1 = rand_strided((128, 60, 56, 56), (188160, 3136, 56, 1), device='cuda:0', dtype=torch.float16) | |
return arg_0, arg_1, | |
def call(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
stream0 = get_cuda_stream(0) | |
triton_poi_fused_split_with_sizes_56.run(*args, 24084480, grid=grid(24084480), stream=stream0) | |
def benchmark_all_configs(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
return triton_poi_fused_split_with_sizes_56.benchmark_all_configs(*args, 24084480, grid=grid(24084480)) | |
if __name__ == '__main__': | |
from torch._inductor.utils import get_num_bytes | |
from triton.testing import do_bench | |
args = get_args() | |
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9 | |
gb_per_s = num_gb / (ms / 1e3) | |
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
''') | |
# kernel path: /tmp/torchinductor_shunting/n6/cn6lwm2ninzuud2fuu3omrklbeewaxqtkv3v6gdaxmikjvyoi525.py | |
# Original ATen: aten.cat | |
# aten.cat => cat_5 | |
# Passes the source below (one string) to async_compile.triton and binds the
# result to triton_poi_fused_cat_57.
#
# Kernel: pointwise strided copy of xnumel = 6021120 elements. For index i,
#   x0 = i % 47040, x1 = i // 47040
#   out_ptr0[x0 + 188160*x1] = in_ptr0[i]
# so each run of 47040 consecutive input elements is written at a row stride
# of 188160 in the output.
# Per get_args both tensors are fp16 with shape (128, 60, 28, 28); the input
# has batch stride 47040 (= 60*28*28) and the output batch stride 188160
# (= 240*28*28).
# NOTE(review): the output is presumably a 60-channel slice of a wider
# 240-channel concatenation buffer -- confirm against the caller.
#
# The load and the store pass None as the mask; xmask is computed but unused.
# NOTE(review): presumably safe because xnumel is a multiple of every XBLOCK
# the pointwise heuristic may pick -- confirm.
#
# The string also carries get_args/call/benchmark_all_configs and a __main__
# section that times call() with do_bench and prints ms, GB and GB/s.
triton_poi_fused_cat_57 = async_compile.triton(''' | |
import triton | |
import triton.language as tl | |
from torch._inductor.ir import ReductionHint | |
from torch._inductor.ir import TileHint | |
from torch._inductor.triton_heuristics import pointwise | |
from torch._inductor.utils import instance_descriptor | |
from torch._dynamo.testing import rand_strided | |
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
import torch | |
from torch._inductor.triton_heuristics import grid | |
@pointwise(size_hints=[8388608], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]}) | |
@triton.jit | |
def triton_poi_fused_cat_57(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr): | |
xnumel = 6021120 | |
xoffset = tl.program_id(0) * XBLOCK | |
xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
xmask = xindex < xnumel | |
x2 = xindex | |
x0 = xindex % 47040 | |
x1 = (xindex // 47040) | |
tmp0 = tl.load(in_ptr0 + (x2), None).to(tl.float32) | |
tl.store(out_ptr0 + (x0 + (188160*x1) + tl.zeros([XBLOCK], tl.int32)), tmp0, None) | |
def get_args(): | |
arg_0 = rand_strided((128, 60, 28, 28), (47040, 784, 28, 1), device='cuda:0', dtype=torch.float16) | |
arg_1 = rand_strided((128, 60, 28, 28), (188160, 784, 28, 1), device='cuda:0', dtype=torch.float16) | |
return arg_0, arg_1, | |
def call(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
stream0 = get_cuda_stream(0) | |
triton_poi_fused_cat_57.run(*args, 6021120, grid=grid(6021120), stream=stream0) | |
def benchmark_all_configs(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
return triton_poi_fused_cat_57.benchmark_all_configs(*args, 6021120, grid=grid(6021120)) | |
if __name__ == '__main__': | |
from torch._inductor.utils import get_num_bytes | |
from triton.testing import do_bench | |
args = get_args() | |
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9 | |
gb_per_s = num_gb / (ms / 1e3) | |
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
''') | |
# kernel path: /tmp/torchinductor_shunting/tm/ctmfxwnpwpks3pe3wok54jftmfykafzupq5buoissf32gfcrrq6d.py | |
# Original ATen: aten._native_batch_norm_legit_functional | |
# aten._native_batch_norm_legit_functional => add_53, add_54, add_55, convert_element_type_43, mul_72, mul_73, mul_74, mul_75, mul_76, rsqrt_10, squeeze_31, var_mean_10 | |
# Passes the source below (one string) to async_compile.triton and binds the
# result to triton_red_fused__native_batch_norm_legit_functional_58.
#
# Kernel: looped reduction with xnumel = 240 rows and rnumel = 100352 reduced
# elements per row. For row x and reduction index r the input element is
#   in_ptr0[(r % 784) + 784*x + 188160*(r // 784)]
# which, with the (128, 240, 28, 28) fp16 input from get_args, walks every
# (batch, h, w) position of channel x (128 * 784 = 100352).
#
# Pass 1 (first loop): accumulates the float32 sum over r, then
#   mean            = sum / 100352            -> stored to in_out_ptr0[x]
#   out_ptr0[x]     = mean * 0.1 + in_ptr1[x] * 0.9
# Pass 2 (second loop): re-reads the input and accumulates (v - mean)^2, then
#   out_ptr1[x]     = sum of squared deviations (stored undivided)
#   var             = that sum / 100352
#   out_ptr2[x]     = rsqrt(var + 1e-05)
#   out_ptr3[x]     = var * 1.00000996502277 * 0.1 + in_ptr2[x] * 0.9
# The factor 1.00000996502277 is approximately 100352/100351.
# NOTE(review): 0.1/0.9 look like a batch-norm momentum blend into running
# mean/variance and the factor like an unbiased-variance correction --
# confirm against the originating aten op.
#
# Input loads are masked with rmask & xmask (other=0); per-row loads and
# stores are masked with xmask. in_out_ptr0 is listed in mutated_arg_names;
# in this kernel it is only written, never read.
#
# The string also carries get_args/call/benchmark_all_configs and a __main__
# section that times call() with do_bench and prints ms, GB and GB/s
# (get_num_bytes is called with num_in_out_args=1 here).
triton_red_fused__native_batch_norm_legit_functional_58 = async_compile.triton(''' | |
import triton | |
import triton.language as tl | |
from torch._inductor.ir import ReductionHint | |
from torch._inductor.ir import TileHint | |
from torch._inductor.triton_heuristics import reduction | |
from torch._inductor.utils import instance_descriptor | |
from torch._dynamo.testing import rand_strided | |
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
import torch | |
from torch._inductor.triton_heuristics import grid | |
@reduction( | |
size_hints=[256, 131072], | |
reduction_hint=ReductionHint.INNER, | |
filename=__file__, | |
meta={'signature': {0: '*fp32', 1: '*fp16', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: 'i32', 9: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9), equal_to_1=())]} | |
) | |
@triton.jit | |
def triton_red_fused__native_batch_norm_legit_functional_58(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, out_ptr0, out_ptr1, out_ptr2, out_ptr3, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr): | |
xnumel = 240 | |
rnumel = 100352 | |
xoffset = tl.program_id(0) * XBLOCK | |
xindex = xoffset + tl.arange(0, XBLOCK)[:, None] | |
xmask = xindex < xnumel | |
rbase = tl.arange(0, RBLOCK)[None, :] | |
x0 = xindex | |
_tmp2 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0 | |
for roffset in range(0, rnumel, RBLOCK): | |
rindex = roffset + rbase | |
rmask = rindex < rnumel | |
r1 = rindex % 784 | |
r2 = (rindex // 784) | |
tmp0 = tl.load(in_ptr0 + (r1 + (784*x0) + (188160*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32) | |
tmp1 = tmp0.to(tl.float32) | |
_tmp2 = tl.where(rmask & xmask, _tmp2 + tmp1, _tmp2) | |
tmp2 = tl.sum(_tmp2, 1)[:, None] | |
tmp7 = tl.load(in_ptr1 + (x0), xmask) | |
tmp3 = 100352.0 | |
tmp4 = tmp2 / tmp3 | |
tmp5 = 0.1 | |
tmp6 = tmp4 * tmp5 | |
tmp8 = 0.9 | |
tmp9 = tmp7 * tmp8 | |
tmp10 = tmp6 + tmp9 | |
tl.store(in_out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp4, xmask) | |
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp10, xmask) | |
_tmp15 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0 | |
for roffset in range(0, rnumel, RBLOCK): | |
rindex = roffset + rbase | |
rmask = rindex < rnumel | |
r1 = rindex % 784 | |
r2 = (rindex // 784) | |
tmp11 = tl.load(in_ptr0 + (r1 + (784*x0) + (188160*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32) | |
tmp12 = tmp11.to(tl.float32) | |
tmp13 = tmp12 - tmp4 | |
tmp14 = tmp13 * tmp13 | |
_tmp15 = tl.where(rmask & xmask, _tmp15 + tmp14, _tmp15) | |
tmp15 = tl.sum(_tmp15, 1)[:, None] | |
tl.store(out_ptr1 + x0, tmp15, xmask) | |
tmp25 = tl.load(in_ptr2 + (x0), xmask) | |
tmp16 = 100352.0 | |
tmp17 = tmp15 / tmp16 | |
tmp18 = 1e-05 | |
tmp19 = tmp17 + tmp18 | |
tmp20 = tl.math.rsqrt(tmp19) | |
tmp21 = 1.00000996502277 | |
tmp22 = tmp17 * tmp21 | |
tmp23 = 0.1 | |
tmp24 = tmp22 * tmp23 | |
tmp26 = 0.9 | |
tmp27 = tmp25 * tmp26 | |
tmp28 = tmp24 + tmp27 | |
tl.store(out_ptr2 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp20, xmask) | |
tl.store(out_ptr3 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp28, xmask) | |
def get_args(): | |
arg_0 = rand_strided((1, 240, 1, 1), (240, 1, 240, 240), device='cuda:0', dtype=torch.float32) | |
arg_1 = rand_strided((128, 240, 28, 28), (188160, 784, 28, 1), device='cuda:0', dtype=torch.float16) | |
arg_2 = rand_strided((240,), (1,), device='cuda:0', dtype=torch.float32) | |
arg_3 = rand_strided((240,), (1,), device='cuda:0', dtype=torch.float32) | |
arg_4 = rand_strided((240,), (1,), device='cuda:0', dtype=torch.float32) | |
arg_5 = rand_strided((1, 240, 1, 1), (240, 1, 240, 240), device='cuda:0', dtype=torch.float32) | |
arg_6 = rand_strided((240,), (1,), device='cuda:0', dtype=torch.float32) | |
arg_7 = rand_strided((240,), (1,), device='cuda:0', dtype=torch.float32) | |
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7, | |
def call(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
stream0 = get_cuda_stream(0) | |
triton_red_fused__native_batch_norm_legit_functional_58.run(*args, 240, 100352, grid=grid(240), stream=stream0) | |
def benchmark_all_configs(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
return triton_red_fused__native_batch_norm_legit_functional_58.benchmark_all_configs(*args, 240, 100352, grid=grid(240)) | |
if __name__ == '__main__': | |
from torch._inductor.utils import get_num_bytes | |
from triton.testing import do_bench | |
args = get_args() | |
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9 | |
gb_per_s = num_gb / (ms / 1e3) | |
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
''') | |
# kernel path: /tmp/torchinductor_shunting/tf/ctf52h6qjyfnbwd7tzysnt6o3dvwaedcis63siipfnbkcfkukqa7.py | |
# Original ATen: aten._native_batch_norm_legit_functional, aten.mean, aten.silu | |
# aten._native_batch_norm_legit_functional => add_53, add_56, convert_element_type_43, convert_element_type_44, mul_71, mul_77, rsqrt_10, sub_10, var_mean_10 | |
# aten.mean => mean | |
# aten.silu => convert_element_type_45, convert_element_type_46, mul_78, sigmoid_1 | |
# Passes the source below (one string) to async_compile.triton and binds the
# result to triton_per_fused__native_batch_norm_legit_functional_mean_silu_59.
#
# Kernel: persistent reduction with xnumel = 30720 rows and rnumel = 784
# elements per row, handled in a single RBLOCK = 1024 tile (lanes with
# rindex >= 784 are masked off by rmask). For row x, c = x % 240 selects the
# per-channel parameters. For each element v = in_ptr0[r + 784*x]:
#   y = (v - in_ptr1[c]) * rsqrt(in_ptr2[c] / 100352 + 1e-05) * in_ptr3[c]
#       + in_ptr4[c]
#   out_ptr0[r + 784*x] = y
# and per row:
#   out_ptr2[x] = sum_r( y * sigmoid(y) ) / 784
# With the get_args shapes, rows correspond to the 128*240 (batch, channel)
# pairs of a (128, 240, 28, 28) fp16 tensor and out_ptr2 is (128, 240, 1, 1).
# NOTE(review): in_ptr2 is divided by 100352 before use, so it presumably
# holds an undivided sum of squared deviations rather than a variance --
# confirm against the producer of that buffer.
#
# The in_ptr0 load and out_ptr0 store are masked with rmask only; the
# per-channel loads and the out_ptr2 store pass None. xmask is computed but
# unused.
# NOTE(review): presumably safe because xnumel is a multiple of every XBLOCK
# the heuristic may pick -- confirm.
#
# The string also carries get_args/call/benchmark_all_configs and a __main__
# section that times call() with do_bench and prints ms, GB and GB/s.
triton_per_fused__native_batch_norm_legit_functional_mean_silu_59 = async_compile.triton(''' | |
import triton | |
import triton.language as tl | |
from torch._inductor.ir import ReductionHint | |
from torch._inductor.ir import TileHint | |
from torch._inductor.triton_heuristics import persistent_reduction | |
from torch._inductor.utils import instance_descriptor | |
from torch._dynamo.testing import rand_strided | |
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
import torch | |
from torch._inductor.triton_heuristics import grid | |
@persistent_reduction( | |
size_hints=[32768, 1024], | |
reduction_hint=ReductionHint.INNER, | |
filename=__file__, | |
meta={'signature': {0: '*fp16', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp16', 6: '*fp16', 7: 'i32', 8: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8), equal_to_1=())]} | |
) | |
@triton.jit | |
def triton_per_fused__native_batch_norm_legit_functional_mean_silu_59(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, out_ptr0, out_ptr2, xnumel, rnumel, XBLOCK : tl.constexpr): | |
xnumel = 30720 | |
rnumel = 784 | |
RBLOCK: tl.constexpr = 1024 | |
xoffset = tl.program_id(0) * XBLOCK | |
xindex = xoffset + tl.arange(0, XBLOCK)[:, None] | |
xmask = xindex < xnumel | |
rindex = tl.arange(0, RBLOCK)[None, :] | |
rmask = rindex < rnumel | |
r2 = rindex | |
x3 = xindex | |
x0 = xindex % 240 | |
tmp0 = tl.load(in_ptr0 + (r2 + (784*x3)), rmask, other=0).to(tl.float32) | |
tmp2 = tl.load(in_ptr1 + (x0), None) | |
tmp4 = tl.load(in_ptr2 + (x0), None) | |
tmp11 = tl.load(in_ptr3 + (x0), None) | |
tmp13 = tl.load(in_ptr4 + (x0), None) | |
tmp1 = tmp0.to(tl.float32) | |
tmp3 = tmp1 - tmp2 | |
tmp5 = 100352.0 | |
tmp6 = tmp4 / tmp5 | |
tmp7 = 1e-05 | |
tmp8 = tmp6 + tmp7 | |
tmp9 = tl.math.rsqrt(tmp8) | |
tmp10 = tmp3 * tmp9 | |
tmp12 = tmp10 * tmp11 | |
tmp14 = tmp12 + tmp13 | |
tmp15 = tmp14.to(tl.float32) | |
tmp16 = tmp15.to(tl.float32) | |
tmp17 = tl.sigmoid(tmp16) | |
tmp18 = tmp16 * tmp17 | |
tmp19 = tmp18.to(tl.float32) | |
tmp20 = tmp19.to(tl.float32) | |
tmp22 = tl.where(rmask, tmp20, 0) | |
tmp23 = tl.sum(tmp22, 1)[:, None] | |
tmp24 = 784.0 | |
tmp25 = tmp23 / tmp24 | |
tmp26 = tmp25.to(tl.float32) | |
tl.store(out_ptr0 + (r2 + (784*x3) + tl.zeros([XBLOCK, RBLOCK], tl.int32)), tmp15, rmask) | |
tl.store(out_ptr2 + (x3 + tl.zeros([XBLOCK, 1], tl.int32)), tmp26, None) | |
def get_args(): | |
arg_0 = rand_strided((128, 240, 28, 28), (188160, 784, 28, 1), device='cuda:0', dtype=torch.float16) | |
arg_1 = rand_strided((1, 240, 1, 1), (240, 1, 240, 240), device='cuda:0', dtype=torch.float32) | |
arg_2 = rand_strided((1, 240, 1, 1), (240, 1, 240, 240), device='cuda:0', dtype=torch.float32) | |
arg_3 = rand_strided((240,), (1,), device='cuda:0', dtype=torch.float32) | |
arg_4 = rand_strided((240,), (1,), device='cuda:0', dtype=torch.float32) | |
arg_5 = rand_strided((128, 240, 28, 28), (188160, 784, 28, 1), device='cuda:0', dtype=torch.float16) | |
arg_6 = rand_strided((128, 240, 1, 1), (240, 1, 1, 1), device='cuda:0', dtype=torch.float16) | |
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, | |
def call(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
stream0 = get_cuda_stream(0) | |
triton_per_fused__native_batch_norm_legit_functional_mean_silu_59.run(*args, 30720, 784, grid=grid(30720), stream=stream0) | |
def benchmark_all_configs(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
return triton_per_fused__native_batch_norm_legit_functional_mean_silu_59.benchmark_all_configs(*args, 30720, 784, grid=grid(30720)) | |
if __name__ == '__main__': | |
from torch._inductor.utils import get_num_bytes | |
from triton.testing import do_bench | |
args = get_args() | |
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9 | |
gb_per_s = num_gb / (ms / 1e3) | |
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
''') | |
# kernel path: /tmp/torchinductor_shunting/l4/cl43m5yr7mr6rofdxkde5hoinowoes7zmspib6uw4wfhtoztyd2j.py | |
# Original ATen: aten._to_copy | |
# aten._to_copy => convert_element_type_48 | |
# Passes the source below (one string) to async_compile.triton and binds the
# result to triton_poi_fused__to_copy_60.
#
# Kernel: pointwise copy of xnumel = 4800 elements, out_ptr0[i] = in_ptr0[i],
# with both the load and the store masked by xindex < xnumel.
# The meta signature declares in_ptr0 as *fp32 and out_ptr0 as *fp16, while
# the value is passed through .to(tl.float32) before the store.
# NOTE(review): the narrowing to fp16 presumably happens implicitly inside
# tl.store -- confirm.
# get_args uses (20, 240, 1, 1) tensors (20*240 = 4800): fp32 in, fp16 out.
#
# The string also carries get_args/call/benchmark_all_configs and a __main__
# section that times call() with do_bench and prints ms, GB and GB/s.
triton_poi_fused__to_copy_60 = async_compile.triton(''' | |
import triton | |
import triton.language as tl | |
from torch._inductor.ir import ReductionHint | |
from torch._inductor.ir import TileHint | |
from torch._inductor.triton_heuristics import pointwise | |
from torch._inductor.utils import instance_descriptor | |
from torch._dynamo.testing import rand_strided | |
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
import torch | |
from torch._inductor.triton_heuristics import grid | |
@pointwise(size_hints=[8192], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]}) | |
@triton.jit | |
def triton_poi_fused__to_copy_60(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr): | |
xnumel = 4800 | |
xoffset = tl.program_id(0) * XBLOCK | |
xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
xmask = xindex < xnumel | |
x0 = xindex | |
tmp0 = tl.load(in_ptr0 + (x0), xmask) | |
tmp1 = tmp0.to(tl.float32) | |
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask) | |
def get_args(): | |
arg_0 = rand_strided((20, 240, 1, 1), (240, 1, 1, 1), device='cuda:0', dtype=torch.float32) | |
arg_1 = rand_strided((20, 240, 1, 1), (240, 1, 1, 1), device='cuda:0', dtype=torch.float16) | |
return arg_0, arg_1, | |
def call(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
stream0 = get_cuda_stream(0) | |
triton_poi_fused__to_copy_60.run(*args, 4800, grid=grid(4800), stream=stream0) | |
def benchmark_all_configs(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
return triton_poi_fused__to_copy_60.benchmark_all_configs(*args, 4800, grid=grid(4800)) | |
if __name__ == '__main__': | |
from torch._inductor.utils import get_num_bytes | |
from triton.testing import do_bench | |
args = get_args() | |
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9 | |
gb_per_s = num_gb / (ms / 1e3) | |
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
''') | |
# kernel path: /tmp/torchinductor_shunting/n3/cn3yrrvjz2rkdvv5cbgdyj5mk5wgjpljfxyga75evi5tqahwagdd.py | |
# Original ATen: aten._to_copy, aten.convolution | |
# aten._to_copy => convert_element_type_47 | |
# aten.convolution => convolution_20 | |
# Passes the source below (one string) to async_compile.triton and binds the
# result to triton_poi_fused__to_copy_convolution_61.
#
# Kernel: pointwise copy of xnumel = 20 elements, out_ptr0[i] = in_ptr0[i],
# with both the load and the store masked by xindex < xnumel.
# The meta signature declares in_ptr0 as *fp32 and out_ptr0 as *fp16, while
# the value is passed through .to(tl.float32) before the store.
# NOTE(review): the narrowing to fp16 presumably happens implicitly inside
# tl.store -- confirm.
# get_args uses 1-D tensors of length 20: fp32 in, fp16 out.
# NOTE(review): given the kernel name this is presumably a convolution bias
# being cast for a half-precision conv -- confirm against the caller.
#
# The string also carries get_args/call/benchmark_all_configs and a __main__
# section that times call() with do_bench and prints ms, GB and GB/s.
triton_poi_fused__to_copy_convolution_61 = async_compile.triton(''' | |
import triton | |
import triton.language as tl | |
from torch._inductor.ir import ReductionHint | |
from torch._inductor.ir import TileHint | |
from torch._inductor.triton_heuristics import pointwise | |
from torch._inductor.utils import instance_descriptor | |
from torch._dynamo.testing import rand_strided | |
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
import torch | |
from torch._inductor.triton_heuristics import grid | |
@pointwise(size_hints=[32], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1), equal_to_1=())]}) | |
@triton.jit | |
def triton_poi_fused__to_copy_convolution_61(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr): | |
xnumel = 20 | |
xoffset = tl.program_id(0) * XBLOCK | |
xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
xmask = xindex < xnumel | |
x0 = xindex | |
tmp0 = tl.load(in_ptr0 + (x0), xmask) | |
tmp1 = tmp0.to(tl.float32) | |
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask) | |
def get_args(): | |
arg_0 = rand_strided((20,), (1,), device='cuda:0', dtype=torch.float32) | |
arg_1 = rand_strided((20,), (1,), device='cuda:0', dtype=torch.float16) | |
return arg_0, arg_1, | |
def call(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
stream0 = get_cuda_stream(0) | |
triton_poi_fused__to_copy_convolution_61.run(*args, 20, grid=grid(20), stream=stream0) | |
def benchmark_all_configs(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
return triton_poi_fused__to_copy_convolution_61.benchmark_all_configs(*args, 20, grid=grid(20)) | |
if __name__ == '__main__': | |
from torch._inductor.utils import get_num_bytes | |
from triton.testing import do_bench | |
args = get_args() | |
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9 | |
gb_per_s = num_gb / (ms / 1e3) | |
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
''') | |
# kernel path: /tmp/torchinductor_shunting/nz/cnz5cjjblhni4cibgggtjl3kn4ksihmqz6znyn5belgi4q43cncw.py
# Original ATen: aten._to_copy, aten.convolution, aten.silu
# aten._to_copy => convert_element_type_47
# aten.convolution => convolution_20
# aten.silu => convert_element_type_49, convert_element_type_50, mul_79, sigmoid_2
#
# Pointwise kernel over 2560 fp16 elements (xnumel is pinned to 2560 inside the
# kernel).  For each flat index x:
#   t            = in_out_ptr0[x] + in_ptr0[x % 20]   (computed in fp32)
#   in_out_ptr0[x] = t                                 (in-place update)
#   out_ptr0[x]    = t * sigmoid(t)
# in_out_ptr0 is listed in 'mutated_arg_names'.  The benchmark harness builds
# in_out_ptr0 / out_ptr0 as (128, 20, 1, 1) tensors and in_ptr0 as a (20,)
# tensor, so `x % 20` indexes the second dimension.
triton_poi_fused__to_copy_convolution_silu_62 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor

from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[4096], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_convolution_silu_62(in_out_ptr0, in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 2560
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x2 = xindex
    x0 = xindex % 20
    tmp0 = tl.load(in_out_ptr0 + (x2), xmask).to(tl.float32)
    tmp1 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
    tmp2 = tmp0 + tmp1
    tmp3 = tmp2.to(tl.float32)
    tmp4 = tl.sigmoid(tmp3)
    tmp5 = tmp3 * tmp4
    tmp6 = tmp5.to(tl.float32)
    tl.store(in_out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp2, xmask)
    tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp6, xmask)


def get_args():
    arg_0 = rand_strided((128, 20, 1, 1), (20, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((20,), (1,), device='cuda:0', dtype=torch.float16)
    arg_2 = rand_strided((128, 20, 1, 1), (20, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1, arg_2,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_convolution_silu_62.run(*args, 2560, grid=grid(2560), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_convolution_silu_62.benchmark_all_configs(*args, 2560, grid=grid(2560))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/bl/cblr4zz2jdn5lx7fyq6hto375tcressl73jxrxvozhqbubkoilua.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_52
#
# Pointwise copy of 4800 contiguous elements from in_ptr0 (declared '*fp32') to
# out_ptr0 (declared '*fp16').  xnumel is pinned to 4800 inside the kernel and
# every load/store is guarded by `xindex < xnumel`.  The benchmark harness uses
# (240, 20, 1, 1) contiguous tensors for both arguments.
triton_poi_fused__to_copy_63 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor

from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[8192], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_63(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 4800
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)


def get_args():
    arg_0 = rand_strided((240, 20, 1, 1), (20, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((240, 20, 1, 1), (20, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_63.run(*args, 4800, grid=grid(4800), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_63.benchmark_all_configs(*args, 4800, grid=grid(4800))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/gq/cgqatezdistnqv4yyabd7t76wkyduditccoaqjd3ef4npoxk6vrv.py
# Original ATen: aten._to_copy, aten.convolution
# aten._to_copy => convert_element_type_51
# aten.convolution => convolution_21
#
# Pointwise copy of 240 contiguous elements from in_ptr0 (declared '*fp32') to
# out_ptr0 (declared '*fp16').  xnumel is pinned to 240 inside the kernel and
# every load/store is guarded by `xindex < xnumel`.
triton_poi_fused__to_copy_convolution_64 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor

from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[256], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_convolution_64(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 240
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)


def get_args():
    arg_0 = rand_strided((240,), (1,), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((240,), (1,), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_convolution_64.run(*args, 240, grid=grid(240), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_convolution_64.benchmark_all_configs(*args, 240, grid=grid(240))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/qr/cqrlwq46vnra4itz3o4m3cvwrnyl7srsa7xlwywcqzni2rafclhf.py
# Original ATen: aten._to_copy, aten.convolution
# aten._to_copy => convert_element_type_51
# aten.convolution => convolution_21
#
# In-place broadcast add over 30720 fp16 elements:
#   in_out_ptr0[x] = in_out_ptr0[x] + in_ptr0[x % 240]   (sum formed in fp32)
# in_out_ptr0 is listed in 'mutated_arg_names'.  `xmask` is computed but the
# loads and the store all pass `None` as the mask, i.e. they are unmasked.
# The benchmark harness uses a (128, 240, 1, 1) tensor for in_out_ptr0 and a
# (240,) tensor for in_ptr0.
triton_poi_fused__to_copy_convolution_65 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor

from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[32768], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_convolution_65(in_out_ptr0, in_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 30720
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x2 = xindex
    x0 = xindex % 240
    tmp0 = tl.load(in_out_ptr0 + (x2), None).to(tl.float32)
    tmp1 = tl.load(in_ptr0 + (x0), None).to(tl.float32)
    tmp2 = tmp0 + tmp1
    tl.store(in_out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp2, None)


def get_args():
    arg_0 = rand_strided((128, 240, 1, 1), (240, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((240,), (1,), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_convolution_65.run(*args, 30720, grid=grid(30720), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_convolution_65.benchmark_all_configs(*args, 30720, grid=grid(30720))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/ru/crubekjujrhilkb2spkvdoiiitif5a5cisikyju6vjbn7j3itjum.py
# Original ATen: aten.mul, aten.sigmoid, aten.silu
# aten.mul => mul_80
# aten.sigmoid => sigmoid_3
# aten.silu => convert_element_type_45, convert_element_type_46, mul_78, sigmoid_1
#
# Pointwise kernel over 24084480 elements:
#   a = in_ptr0[x]                      (fp16 load, widened to fp32)
#   g = in_ptr1[x // 784]               (fp16 load, widened to fp32)
#   out_ptr0[x] = (a * sigmoid(a)) * sigmoid(g)
# All loads and the store pass `None` as the mask (unmasked); `xmask` is
# computed but unused.  In the benchmark harness in_ptr0 / out_ptr0 are
# contiguous (128, 240, 28, 28) tensors and in_ptr1 is (128, 240, 1, 1), so
# `x // 784` selects one in_ptr1 entry per 28*28 plane.
triton_poi_fused_mul_sigmoid_silu_66 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor

from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[33554432], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3), equal_to_1=())]})
@triton.jit
def triton_poi_fused_mul_sigmoid_silu_66(in_ptr0, in_ptr1, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 24084480
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x2 = xindex
    x1 = (xindex // 784)
    tmp0 = tl.load(in_ptr0 + (x2), None).to(tl.float32)
    tmp5 = tl.load(in_ptr1 + (x1), None).to(tl.float32)
    tmp1 = tmp0.to(tl.float32)
    tmp2 = tl.sigmoid(tmp1)
    tmp3 = tmp1 * tmp2
    tmp4 = tmp3.to(tl.float32)
    tmp6 = tl.sigmoid(tmp5)
    tmp7 = tmp4 * tmp6
    tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp7, None)


def get_args():
    arg_0 = rand_strided((128, 240, 28, 28), (188160, 784, 28, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 240, 1, 1), (240, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    arg_2 = rand_strided((128, 240, 28, 28), (188160, 784, 28, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1, arg_2,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused_mul_sigmoid_silu_66.run(*args, 24084480, grid=grid(24084480), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused_mul_sigmoid_silu_66.benchmark_all_configs(*args, 24084480, grid=grid(24084480))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/va/cvacncosevkh7a4e4mjytw5gvejhihlmvkwma42dwu2nfhypnbzr.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_53
#
# Pointwise copy of 13440 contiguous elements from in_ptr0 (declared '*fp32')
# to out_ptr0 (declared '*fp16').  xnumel is pinned to 13440 inside the kernel
# and every load/store is guarded by `xindex < xnumel`.  The benchmark harness
# uses (56, 240, 1, 1) contiguous tensors for both arguments.
triton_poi_fused__to_copy_67 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor

from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[16384], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_67(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 13440
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)


def get_args():
    arg_0 = rand_strided((56, 240, 1, 1), (240, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((56, 240, 1, 1), (240, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_67.run(*args, 13440, grid=grid(13440), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_67.benchmark_all_configs(*args, 13440, grid=grid(13440))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/hv/chvkvxc6xaoyzdfyhbbbolrujmg4wuzt7gkh6x4thkny6u2hr4gg.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => convert_element_type_54, var_mean_11
#
# Looped reduction producing 728 fp32 partial sums (xnumel = 728, rnumel = 7720,
# both pinned inside the kernel).  Each output index is decomposed as
#   x0 = xindex % 56      x1 = xindex // 56
# and sums the fp16 input values (widened to fp32) at flat position
#   p = r + 7720 * x1,   r in [0, 7720)
# read from in_ptr0 at 784*x0 + 43904*((p // 784) % 128) + (p % 784).
# Positions with p >= 100352 are excluded both from the load mask and via
# tl.where (13 * 7720 = 100360 slightly exceeds 100352).  The result for each
# xindex is written to out_ptr0[xindex].
# In the benchmark harness in_ptr0 is a contiguous (128, 56, 28, 28) tensor, so
# 100352 = 128 * 28 * 28 values per x0 are split across 13 chunks.
triton_red_fused__native_batch_norm_legit_functional_68 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor

from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@reduction(
    size_hints=[1024, 8192],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32', 3: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_68(in_ptr0, out_ptr0, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
    xnumel = 728
    rnumel = 7720
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rbase = tl.arange(0, RBLOCK)[None, :]
    x1 = (xindex // 56)
    x0 = xindex % 56
    _tmp6 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
    x3 = xindex
    for roffset in range(0, rnumel, RBLOCK):
        rindex = roffset + rbase
        rmask = rindex < rnumel
        r2 = rindex
        tmp0 = r2 + (7720*x1)
        tmp1 = 100352
        tmp2 = tmp0 < tmp1
        tmp3 = tl.load(in_ptr0 + ((784*x0) + (43904*(((r2 + (7720*x1)) // 784) % 128)) + ((r2 + (7720*x1)) % 784) + tl.zeros([XBLOCK, RBLOCK], tl.int32)), rmask & tmp2 & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp4 = tmp3.to(tl.float32)
        tmp5 = tl.where(tmp2, tmp4, 0)
        _tmp6 = tl.where(rmask & xmask, _tmp6 + tmp5, _tmp6)
    tmp6 = tl.sum(_tmp6, 1)[:, None]
    tl.store(out_ptr0 + x3, tmp6, xmask)


def get_args():
    arg_0 = rand_strided((128, 56, 28, 28), (43904, 784, 28, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((1, 56, 1, 1, 13), (728, 1, 728, 728, 56), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_red_fused__native_batch_norm_legit_functional_68.run(*args, 728, 7720, grid=grid(728), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_red_fused__native_batch_norm_legit_functional_68.benchmark_all_configs(*args, 728, 7720, grid=grid(728))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/ih/cihgiw3fxfncw6wmlie2fnq4egrnxl6aq2aj37mt4r5kpoemk2yw.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => add_59, convert_element_type_54, mul_82, mul_83, var_mean_11
#
# Persistent reduction over 56 rows of 13 fp32 values each (xnumel = 56,
# rnumel = 13 pinned inside the kernel; RBLOCK is the constexpr 16, with
# `rmask` discarding the 3 padding lanes).  For each x0:
#   s = sum over r of in_ptr0[x0 + 56 * r]
#   m = s / 100352.0
#   in_out_ptr0[x0] = m
#   out_ptr0[x0]    = m * 0.1 + in_ptr1[x0] * 0.9
# in_out_ptr0 is only written here, never read.
# NOTE(review): the 0.1 / 0.9 blend looks like a batch-norm running-mean update
# with momentum 0.1 and in_ptr1 as the previous running mean -- confirm against
# the calling graph.
triton_per_fused__native_batch_norm_legit_functional_69 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import persistent_reduction
from torch._inductor.utils import instance_descriptor

from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@persistent_reduction(
    size_hints=[64, 16],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*fp32', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: 'i32', 5: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3), equal_to_1=())]}
)
@triton.jit
def triton_per_fused__native_batch_norm_legit_functional_69(in_out_ptr0, in_ptr0, in_ptr1, out_ptr0, xnumel, rnumel, XBLOCK : tl.constexpr):
    xnumel = 56
    rnumel = 13
    RBLOCK: tl.constexpr = 16
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rindex = tl.arange(0, RBLOCK)[None, :]
    rmask = rindex < rnumel
    r1 = rindex
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0 + (56*r1)), rmask & xmask, other=0)
    tmp8 = tl.load(in_ptr1 + (x0), xmask)
    tmp2 = tl.where(rmask & xmask, tmp0, 0)
    tmp3 = tl.sum(tmp2, 1)[:, None]
    tmp4 = 100352.0
    tmp5 = tmp3 / tmp4
    tmp6 = 0.1
    tmp7 = tmp5 * tmp6
    tmp9 = 0.9
    tmp10 = tmp8 * tmp9
    tmp11 = tmp7 + tmp10
    tl.store(in_out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp5, xmask)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp11, xmask)


def get_args():
    arg_0 = rand_strided((1, 56, 1, 1), (56, 1, 56, 56), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((1, 56, 1, 1, 13), (728, 1, 728, 728, 56), device='cuda:0', dtype=torch.float32)
    arg_2 = rand_strided((56,), (1,), device='cuda:0', dtype=torch.float32)
    arg_3 = rand_strided((56,), (1,), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1, arg_2, arg_3,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_per_fused__native_batch_norm_legit_functional_69.run(*args, 56, 13, grid=grid(56), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_per_fused__native_batch_norm_legit_functional_69.benchmark_all_configs(*args, 56, 13, grid=grid(56))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/n3/cn3m3v5dwzlicmheht7t2sqzrzgtbufmioizlp6yemcka6kewuok.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => convert_element_type_54, var_mean_11
#
# Looped reduction producing 728 fp32 partial sums of squared differences
# (xnumel = 728, rnumel = 7720, both pinned inside the kernel).  With
#   x0 = xindex % 56      x1 = xindex // 56      p = r + 7720 * x1
# each output accumulates (v - in_ptr1[x0]) ** 2, where v is the fp16 value of
# in_ptr0 at 784*x0 + 43904*((p // 784) % 128) + (p % 784), widened to fp32.
# Positions with p >= 100352 are masked out of both loads and zeroed via
# tl.where before accumulation.  The result is written to out_ptr0[xindex].
# NOTE(review): in_ptr1 is presumably the per-x0 mean computed by an earlier
# kernel -- confirm against the calling graph.
triton_red_fused__native_batch_norm_legit_functional_70 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor

from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@reduction(
    size_hints=[1024, 8192],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*fp16', 1: '*fp32', 2: '*fp32', 3: 'i32', 4: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_70(in_ptr0, in_ptr1, out_ptr0, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
    xnumel = 728
    rnumel = 7720
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rbase = tl.arange(0, RBLOCK)[None, :]
    x1 = (xindex // 56)
    x0 = xindex % 56
    _tmp9 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
    x3 = xindex
    for roffset in range(0, rnumel, RBLOCK):
        rindex = roffset + rbase
        rmask = rindex < rnumel
        r2 = rindex
        tmp0 = r2 + (7720*x1)
        tmp1 = 100352
        tmp2 = tmp0 < tmp1
        tmp3 = tl.load(in_ptr0 + ((784*x0) + (43904*(((r2 + (7720*x1)) // 784) % 128)) + ((r2 + (7720*x1)) % 784) + tl.zeros([XBLOCK, RBLOCK], tl.int32)), rmask & tmp2 & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp4 = tmp3.to(tl.float32)
        tmp5 = tl.load(in_ptr1 + (x0 + tl.zeros([XBLOCK, RBLOCK], tl.int32)), rmask & tmp2 & xmask, eviction_policy='evict_last', other=0)
        tmp6 = tmp4 - tmp5
        tmp7 = tmp6 * tmp6
        tmp8 = tl.where(tmp2, tmp7, 0)
        _tmp9 = tl.where(rmask & xmask, _tmp9 + tmp8, _tmp9)
    tmp9 = tl.sum(_tmp9, 1)[:, None]
    tl.store(out_ptr0 + x3, tmp9, xmask)


def get_args():
    arg_0 = rand_strided((128, 56, 28, 28), (43904, 784, 28, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((1, 56, 1, 1), (56, 1, 56, 56), device='cuda:0', dtype=torch.float32)
    arg_2 = rand_strided((1, 56, 1, 1, 13), (728, 1, 728, 728, 56), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1, arg_2,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_red_fused__native_batch_norm_legit_functional_70.run(*args, 728, 7720, grid=grid(728), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_red_fused__native_batch_norm_legit_functional_70.benchmark_all_configs(*args, 728, 7720, grid=grid(728))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/xv/cxvq5i34k2bihobv4ebll2eevqva3mot7bow45ecoiyqxocbuesz.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => add_58, add_60, convert_element_type_54, mul_84, mul_85, mul_86, rsqrt_11, squeeze_34, var_mean_11
#
# Persistent reduction over 56 rows of 13 fp32 values each (xnumel = 56,
# rnumel = 13 pinned inside the kernel; RBLOCK is the constexpr 16, with
# `rmask` discarding the 3 padding lanes).  For each x0:
#   s = sum over r of in_ptr0[x0 + 56 * r]
#   v = s / 100352.0
#   out_ptr0[x0] = s                                    (raw sum, not v)
#   out_ptr1[x0] = rsqrt(v + 1e-05)
#   out_ptr2[x0] = (v * 1.00000996502277) * 0.1 + in_ptr1[x0] * 0.9
# NOTE(review): 1.00000996502277 is numerically 100352 / 100351, which looks
# like the biased-to-unbiased variance correction, and the 0.1 / 0.9 blend looks
# like a running-variance momentum update with in_ptr1 as the previous running
# variance -- confirm against the calling graph.
triton_per_fused__native_batch_norm_legit_functional_71 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import persistent_reduction
from torch._inductor.utils import instance_descriptor

from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@persistent_reduction(
    size_hints=[64, 16],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*fp32', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: 'i32', 6: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4), equal_to_1=())]}
)
@triton.jit
def triton_per_fused__native_batch_norm_legit_functional_71(in_ptr0, in_ptr1, out_ptr0, out_ptr1, out_ptr2, xnumel, rnumel, XBLOCK : tl.constexpr):
    xnumel = 56
    rnumel = 13
    RBLOCK: tl.constexpr = 16
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rindex = tl.arange(0, RBLOCK)[None, :]
    rmask = rindex < rnumel
    r1 = rindex
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0 + (56*r1)), rmask & xmask, other=0)
    tmp13 = tl.load(in_ptr1 + (x0), xmask)
    tmp2 = tl.where(rmask & xmask, tmp0, 0)
    tmp3 = tl.sum(tmp2, 1)[:, None]
    tmp4 = 100352.0
    tmp5 = tmp3 / tmp4
    tmp6 = 1e-05
    tmp7 = tmp5 + tmp6
    tmp8 = tl.math.rsqrt(tmp7)
    tmp9 = 1.00000996502277
    tmp10 = tmp5 * tmp9
    tmp11 = 0.1
    tmp12 = tmp10 * tmp11
    tmp14 = 0.9
    tmp15 = tmp13 * tmp14
    tmp16 = tmp12 + tmp15
    tl.store(out_ptr1 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp8, xmask)
    tl.store(out_ptr2 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp16, xmask)
    tl.store(out_ptr0 + x0, tmp3, xmask)


def get_args():
    arg_0 = rand_strided((1, 56, 1, 1, 13), (728, 1, 728, 728, 56), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((56,), (1,), device='cuda:0', dtype=torch.float32)
    arg_2 = rand_strided((1, 56, 1, 1), (56, 1, 56, 56), device='cuda:0', dtype=torch.float32)
    arg_3 = rand_strided((56,), (1,), device='cuda:0', dtype=torch.float32)
    arg_4 = rand_strided((56,), (1,), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1, arg_2, arg_3, arg_4,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_per_fused__native_batch_norm_legit_functional_71.run(*args, 56, 13, grid=grid(56), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_per_fused__native_batch_norm_legit_functional_71.benchmark_all_configs(*args, 56, 13, grid=grid(56))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/jw/cjwfycxx2mrgx2fsw6tfls7yyigmo7hmavgpkr2ob23aopbzsnjc.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => add_58, add_61, convert_element_type_54, convert_element_type_55, mul_81, mul_87, rsqrt_11, sub_11, var_mean_11
#
# Pointwise kernel over 5619712 elements.  With c = (x // 784) % 56:
#   out_ptr0[x] = ((in_ptr0[x] - in_ptr1[c])
#                  * rsqrt(in_ptr2[c] / 100352.0 + 1e-05))
#                 * in_ptr3[c] + in_ptr4[c]
# in_ptr0 is loaded as fp16 and widened to fp32; out_ptr0 is declared '*fp16'.
# All loads and the store pass `None` as the mask (unmasked); `xmask` is
# computed but unused.  In the benchmark harness in_ptr0 / out_ptr0 are
# contiguous (128, 56, 28, 28) tensors, so c indexes the second dimension.
# NOTE(review): in_ptr1 / in_ptr2 are presumably the per-channel mean and sum of
# squared deviations from earlier kernels, and in_ptr3 / in_ptr4 the affine
# weight and bias -- confirm against the calling graph.
triton_poi_fused__native_batch_norm_legit_functional_72 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor

from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[8388608], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp16', 6: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6), equal_to_1=())]})
@triton.jit
def triton_poi_fused__native_batch_norm_legit_functional_72(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 5619712
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x3 = xindex
    x1 = (xindex // 784) % 56
    tmp0 = tl.load(in_ptr0 + (x3), None).to(tl.float32)
    tmp2 = tl.load(in_ptr1 + (x1), None)
    tmp4 = tl.load(in_ptr2 + (x1), None)
    tmp11 = tl.load(in_ptr3 + (x1), None)
    tmp13 = tl.load(in_ptr4 + (x1), None)
    tmp1 = tmp0.to(tl.float32)
    tmp3 = tmp1 - tmp2
    tmp5 = 100352.0
    tmp6 = tmp4 / tmp5
    tmp7 = 1e-05
    tmp8 = tmp6 + tmp7
    tmp9 = tl.math.rsqrt(tmp8)
    tmp10 = tmp3 * tmp9
    tmp12 = tmp10 * tmp11
    tmp14 = tmp12 + tmp13
    tmp15 = tmp14.to(tl.float32)
    tl.store(out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp15, None)


def get_args():
    arg_0 = rand_strided((128, 56, 28, 28), (43904, 784, 28, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((1, 56, 1, 1), (56, 1, 56, 56), device='cuda:0', dtype=torch.float32)
    arg_2 = rand_strided((1, 56, 1, 1), (56, 1, 56, 56), device='cuda:0', dtype=torch.float32)
    arg_3 = rand_strided((56,), (1,), device='cuda:0', dtype=torch.float32)
    arg_4 = rand_strided((56,), (1,), device='cuda:0', dtype=torch.float32)
    arg_5 = rand_strided((128, 56, 28, 28), (43904, 784, 28, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__native_batch_norm_legit_functional_72.run(*args, 5619712, grid=grid(5619712), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__native_batch_norm_legit_functional_72.benchmark_all_configs(*args, 5619712, grid=grid(5619712))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/7e/c7eyndtglwpd6eg2g74cubgx2m45rn7sikd7mb4t4ucitntos24j.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_56
#
# Masked element-wise copy of 4704 values from in_ptr0 to out_ptr0 at the same
# flat index. The signature declares in_ptr0 as *fp32 and out_ptr0 as *fp16
# (get_args uses (168, 28, 1, 1) tensors of those dtypes).
# NOTE(review): the in-kernel `.to(tl.float32)` leaves the value as fp32, so the
# narrowing to fp16 presumably happens implicitly inside tl.store -- confirm
# against the Triton tl.store documentation.
# The get_args / call / benchmark_all_configs / __main__ tail of the string is a
# standalone benchmark harness that prints elapsed ms, GB moved and GB/s.
triton_poi_fused__to_copy_73 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[8192], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_73(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 4704
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
    arg_0 = rand_strided((168, 28, 1, 1), (28, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((168, 28, 1, 1), (28, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_73.run(*args, 4704, grid=grid(4704), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_73.benchmark_all_configs(*args, 4704, grid=grid(4704))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/g7/cg7zj3gxcfnefxp4hvh24f2ahem4qdf4cwmaspxgokny23kr7rdc.py
# Original ATen: aten.cat
# aten.cat => cat_6
#
# Strided copy of 16859136 fp16 elements. The flat input index is split into
# x0 = xindex % 131712 and x1 = xindex // 131712, and the value is written to
# out_ptr0[x0 + 263424 * x1]: the input is read contiguously while the output
# advances by 263424 elements for every 131712 copied. In get_args the input is
# a (128, 168, 28, 28) tensor with batch stride 131712 and the output has the
# same shape with batch stride 263424.
# NOTE(review): presumably out_ptr0 is a view into one half of a wider
# concatenation buffer -- confirm against the call site.
# Loads and stores pass None as the mask; xmask is computed but not used.
# The get_args / call / benchmark_all_configs / __main__ tail of the string is a
# standalone benchmark harness that prints elapsed ms, GB moved and GB/s.
triton_poi_fused_cat_74 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[33554432], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_cat_74(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 16859136
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x2 = xindex
    x0 = xindex % 131712
    x1 = (xindex // 131712)
    tmp0 = tl.load(in_ptr0 + (x2), None).to(tl.float32)
    tl.store(out_ptr0 + (x0 + (263424*x1) + tl.zeros([XBLOCK], tl.int32)), tmp0, None)
def get_args():
    arg_0 = rand_strided((128, 168, 28, 28), (131712, 784, 28, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 168, 28, 28), (263424, 784, 28, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused_cat_74.run(*args, 16859136, grid=grid(16859136), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused_cat_74.benchmark_all_configs(*args, 16859136, grid=grid(16859136))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/ea/cea2p3bterhsxcizlwjuqgqdcdagptpjutjfoisqqtcxx4bo46r2.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => add_63, add_64, add_65, convert_element_type_58, mul_89, mul_90, mul_91, mul_92, mul_93, rsqrt_12, squeeze_37, var_mean_12
#
# Two-pass looped reduction: 336 output rows (x), 100352 reduced elements (r)
# per row. Reduction element r of row x0 is read from
# in_ptr0[(r % 784) + 784 * x0 + 263424 * (r // 784)], loaded as fp16 and
# widened to fp32.
#   Pass 1: mean = sum(v) / 100352
#           in_out_ptr0[x0] = mean
#           out_ptr0[x0]    = mean * 0.1 + in_ptr1[x0] * 0.9
#   Pass 2: m2 = sum((v - mean)^2)
#           out_ptr1[x0] = m2
#           out_ptr2[x0] = rsqrt(m2 / 100352 + 1e-05)
#           out_ptr3[x0] = (m2 / 100352) * 1.00000996502277 * 0.1 + in_ptr2[x0] * 0.9
# 1.00000996502277 is numerically 100352 / 100351.
# NOTE(review): presumably in_ptr1 / in_ptr2 are the running mean / running
# variance, 0.1 the momentum and the 100352 / 100351 factor the unbiased-variance
# correction -- confirm against the batch-norm call site.
# The get_args / call / benchmark_all_configs / __main__ tail of the string is a
# standalone benchmark harness; it passes num_in_out_args=1 to get_num_bytes.
triton_red_fused__native_batch_norm_legit_functional_75 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@reduction(
    size_hints=[512, 131072],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*fp32', 1: '*fp16', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: 'i32', 9: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_75(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, out_ptr0, out_ptr1, out_ptr2, out_ptr3, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
    xnumel = 336
    rnumel = 100352
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rbase = tl.arange(0, RBLOCK)[None, :]
    x0 = xindex
    _tmp2 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
    for roffset in range(0, rnumel, RBLOCK):
        rindex = roffset + rbase
        rmask = rindex < rnumel
        r1 = rindex % 784
        r2 = (rindex // 784)
        tmp0 = tl.load(in_ptr0 + (r1 + (784*x0) + (263424*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp1 = tmp0.to(tl.float32)
        _tmp2 = tl.where(rmask & xmask, _tmp2 + tmp1, _tmp2)
    tmp2 = tl.sum(_tmp2, 1)[:, None]
    tmp7 = tl.load(in_ptr1 + (x0), xmask)
    tmp3 = 100352.0
    tmp4 = tmp2 / tmp3
    tmp5 = 0.1
    tmp6 = tmp4 * tmp5
    tmp8 = 0.9
    tmp9 = tmp7 * tmp8
    tmp10 = tmp6 + tmp9
    tl.store(in_out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp4, xmask)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp10, xmask)
    _tmp15 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
    for roffset in range(0, rnumel, RBLOCK):
        rindex = roffset + rbase
        rmask = rindex < rnumel
        r1 = rindex % 784
        r2 = (rindex // 784)
        tmp11 = tl.load(in_ptr0 + (r1 + (784*x0) + (263424*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp12 = tmp11.to(tl.float32)
        tmp13 = tmp12 - tmp4
        tmp14 = tmp13 * tmp13
        _tmp15 = tl.where(rmask & xmask, _tmp15 + tmp14, _tmp15)
    tmp15 = tl.sum(_tmp15, 1)[:, None]
    tl.store(out_ptr1 + x0, tmp15, xmask)
    tmp25 = tl.load(in_ptr2 + (x0), xmask)
    tmp16 = 100352.0
    tmp17 = tmp15 / tmp16
    tmp18 = 1e-05
    tmp19 = tmp17 + tmp18
    tmp20 = tl.math.rsqrt(tmp19)
    tmp21 = 1.00000996502277
    tmp22 = tmp17 * tmp21
    tmp23 = 0.1
    tmp24 = tmp22 * tmp23
    tmp26 = 0.9
    tmp27 = tmp25 * tmp26
    tmp28 = tmp24 + tmp27
    tl.store(out_ptr2 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp20, xmask)
    tl.store(out_ptr3 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp28, xmask)
def get_args():
    arg_0 = rand_strided((1, 336, 1, 1), (336, 1, 336, 336), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((128, 336, 28, 28), (263424, 784, 28, 1), device='cuda:0', dtype=torch.float16)
    arg_2 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32)
    arg_3 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32)
    arg_4 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32)
    arg_5 = rand_strided((1, 336, 1, 1), (336, 1, 336, 336), device='cuda:0', dtype=torch.float32)
    arg_6 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32)
    arg_7 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_red_fused__native_batch_norm_legit_functional_75.run(*args, 336, 100352, grid=grid(336), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_red_fused__native_batch_norm_legit_functional_75.benchmark_all_configs(*args, 336, 100352, grid=grid(336))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/3o/c3odnwpkqtnjm6osw5x7eu5soiuoiv27n2m7cqobgwum2f4c5rgk.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.add, aten.clone, aten.fill, aten.mul, aten.sigmoid, aten.sub
# aten._native_batch_norm_legit_functional => add_63, add_66, convert_element_type_58, convert_element_type_59, mul_88, mul_94, rsqrt_12, sub_12, var_mean_12
# aten.add => add_374
# aten.clone => clone_3
# aten.fill => full_like_44
# aten.mul => mul_1067, mul_1068
# aten.sigmoid => sigmoid_108
# aten.sub => sub_297
#
# Pointwise kernel over 33718272 elements (a 128 x 336 x 28 x 28 tensor in
# get_args). With channel index c = (xindex // 784) % 336 it computes
#     y = (x - in_ptr1[c]) * rsqrt(in_ptr2[c] / 100352 + 1e-05) * in_ptr3[c] + in_ptr4[c]
#     s = sigmoid(y)
# and stores
#     out_ptr0 = y
#     out_ptr1 = s * (y * (1 - s) + 1)
# The out_ptr1 expression is algebraically the derivative of y * sigmoid(y)
# with respect to y.
# NOTE(review): presumably out_ptr1 is saved for the SiLU backward pass and
# in_ptr1..in_ptr4 are batch mean, summed squared deviations, weight and bias --
# confirm against the call site.
# Loads and stores pass None as the mask; xmask is computed but not used.
# The get_args / call / benchmark_all_configs / __main__ tail of the string is a
# standalone benchmark harness that prints elapsed ms, GB moved and GB/s.
triton_poi_fused__native_batch_norm_legit_functional_add_clone_fill_mul_sigmoid_sub_76 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[67108864], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp16', 6: '*fp16', 7: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7), equal_to_1=())]})
@triton.jit
def triton_poi_fused__native_batch_norm_legit_functional_add_clone_fill_mul_sigmoid_sub_76(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, out_ptr0, out_ptr1, xnumel, XBLOCK : tl.constexpr):
    xnumel = 33718272
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x3 = xindex
    x1 = (xindex // 784) % 336
    tmp0 = tl.load(in_ptr0 + (x3), None).to(tl.float32)
    tmp2 = tl.load(in_ptr1 + (x1), None)
    tmp4 = tl.load(in_ptr2 + (x1), None)
    tmp11 = tl.load(in_ptr3 + (x1), None)
    tmp13 = tl.load(in_ptr4 + (x1), None)
    tmp1 = tmp0.to(tl.float32)
    tmp3 = tmp1 - tmp2
    tmp5 = 100352.0
    tmp6 = tmp4 / tmp5
    tmp7 = 1e-05
    tmp8 = tmp6 + tmp7
    tmp9 = tl.math.rsqrt(tmp8)
    tmp10 = tmp3 * tmp9
    tmp12 = tmp10 * tmp11
    tmp14 = tmp12 + tmp13
    tmp15 = tmp14.to(tl.float32)
    tmp16 = tl.sigmoid(tmp15)
    tmp17 = 1.0
    tmp18 = tmp17 - tmp16
    tmp19 = tmp15 * tmp18
    tmp20 = tmp19 + tmp17
    tmp21 = tmp16 * tmp20
    tl.store(out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp15, None)
    tl.store(out_ptr1 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp21, None)
def get_args():
    arg_0 = rand_strided((128, 336, 28, 28), (263424, 784, 28, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((1, 336, 1, 1), (336, 1, 336, 336), device='cuda:0', dtype=torch.float32)
    arg_2 = rand_strided((1, 336, 1, 1), (336, 1, 336, 336), device='cuda:0', dtype=torch.float32)
    arg_3 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32)
    arg_4 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32)
    arg_5 = rand_strided((128, 336, 28, 28), (263424, 784, 28, 1), device='cuda:0', dtype=torch.float16)
    arg_6 = rand_strided((128, 336, 28, 28), (263424, 784, 28, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__native_batch_norm_legit_functional_add_clone_fill_mul_sigmoid_sub_76.run(*args, 33718272, grid=grid(33718272), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__native_batch_norm_legit_functional_add_clone_fill_mul_sigmoid_sub_76.benchmark_all_configs(*args, 33718272, grid=grid(33718272))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/nw/cnwbbjysyivhwwej3cvtzavdj6jvinxuti5ndhcg3qzuf77glcrt.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_62
#
# Masked element-wise copy of 1512 values from in_ptr0 to out_ptr0 at the same
# flat index. The signature declares in_ptr0 as *fp32 and out_ptr0 as *fp16
# (get_args uses (168, 1, 3, 3) tensors of those dtypes).
# divisible_by_16 lists only the two pointers; the element count 1512 is not a
# multiple of 16.
# NOTE(review): the in-kernel `.to(tl.float32)` leaves the value as fp32, so the
# narrowing to fp16 presumably happens implicitly inside tl.store -- confirm
# against the Triton tl.store documentation.
# The get_args / call / benchmark_all_configs / __main__ tail of the string is a
# standalone benchmark harness that prints elapsed ms, GB moved and GB/s.
triton_poi_fused__to_copy_77 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[2048], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_77(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 1512
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
    arg_0 = rand_strided((168, 1, 3, 3), (9, 9, 3, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((168, 1, 3, 3), (9, 9, 3, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_77.run(*args, 1512, grid=grid(1512), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_77.benchmark_all_configs(*args, 1512, grid=grid(1512))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/fz/cfzmnx3o626lf5s63xg32x6is4tmygw4nxdhdigpofuif52vxrzm.py
# Original ATen: aten.split_with_sizes
# aten.split_with_sizes => getitem_78
#
# Pointwise kernel over 16859136 elements. With x0 = xindex % 131712 and
# x1 = xindex // 131712 it reads in_ptr0[x0 + 263424 * x1] (the first 131712
# elements of every 263424-element chunk), computes v * sigmoid(v) in fp32 and
# writes the result contiguously to out_ptr0[xindex].
# Although the kernel is named after split_with_sizes only, the x * sigmoid(x)
# activation is applied here as well.
# In get_args the input is (128, 336, 28, 28) and the output (128, 168, 28, 28),
# both fp16.
# Loads and stores pass None as the mask; xmask is computed but not used.
# The get_args / call / benchmark_all_configs / __main__ tail of the string is a
# standalone benchmark harness that prints elapsed ms, GB moved and GB/s.
triton_poi_fused_split_with_sizes_78 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[33554432], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_split_with_sizes_78(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 16859136
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex % 131712
    x1 = (xindex // 131712)
    x2 = xindex
    tmp0 = tl.load(in_ptr0 + (x0 + (263424*x1)), None).to(tl.float32)
    tmp1 = tmp0.to(tl.float32)
    tmp2 = tl.sigmoid(tmp1)
    tmp3 = tmp1 * tmp2
    tmp4 = tmp3.to(tl.float32)
    tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp4, None)
def get_args():
    arg_0 = rand_strided((128, 336, 28, 28), (263424, 784, 28, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 168, 28, 28), (131712, 784, 28, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused_split_with_sizes_78.run(*args, 16859136, grid=grid(16859136), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused_split_with_sizes_78.benchmark_all_configs(*args, 16859136, grid=grid(16859136))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/2r/c2rv7mzu7my2opxvcm4z3utkh7llovacnk6pwuzns4ehc3sgey2e.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_63
#
# Masked element-wise copy of 4200 values from in_ptr0 to out_ptr0 at the same
# flat index. The signature declares in_ptr0 as *fp32 and out_ptr0 as *fp16
# (get_args uses (168, 1, 5, 5) tensors of those dtypes).
# divisible_by_16 lists only the two pointers; the element count 4200 is not a
# multiple of 16.
# NOTE(review): the in-kernel `.to(tl.float32)` leaves the value as fp32, so the
# narrowing to fp16 presumably happens implicitly inside tl.store -- confirm
# against the Triton tl.store documentation.
# The get_args / call / benchmark_all_configs / __main__ tail of the string is a
# standalone benchmark harness that prints elapsed ms, GB moved and GB/s.
triton_poi_fused__to_copy_79 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[8192], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_79(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 4200
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
    arg_0 = rand_strided((168, 1, 5, 5), (25, 25, 5, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((168, 1, 5, 5), (25, 25, 5, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_79.run(*args, 4200, grid=grid(4200), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_79.benchmark_all_configs(*args, 4200, grid=grid(4200))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/pt/cptqmg7k6n2us7i5wywz2bwg6fjwqyaknishdkved5rkdpml5nw5.py
# Original ATen: aten.split_with_sizes
# aten.split_with_sizes => getitem_81
#
# Pointwise kernel over 16859136 elements. With x0 = xindex % 131712 and
# x1 = xindex // 131712 it reads in_ptr0[131712 + x0 + 263424 * x1] (the second
# 131712 elements of every 263424-element chunk), computes v * sigmoid(v) in
# fp32 and writes the result contiguously to out_ptr0[xindex].
# Although the kernel is named after split_with_sizes only, the x * sigmoid(x)
# activation is applied here as well.
# In get_args the input is (128, 336, 28, 28) and the output (128, 168, 28, 28),
# both fp16.
# Loads and stores pass None as the mask; xmask is computed but not used.
# The get_args / call / benchmark_all_configs / __main__ tail of the string is a
# standalone benchmark harness that prints elapsed ms, GB moved and GB/s.
triton_poi_fused_split_with_sizes_80 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[33554432], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_split_with_sizes_80(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 16859136
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex % 131712
    x1 = (xindex // 131712)
    x2 = xindex
    tmp0 = tl.load(in_ptr0 + (131712 + x0 + (263424*x1)), None).to(tl.float32)
    tmp1 = tmp0.to(tl.float32)
    tmp2 = tl.sigmoid(tmp1)
    tmp3 = tmp1 * tmp2
    tmp4 = tmp3.to(tl.float32)
    tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp4, None)
def get_args():
    arg_0 = rand_strided((128, 336, 28, 28), (263424, 784, 28, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 168, 28, 28), (131712, 784, 28, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused_split_with_sizes_80.run(*args, 16859136, grid=grid(16859136), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused_split_with_sizes_80.benchmark_all_configs(*args, 16859136, grid=grid(16859136))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/jj/cjj6wuzjfwt7xigucunb4hdetjt4mr4ldxzq5ab4kxddbetbjdu4.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.mean, aten.silu
# aten._native_batch_norm_legit_functional => add_68, add_71, convert_element_type_64, convert_element_type_65, mul_102, mul_96, rsqrt_13, sub_13, var_mean_13
# aten.mean => mean_1
# aten.silu => convert_element_type_66, convert_element_type_67, mul_103, sigmoid_5
#
# Persistent (single-pass, no r-loop) reduction: 43008 rows (x) of 784 elements
# (r), with RBLOCK fixed at 1024 and rmask hiding the 240 padding lanes.
# For row x3 with channel index c = xindex % 336 and element r2 it computes
#     y = (in_ptr0[r2 + 784 * x3] - in_ptr1[c]) * rsqrt(in_ptr2[c] / 100352 + 1e-05) * in_ptr3[c] + in_ptr4[c]
# and stores
#     out_ptr0[r2 + 784 * x3] = y                               (masked by rmask)
#     out_ptr2[x3]            = sum_r(y * sigmoid(y)) / 784     (unmasked)
# In get_args out_ptr0 is (128, 336, 28, 28) and out_ptr2 is (128, 336, 1, 1),
# both fp16.
# NOTE(review): presumably in_ptr1..in_ptr4 are batch mean, summed squared
# deviations, weight and bias -- confirm against the call site.
# The per-channel loads and the out_ptr2 store pass None as the mask; xmask is
# computed but not used.
# The get_args / call / benchmark_all_configs / __main__ tail of the string is a
# standalone benchmark harness that prints elapsed ms, GB moved and GB/s.
triton_per_fused__native_batch_norm_legit_functional_mean_silu_81 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import persistent_reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@persistent_reduction(
    size_hints=[65536, 1024],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*fp16', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp16', 6: '*fp16', 7: 'i32', 8: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8), equal_to_1=())]}
)
@triton.jit
def triton_per_fused__native_batch_norm_legit_functional_mean_silu_81(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, out_ptr0, out_ptr2, xnumel, rnumel, XBLOCK : tl.constexpr):
    xnumel = 43008
    rnumel = 784
    RBLOCK: tl.constexpr = 1024
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rindex = tl.arange(0, RBLOCK)[None, :]
    rmask = rindex < rnumel
    r2 = rindex
    x3 = xindex
    x0 = xindex % 336
    tmp0 = tl.load(in_ptr0 + (r2 + (784*x3)), rmask, other=0).to(tl.float32)
    tmp2 = tl.load(in_ptr1 + (x0), None)
    tmp4 = tl.load(in_ptr2 + (x0), None)
    tmp11 = tl.load(in_ptr3 + (x0), None)
    tmp13 = tl.load(in_ptr4 + (x0), None)
    tmp1 = tmp0.to(tl.float32)
    tmp3 = tmp1 - tmp2
    tmp5 = 100352.0
    tmp6 = tmp4 / tmp5
    tmp7 = 1e-05
    tmp8 = tmp6 + tmp7
    tmp9 = tl.math.rsqrt(tmp8)
    tmp10 = tmp3 * tmp9
    tmp12 = tmp10 * tmp11
    tmp14 = tmp12 + tmp13
    tmp15 = tmp14.to(tl.float32)
    tmp16 = tmp15.to(tl.float32)
    tmp17 = tl.sigmoid(tmp16)
    tmp18 = tmp16 * tmp17
    tmp19 = tmp18.to(tl.float32)
    tmp20 = tmp19.to(tl.float32)
    tmp22 = tl.where(rmask, tmp20, 0)
    tmp23 = tl.sum(tmp22, 1)[:, None]
    tmp24 = 784.0
    tmp25 = tmp23 / tmp24
    tmp26 = tmp25.to(tl.float32)
    tl.store(out_ptr0 + (r2 + (784*x3) + tl.zeros([XBLOCK, RBLOCK], tl.int32)), tmp15, rmask)
    tl.store(out_ptr2 + (x3 + tl.zeros([XBLOCK, 1], tl.int32)), tmp26, None)
def get_args():
    arg_0 = rand_strided((128, 336, 28, 28), (263424, 784, 28, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((1, 336, 1, 1), (336, 1, 336, 336), device='cuda:0', dtype=torch.float32)
    arg_2 = rand_strided((1, 336, 1, 1), (336, 1, 336, 336), device='cuda:0', dtype=torch.float32)
    arg_3 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32)
    arg_4 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32)
    arg_5 = rand_strided((128, 336, 28, 28), (263424, 784, 28, 1), device='cuda:0', dtype=torch.float16)
    arg_6 = rand_strided((128, 336, 1, 1), (336, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_per_fused__native_batch_norm_legit_functional_mean_silu_81.run(*args, 43008, 784, grid=grid(43008), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_per_fused__native_batch_norm_legit_functional_mean_silu_81.benchmark_all_configs(*args, 43008, 784, grid=grid(43008))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/qi/cqigsniw4k2557tsg6szy4sxc76yhclqkbczo3sje6z3gkbkxim2.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_69
#
# Masked element-wise copy of 9408 values from in_ptr0 to out_ptr0 at the same
# flat index. The signature declares in_ptr0 as *fp32 and out_ptr0 as *fp16
# (get_args uses (28, 336, 1, 1) tensors of those dtypes).
# NOTE(review): the in-kernel `.to(tl.float32)` leaves the value as fp32, so the
# narrowing to fp16 presumably happens implicitly inside tl.store -- confirm
# against the Triton tl.store documentation.
# The get_args / call / benchmark_all_configs / __main__ tail of the string is a
# standalone benchmark harness that prints elapsed ms, GB moved and GB/s.
triton_poi_fused__to_copy_82 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[16384], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_82(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 9408
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
    arg_0 = rand_strided((28, 336, 1, 1), (336, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((28, 336, 1, 1), (336, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_82.run(*args, 9408, grid=grid(9408), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_82.benchmark_all_configs(*args, 9408, grid=grid(9408))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/qr/cqrzuzcrjaui22653nognmjx7khvfhrb54plybvtrgjuggwhd7ca.py
# Original ATen: aten._to_copy, aten.convolution
# aten._to_copy => convert_element_type_68
# aten.convolution => convolution_27
#
# Pointwise kernel over 28 elements: loads from in_ptr0 (declared '*fp32') and
# stores to out_ptr0 (declared '*fp16'). Both loads and stores are masked with
# xindex < 28. The embedded source also carries a standalone benchmark harness.
triton_poi_fused__to_copy_convolution_83 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[32], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_convolution_83(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 28
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)


def get_args():
    arg_0 = rand_strided((28,), (1,), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((28,), (1,), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_convolution_83.run(*args, 28, grid=grid(28), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_convolution_83.benchmark_all_configs(*args, 28, grid=grid(28))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/5j/c5jenet4bcjhmp6nmplvispvr7rwhrt4fonprb4atbj2oqpb6yhc.py
# Original ATen: aten._to_copy, aten.convolution, aten.silu
# aten._to_copy => convert_element_type_68
# aten.convolution => convolution_27
# aten.silu => convert_element_type_70, convert_element_type_71, mul_104, sigmoid_6
#
# Pointwise kernel over 3584 elements. For each element it adds
# in_ptr0[xindex % 28] to in_out_ptr0[xindex] (NOTE(review): in_ptr0 is
# presumably the convolution bias - confirm against the caller), writes that
# sum back to in_out_ptr0 in place, and writes sum * sigmoid(sum) to out_ptr0.
# All three pointers are declared '*fp16'; in_out_ptr0 is listed in
# 'mutated_arg_names'. The embedded source also carries a standalone
# benchmark harness.
triton_poi_fused__to_copy_convolution_silu_84 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[4096], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_convolution_silu_84(in_out_ptr0, in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 3584
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x2 = xindex
    x0 = xindex % 28
    tmp0 = tl.load(in_out_ptr0 + (x2), xmask).to(tl.float32)
    tmp1 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
    tmp2 = tmp0 + tmp1
    tmp3 = tmp2.to(tl.float32)
    tmp4 = tl.sigmoid(tmp3)
    tmp5 = tmp3 * tmp4
    tmp6 = tmp5.to(tl.float32)
    tl.store(in_out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp2, xmask)
    tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp6, xmask)


def get_args():
    arg_0 = rand_strided((128, 28, 1, 1), (28, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((28,), (1,), device='cuda:0', dtype=torch.float16)
    arg_2 = rand_strided((128, 28, 1, 1), (28, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1, arg_2,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_convolution_silu_84.run(*args, 3584, grid=grid(3584), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_convolution_silu_84.benchmark_all_configs(*args, 3584, grid=grid(3584))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/6q/c6qndeyorjgk72ipb7s7xexoyu3w64lakoglh4m45doekpaj4mk5.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_73
#
# Pointwise kernel over 9408 elements: loads from in_ptr0 (declared '*fp32')
# and stores to out_ptr0 (declared '*fp16'). The benchmark harness in the
# embedded source builds both buffers with shape (336, 28, 1, 1).
triton_poi_fused__to_copy_85 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[16384], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_85(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 9408
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)


def get_args():
    arg_0 = rand_strided((336, 28, 1, 1), (28, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((336, 28, 1, 1), (28, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_85.run(*args, 9408, grid=grid(9408), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_85.benchmark_all_configs(*args, 9408, grid=grid(9408))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/e4/ce4gppgjp566e4o7bpfsqwx6ehxfdqd5kcqjzpnjkl7lictvy34j.py
# Original ATen: aten._to_copy, aten.convolution
# aten._to_copy => convert_element_type_72
# aten.convolution => convolution_28
#
# Pointwise kernel over 336 elements: loads from in_ptr0 (declared '*fp32')
# and stores to out_ptr0 (declared '*fp16'), masked with xindex < 336. The
# embedded source also carries a standalone benchmark harness.
triton_poi_fused__to_copy_convolution_86 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[512], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_convolution_86(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 336
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)


def get_args():
    arg_0 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_convolution_86.run(*args, 336, grid=grid(336), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_convolution_86.benchmark_all_configs(*args, 336, grid=grid(336))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/6n/c6nqtrrohcuy7rc3a6jzkibcrswmbwqy32tjb2t44xhm5lv3c7ca.py
# Original ATen: aten._to_copy, aten.convolution
# aten._to_copy => convert_element_type_72
# aten.convolution => convolution_28
#
# Pointwise in-place kernel over 43008 elements: adds in_ptr0[xindex % 336]
# to in_out_ptr0[xindex] and stores the sum back to in_out_ptr0 (NOTE(review):
# in_ptr0 is presumably the convolution bias - confirm against the caller).
# xmask is computed but the loads and the store pass None as the mask.
# The embedded source also carries a standalone benchmark harness.
triton_poi_fused__to_copy_convolution_87 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[65536], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_convolution_87(in_out_ptr0, in_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 43008
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x2 = xindex
    x0 = xindex % 336
    tmp0 = tl.load(in_out_ptr0 + (x2), None).to(tl.float32)
    tmp1 = tl.load(in_ptr0 + (x0), None).to(tl.float32)
    tmp2 = tmp0 + tmp1
    tl.store(in_out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp2, None)


def get_args():
    arg_0 = rand_strided((128, 336, 1, 1), (336, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_convolution_87.run(*args, 43008, grid=grid(43008), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_convolution_87.benchmark_all_configs(*args, 43008, grid=grid(43008))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/tg/ctg6frnekhgugkbeugokpbetskj6nj7dyov4tmhfv6akngrmr3zz.py
# Original ATen: aten.mul, aten.sigmoid, aten.silu
# aten.mul => mul_105
# aten.sigmoid => sigmoid_7
# aten.silu => convert_element_type_66, convert_element_type_67, mul_103, sigmoid_5
#
# Pointwise kernel over 33718272 elements. For each element it computes
# a * sigmoid(a) with a = in_ptr0[xindex], multiplies that by
# sigmoid(in_ptr1[xindex // 784]) and stores the product to out_ptr0[xindex].
# The benchmark harness in the embedded source uses a (128, 336, 28, 28) input
# and a (128, 336, 1, 1) second operand, so 784 matches the 28*28 plane size.
# Loads and the store pass None as the mask.
triton_poi_fused_mul_sigmoid_silu_88 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[67108864], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3), equal_to_1=())]})
@triton.jit
def triton_poi_fused_mul_sigmoid_silu_88(in_ptr0, in_ptr1, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 33718272
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x2 = xindex
    x1 = (xindex // 784)
    tmp0 = tl.load(in_ptr0 + (x2), None).to(tl.float32)
    tmp5 = tl.load(in_ptr1 + (x1), None).to(tl.float32)
    tmp1 = tmp0.to(tl.float32)
    tmp2 = tl.sigmoid(tmp1)
    tmp3 = tmp1 * tmp2
    tmp4 = tmp3.to(tl.float32)
    tmp6 = tl.sigmoid(tmp5)
    tmp7 = tmp4 * tmp6
    tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp7, None)


def get_args():
    arg_0 = rand_strided((128, 336, 28, 28), (263424, 784, 28, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 336, 1, 1), (336, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    arg_2 = rand_strided((128, 336, 28, 28), (263424, 784, 28, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1, arg_2,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused_mul_sigmoid_silu_88.run(*args, 33718272, grid=grid(33718272), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused_mul_sigmoid_silu_88.benchmark_all_configs(*args, 33718272, grid=grid(33718272))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/uj/cujd4wpttqmud7zdg64vlcs4ef2kzahlsvrue6dsbdgqyeobae26.py
# Original ATen: aten.split_with_sizes
# aten.split_with_sizes => getitem_84
#
# Pointwise gather-copy over 16859136 elements: out_ptr0[xindex] receives
# in_ptr0[(xindex % 131712) + 263424 * (xindex // 131712)]. With the shapes
# used by the benchmark harness in the embedded source - input
# (128, 336, 28, 28), output (128, 168, 28, 28) - 131712 is 168*28*28 and
# 263424 is the input batch stride, so each batch's first 131712 input
# elements are copied into a contiguous output.
triton_poi_fused_split_with_sizes_89 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[33554432], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_split_with_sizes_89(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 16859136
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex % 131712
    x1 = (xindex // 131712)
    x2 = xindex
    tmp0 = tl.load(in_ptr0 + (x0 + (263424*x1)), None).to(tl.float32)
    tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp0, None)


def get_args():
    arg_0 = rand_strided((128, 336, 28, 28), (263424, 784, 28, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 168, 28, 28), (131712, 784, 28, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused_split_with_sizes_89.run(*args, 16859136, grid=grid(16859136), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused_split_with_sizes_89.benchmark_all_configs(*args, 16859136, grid=grid(16859136))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/f3/cf3fhfcwg6xxca6ynhdijefpawpp34qudotfwxc7l4ytcq3zjpuo.py
# Original ATen: aten.split_with_sizes
# aten.split_with_sizes => getitem_85
#
# Pointwise gather-copy over 16859136 elements: out_ptr0[xindex] receives
# in_ptr0[131712 + (xindex % 131712) + 263424 * (xindex // 131712)]. With the
# shapes used by the benchmark harness in the embedded source - input
# (128, 336, 28, 28), output (128, 168, 28, 28) - the constant 131712 offset
# (168*28*28) selects the second 131712 elements of each batch entry.
triton_poi_fused_split_with_sizes_90 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[33554432], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_split_with_sizes_90(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 16859136
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex % 131712
    x1 = (xindex // 131712)
    x2 = xindex
    tmp0 = tl.load(in_ptr0 + (131712 + x0 + (263424*x1)), None).to(tl.float32)
    tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp0, None)


def get_args():
    arg_0 = rand_strided((128, 336, 28, 28), (263424, 784, 28, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 168, 28, 28), (131712, 784, 28, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused_split_with_sizes_90.run(*args, 16859136, grid=grid(16859136), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused_split_with_sizes_90.benchmark_all_configs(*args, 16859136, grid=grid(16859136))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/mu/cmu55cboklppuar2bfoeffop2b46e55c3whe4q3ppvxcrz2secws.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_74
#
# Pointwise kernel over 4704 elements: loads from in_ptr0 (declared '*fp32')
# and stores to out_ptr0 (declared '*fp16'), masked with xindex < 4704. The
# benchmark harness in the embedded source builds both buffers with shape
# (28, 168, 1, 1).
triton_poi_fused__to_copy_91 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[8192], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_91(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 4704
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)


def get_args():
    arg_0 = rand_strided((28, 168, 1, 1), (168, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((28, 168, 1, 1), (168, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_91.run(*args, 4704, grid=grid(4704), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_91.benchmark_all_configs(*args, 4704, grid=grid(4704))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/p7/cp7uk2n6iplc4okgdyrcwjh6klja4hwa7hip53lwkxv2iafhns22.py
# Original ATen: aten.cat
# aten.cat => cat_8
#
# Pointwise scatter-copy over 2809856 elements: in_ptr0[xindex] is written to
# out_ptr0[(xindex % 21952) + 43904 * (xindex // 21952)]. With the shapes used
# by the benchmark harness in the embedded source, the input is a contiguous
# (128, 28, 28, 28) buffer (batch stride 21952) and the output has the same
# shape but batch stride 43904, i.e. the copy lands in a buffer with twice the
# per-batch extent (NOTE(review): presumably one half of the concatenated
# result - confirm against the caller).
triton_poi_fused_cat_92 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_cat_92(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 2809856
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x2 = xindex
    x0 = xindex % 21952
    x1 = (xindex // 21952)
    tmp0 = tl.load(in_ptr0 + (x2), None).to(tl.float32)
    tl.store(out_ptr0 + (x0 + (43904*x1) + tl.zeros([XBLOCK], tl.int32)), tmp0, None)


def get_args():
    arg_0 = rand_strided((128, 28, 28, 28), (21952, 784, 28, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 28, 28, 28), (43904, 784, 28, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused_cat_92.run(*args, 2809856, grid=grid(2809856), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused_cat_92.benchmark_all_configs(*args, 2809856, grid=grid(2809856))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/go/cgo47t53ri3q3acehvndov22nzujsisq6mopaip7tcxdzexe45pe.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.add
# aten._native_batch_norm_legit_functional => add_73, add_76, convert_element_type_76, convert_element_type_77, mul_106, mul_112, rsqrt_14, sub_14, var_mean_14
# aten.add => add_77
#
# Pointwise kernel over 5619712 elements. With c = (xindex // 784) % 56 it
# computes
#     (in_ptr0[x] - in_ptr1[c]) * rsqrt(in_ptr2[c] / 100352.0 + 1e-05)
#         * in_ptr3[c] + in_ptr4[c] + in_ptr5[x]
# and stores the result to out_ptr0[x]. in_ptr1..in_ptr4 are declared '*fp32',
# the remaining pointers '*fp16'.
# NOTE(review): in_ptr1 / in_ptr2 are presumably the per-channel mean and sum
# of squared deviations, in_ptr3 / in_ptr4 the affine weight and bias, and
# in_ptr5 a residual input; 100352.0 equals 128*28*28, presumably the
# per-channel element count - confirm against the caller.
triton_poi_fused__native_batch_norm_legit_functional_add_93 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[8388608], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp16', 6: '*fp16', 7: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7), equal_to_1=())]})
@triton.jit
def triton_poi_fused__native_batch_norm_legit_functional_add_93(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 5619712
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x3 = xindex
    x1 = (xindex // 784) % 56
    tmp0 = tl.load(in_ptr0 + (x3), None).to(tl.float32)
    tmp2 = tl.load(in_ptr1 + (x1), None)
    tmp4 = tl.load(in_ptr2 + (x1), None)
    tmp11 = tl.load(in_ptr3 + (x1), None)
    tmp13 = tl.load(in_ptr4 + (x1), None)
    tmp16 = tl.load(in_ptr5 + (x3), None).to(tl.float32)
    tmp1 = tmp0.to(tl.float32)
    tmp3 = tmp1 - tmp2
    tmp5 = 100352.0
    tmp6 = tmp4 / tmp5
    tmp7 = 1e-05
    tmp8 = tmp6 + tmp7
    tmp9 = tl.math.rsqrt(tmp8)
    tmp10 = tmp3 * tmp9
    tmp12 = tmp10 * tmp11
    tmp14 = tmp12 + tmp13
    tmp15 = tmp14.to(tl.float32)
    tmp17 = tmp15 + tmp16
    tl.store(out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp17, None)


def get_args():
    arg_0 = rand_strided((128, 56, 28, 28), (43904, 784, 28, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((1, 56, 1, 1), (56, 1, 56, 56), device='cuda:0', dtype=torch.float32)
    arg_2 = rand_strided((1, 56, 1, 1), (56, 1, 56, 56), device='cuda:0', dtype=torch.float32)
    arg_3 = rand_strided((56,), (1,), device='cuda:0', dtype=torch.float32)
    arg_4 = rand_strided((56,), (1,), device='cuda:0', dtype=torch.float32)
    arg_5 = rand_strided((128, 56, 28, 28), (43904, 784, 28, 1), device='cuda:0', dtype=torch.float16)
    arg_6 = rand_strided((128, 56, 28, 28), (43904, 784, 28, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__native_batch_norm_legit_functional_add_93.run(*args, 5619712, grid=grid(5619712), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__native_batch_norm_legit_functional_add_93.benchmark_all_configs(*args, 5619712, grid=grid(5619712))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/cm/ccmet4jdw3a22ehivtutado4ilowri7xe4wvz55tt3jrgkttrcxr.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_122
#
# Pointwise kernel over 18816 elements: loads from in_ptr0 (declared '*fp32')
# and stores to out_ptr0 (declared '*fp16'), masked with xindex < 18816. The
# benchmark harness in the embedded source builds both buffers with shape
# (336, 56, 1, 1).
triton_poi_fused__to_copy_94 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[32768], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_94(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 18816
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)


def get_args():
    arg_0 = rand_strided((336, 56, 1, 1), (56, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((336, 56, 1, 1), (56, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_94.run(*args, 18816, grid=grid(18816), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_94.benchmark_all_configs(*args, 18816, grid=grid(18816))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/2z/c2zspsupbpoyi6v25rww6qizjecactzymkuzhmgwbksaajrrvebj.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_127
#
# Pointwise kernel: copies 1008 elements from in_ptr0 to out_ptr0, one element
# per lane. The xnumel argument is immediately overwritten by the constant
# 1008, and both the load and the store are masked with `xindex < xnumel`.
# The meta signature types in_ptr0 as *fp32 and out_ptr0 as *fp16 while the
# body only calls .to(tl.float32); the narrowing to fp16 presumably happens in
# tl.store's cast to the pointer element type -- TODO confirm.
# The embedded module also carries a benchmark harness: get_args() builds
# random (112, 1, 3, 3) fp32/fp16 tensors, call() launches the kernel on
# device 0 using the stream returned by get_cuda_stream(0), and the __main__
# section prints the measured time and bandwidth.
# NOTE(review): documentation is kept outside the string so the compiled
# source only differs from the generated text by whitespace.
triton_poi_fused__to_copy_95 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor

from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[1024], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_95(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 1008
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)


def get_args():
    arg_0 = rand_strided((112, 1, 3, 3), (9, 9, 3, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((112, 1, 3, 3), (9, 9, 3, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_95.run(*args, 1008, grid=grid(1008), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_95.benchmark_all_configs(*args, 1008, grid=grid(1008))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/vt/cvthhjpoxq63zz4mggqx2vqnhtco6vgp67jmycwwem5t4nuguc3d.py
# Original ATen: aten.split_with_sizes
# aten.split_with_sizes => getitem_125
#
# Pointwise kernel over 11239424 output elements. For flat output index i it
# reads in_ptr0[(i % 87808) + 263424 * (i // 87808)], computes
# x * sigmoid(x) on the fp32-converted value, and writes the result to
# out_ptr0[i]. With the shapes used in get_args() this addresses the first
# 87808 elements (channels [0, 112)) of each 263424-element batch row of a
# (128, 336, 28, 28) tensor.
# xmask is computed but not used: the load and the store pass None as the
# mask -- presumably safe because 11239424 is a multiple of every XBLOCK the
# heuristic can pick; TODO confirm.
# Both pointers are *fp16 in the meta signature; the kernel works in
# tl.float32 and the store presumably narrows to the pointer type -- TODO confirm.
# The embedded module also contains the get_args()/call()/
# benchmark_all_configs() benchmark harness and a __main__ section that
# prints time and bandwidth.
# NOTE(review): documentation is kept outside the string so the compiled
# source only differs from the generated text by whitespace.
triton_poi_fused_split_with_sizes_96 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor

from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[16777216], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_split_with_sizes_96(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 11239424
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex % 87808
    x1 = (xindex // 87808)
    x2 = xindex
    tmp0 = tl.load(in_ptr0 + (x0 + (263424*x1)), None).to(tl.float32)
    tmp1 = tmp0.to(tl.float32)
    tmp2 = tl.sigmoid(tmp1)
    tmp3 = tmp1 * tmp2
    tmp4 = tmp3.to(tl.float32)
    tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp4, None)


def get_args():
    arg_0 = rand_strided((128, 336, 28, 28), (263424, 784, 28, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 112, 28, 28), (87808, 784, 28, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused_split_with_sizes_96.run(*args, 11239424, grid=grid(11239424), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused_split_with_sizes_96.benchmark_all_configs(*args, 11239424, grid=grid(11239424))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/67/c67f43otxwovlhoxcmgghbvuamebcise6w4hyt6kxev6rjigyz6t.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_128
#
# Pointwise kernel: copies 2800 elements from in_ptr0 to out_ptr0, one element
# per lane. The xnumel argument is immediately overwritten by the constant
# 2800, and both the load and the store are masked with `xindex < xnumel`.
# The meta signature types in_ptr0 as *fp32 and out_ptr0 as *fp16 while the
# body only calls .to(tl.float32); the narrowing to fp16 presumably happens in
# tl.store's cast to the pointer element type -- TODO confirm.
# The embedded module also carries a benchmark harness: get_args() builds
# random (112, 1, 5, 5) fp32/fp16 tensors, call() launches the kernel on
# device 0 using the stream returned by get_cuda_stream(0), and the __main__
# section prints the measured time and bandwidth.
# NOTE(review): documentation is kept outside the string so the compiled
# source only differs from the generated text by whitespace.
triton_poi_fused__to_copy_97 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor

from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[4096], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_97(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 2800
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)


def get_args():
    arg_0 = rand_strided((112, 1, 5, 5), (25, 25, 5, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((112, 1, 5, 5), (25, 25, 5, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_97.run(*args, 2800, grid=grid(2800), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_97.benchmark_all_configs(*args, 2800, grid=grid(2800))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/mp/cmpup3qxitd5cnywi6sc5ctsik3jls3ax4u6bwwg7l22muv3kqac.py
# Original ATen: aten.split_with_sizes
# aten.split_with_sizes => getitem_129
#
# Pointwise kernel over 11239424 output elements. For flat output index i it
# reads in_ptr0[87808 + (i % 87808) + 263424 * (i // 87808)], computes
# x * sigmoid(x) on the fp32-converted value, and writes the result to
# out_ptr0[i]. With the shapes used in get_args() the 87808-element offset
# addresses channels [112, 224) of each batch row of a (128, 336, 28, 28)
# tensor.
# xmask is computed but not used: the load and the store pass None as the
# mask -- presumably safe because 11239424 is a multiple of every XBLOCK the
# heuristic can pick; TODO confirm.
# Both pointers are *fp16 in the meta signature; the kernel works in
# tl.float32 and the store presumably narrows to the pointer type -- TODO confirm.
# The embedded module also contains the get_args()/call()/
# benchmark_all_configs() benchmark harness and a __main__ section that
# prints time and bandwidth.
# NOTE(review): documentation is kept outside the string so the compiled
# source only differs from the generated text by whitespace.
triton_poi_fused_split_with_sizes_98 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor

from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[16777216], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_split_with_sizes_98(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 11239424
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex % 87808
    x1 = (xindex // 87808)
    x2 = xindex
    tmp0 = tl.load(in_ptr0 + (87808 + x0 + (263424*x1)), None).to(tl.float32)
    tmp1 = tmp0.to(tl.float32)
    tmp2 = tl.sigmoid(tmp1)
    tmp3 = tmp1 * tmp2
    tmp4 = tmp3.to(tl.float32)
    tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp4, None)


def get_args():
    arg_0 = rand_strided((128, 336, 28, 28), (263424, 784, 28, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 112, 28, 28), (87808, 784, 28, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused_split_with_sizes_98.run(*args, 11239424, grid=grid(11239424), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused_split_with_sizes_98.benchmark_all_configs(*args, 11239424, grid=grid(11239424))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/hj/chjntoqpyuayr43gwoduaxa3dcaerfocp4evoj27g5z63hwjt7yd.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_129
#
# Pointwise kernel: copies 5488 elements from in_ptr0 to out_ptr0, one element
# per lane. The xnumel argument is immediately overwritten by the constant
# 5488, and both the load and the store are masked with `xindex < xnumel`.
# The meta signature types in_ptr0 as *fp32 and out_ptr0 as *fp16 while the
# body only calls .to(tl.float32); the narrowing to fp16 presumably happens in
# tl.store's cast to the pointer element type -- TODO confirm.
# The embedded module also carries a benchmark harness: get_args() builds
# random (112, 1, 7, 7) fp32/fp16 tensors, call() launches the kernel on
# device 0 using the stream returned by get_cuda_stream(0), and the __main__
# section prints the measured time and bandwidth.
# NOTE(review): documentation is kept outside the string so the compiled
# source only differs from the generated text by whitespace.
triton_poi_fused__to_copy_99 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor

from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[8192], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_99(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 5488
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)


def get_args():
    arg_0 = rand_strided((112, 1, 7, 7), (49, 49, 7, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((112, 1, 7, 7), (49, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_99.run(*args, 5488, grid=grid(5488), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_99.benchmark_all_configs(*args, 5488, grid=grid(5488))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/5y/c5yntu5zez2ni5e2qpl6ynioug5zthjmijm3yvywigtt4blzf6l6.py
# Original ATen: aten.split_with_sizes
# aten.split_with_sizes => getitem_133
#
# Pointwise kernel over 11239424 output elements. For flat output index i it
# reads in_ptr0[175616 + (i % 87808) + 263424 * (i // 87808)], computes
# x * sigmoid(x) on the fp32-converted value, and writes the result to
# out_ptr0[i]. With the shapes used in get_args() the 175616-element offset
# addresses channels [224, 336) of each batch row of a (128, 336, 28, 28)
# tensor.
# xmask is computed but not used: the load and the store pass None as the
# mask -- presumably safe because 11239424 is a multiple of every XBLOCK the
# heuristic can pick; TODO confirm.
# Both pointers are *fp16 in the meta signature; the kernel works in
# tl.float32 and the store presumably narrows to the pointer type -- TODO confirm.
# The embedded module also contains the get_args()/call()/
# benchmark_all_configs() benchmark harness and a __main__ section that
# prints time and bandwidth.
# NOTE(review): documentation is kept outside the string so the compiled
# source only differs from the generated text by whitespace.
triton_poi_fused_split_with_sizes_100 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor

from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[16777216], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_split_with_sizes_100(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 11239424
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex % 87808
    x1 = (xindex // 87808)
    x2 = xindex
    tmp0 = tl.load(in_ptr0 + (175616 + x0 + (263424*x1)), None).to(tl.float32)
    tmp1 = tmp0.to(tl.float32)
    tmp2 = tl.sigmoid(tmp1)
    tmp3 = tmp1 * tmp2
    tmp4 = tmp3.to(tl.float32)
    tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp4, None)


def get_args():
    arg_0 = rand_strided((128, 336, 28, 28), (263424, 784, 28, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 112, 28, 28), (87808, 784, 28, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused_split_with_sizes_100.run(*args, 11239424, grid=grid(11239424), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused_split_with_sizes_100.benchmark_all_configs(*args, 11239424, grid=grid(11239424))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/hi/chiefsb3dple2qr2zd5ph5efd2rhpwu722ib7qhnwdnwgxusl6o7.py
# Original ATen: aten.cat
# aten.cat => cat_15
#
# Pointwise kernel over 2809856 elements. For flat input index i it reads
# in_ptr0[i] and writes it to out_ptr0[(i % 21952) + 65856 * (i // 21952)],
# i.e. each contiguous 21952-element input row lands at the start of a
# 65856-element output row. In get_args() the output is modelled as a
# (128, 112, 14, 14) tensor with batch stride 65856.
# xmask is computed but not used: the load and the store pass None as the
# mask -- presumably safe because 2809856 is a multiple of every XBLOCK the
# heuristic can pick; TODO confirm.
# Both pointers are *fp16 in the meta signature; the loaded value is
# converted to tl.float32 and the store presumably narrows it back to the
# pointer type -- TODO confirm.
# The embedded module also contains the get_args()/call()/
# benchmark_all_configs() benchmark harness and a __main__ section that
# prints time and bandwidth.
# NOTE(review): documentation is kept outside the string so the compiled
# source only differs from the generated text by whitespace.
triton_poi_fused_cat_101 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor

from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_cat_101(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 2809856
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x2 = xindex
    x0 = xindex % 21952
    x1 = (xindex // 21952)
    tmp0 = tl.load(in_ptr0 + (x2), None).to(tl.float32)
    tl.store(out_ptr0 + (x0 + (65856*x1) + tl.zeros([XBLOCK], tl.int32)), tmp0, None)


def get_args():
    arg_0 = rand_strided((128, 112, 14, 14), (21952, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 112, 14, 14), (65856, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused_cat_101.run(*args, 2809856, grid=grid(2809856), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused_cat_101.benchmark_all_configs(*args, 2809856, grid=grid(2809856))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/hj/chj56tyq633bknygfo23skutbtotcilbxvxls3o3esvqkeosnqzu.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => add_116, add_117, add_118, convert_element_type_130, mul_172, mul_173, mul_174, mul_175, mul_176, rsqrt_22, squeeze_67, var_mean_22
#
# Looped reduction kernel: 336 independent rows (x), each reducing 25088
# elements (r) of in_ptr0 in RBLOCK-sized chunks. Element (x, r) is read from
# in_ptr0[(r % 196) + 196 * x + 65856 * (r // 196)].
# Pass 1 accumulates the row sum and derives:
#   in_out_ptr0[x] = sum / 25088                      (row mean)
#   out_ptr0[x]    = mean * 0.1 + in_ptr1[x] * 0.9
# Pass 2 re-reads the same elements and accumulates (value - mean)^2:
#   out_ptr1[x] = that sum (stored raw, not divided)
#   out_ptr2[x] = rsqrt(sum / 25088 + 1e-05)
#   out_ptr3[x] = (sum / 25088) * 1.0000398612827361 * 0.1 + in_ptr2[x] * 0.9
# NOTE(review): the 0.1 / 0.9 blend looks like a running-statistics momentum
# update and 1.0000398612827361 looks like 25088 / 25087 (unbiased-variance
# correction) -- confirm against the batch-norm definition.
# Masked-out lanes load 0 and are excluded from the accumulators via
# tl.where(rmask & xmask, ...).
# The embedded module also contains the get_args()/call()/
# benchmark_all_configs() benchmark harness; its bandwidth figure is computed
# with num_in_out_args=1, matching the single mutated argument in_out_ptr0.
# NOTE(review): documentation is kept outside the string so the compiled
# source only differs from the generated text by whitespace.
triton_red_fused__native_batch_norm_legit_functional_102 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor

from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@reduction(
    size_hints=[512, 32768],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*fp32', 1: '*fp16', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: 'i32', 9: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_102(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, out_ptr0, out_ptr1, out_ptr2, out_ptr3, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
    xnumel = 336
    rnumel = 25088
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rbase = tl.arange(0, RBLOCK)[None, :]
    x0 = xindex
    _tmp2 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
    for roffset in range(0, rnumel, RBLOCK):
        rindex = roffset + rbase
        rmask = rindex < rnumel
        r1 = rindex % 196
        r2 = (rindex // 196)
        tmp0 = tl.load(in_ptr0 + (r1 + (196*x0) + (65856*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp1 = tmp0.to(tl.float32)
        _tmp2 = tl.where(rmask & xmask, _tmp2 + tmp1, _tmp2)
    tmp2 = tl.sum(_tmp2, 1)[:, None]
    tmp7 = tl.load(in_ptr1 + (x0), xmask)
    tmp3 = 25088.0
    tmp4 = tmp2 / tmp3
    tmp5 = 0.1
    tmp6 = tmp4 * tmp5
    tmp8 = 0.9
    tmp9 = tmp7 * tmp8
    tmp10 = tmp6 + tmp9
    tl.store(in_out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp4, xmask)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp10, xmask)
    _tmp15 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
    for roffset in range(0, rnumel, RBLOCK):
        rindex = roffset + rbase
        rmask = rindex < rnumel
        r1 = rindex % 196
        r2 = (rindex // 196)
        tmp11 = tl.load(in_ptr0 + (r1 + (196*x0) + (65856*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp12 = tmp11.to(tl.float32)
        tmp13 = tmp12 - tmp4
        tmp14 = tmp13 * tmp13
        _tmp15 = tl.where(rmask & xmask, _tmp15 + tmp14, _tmp15)
    tmp15 = tl.sum(_tmp15, 1)[:, None]
    tl.store(out_ptr1 + x0, tmp15, xmask)
    tmp25 = tl.load(in_ptr2 + (x0), xmask)
    tmp16 = 25088.0
    tmp17 = tmp15 / tmp16
    tmp18 = 1e-05
    tmp19 = tmp17 + tmp18
    tmp20 = tl.math.rsqrt(tmp19)
    tmp21 = 1.0000398612827361
    tmp22 = tmp17 * tmp21
    tmp23 = 0.1
    tmp24 = tmp22 * tmp23
    tmp26 = 0.9
    tmp27 = tmp25 * tmp26
    tmp28 = tmp24 + tmp27
    tl.store(out_ptr2 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp20, xmask)
    tl.store(out_ptr3 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp28, xmask)


def get_args():
    arg_0 = rand_strided((1, 336, 1, 1), (336, 1, 336, 336), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((128, 336, 14, 14), (65856, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_2 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32)
    arg_3 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32)
    arg_4 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32)
    arg_5 = rand_strided((1, 336, 1, 1), (336, 1, 336, 336), device='cuda:0', dtype=torch.float32)
    arg_6 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32)
    arg_7 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_red_fused__native_batch_norm_legit_functional_102.run(*args, 336, 25088, grid=grid(336), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_red_fused__native_batch_norm_legit_functional_102.benchmark_all_configs(*args, 336, 25088, grid=grid(336))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/zj/czji7shyj4kk6bl2rcdf5euq7ku4vpm3paub7sdz5rtqhc3o2xxw.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.mean, aten.silu
# aten._native_batch_norm_legit_functional => add_116, add_119, convert_element_type_130, convert_element_type_131, mul_171, mul_177, rsqrt_22, sub_22, var_mean_22
# aten.mean => mean_4
# aten.silu => convert_element_type_132, convert_element_type_133, mul_178, sigmoid_17
#
# Persistent reduction kernel: 43008 rows (x), each covering 196 elements (r)
# in a single RBLOCK = 256 tile; lanes with r >= 196 are masked by rmask.
# Per element, with c = x % 336:
#   y = (in_ptr0[r + 196 * x] - in_ptr1[c])
#       * rsqrt(in_ptr2[c] / 25088 + 1e-05) * in_ptr3[c] + in_ptr4[c]
#   out_ptr0[r + 196 * x] = y             (the value before the sigmoid product)
# Per row:
#   out_ptr2[x] = sum over valid r of (y * sigmoid(y)) / 196
# NOTE(review): in_ptr2 is divided by 25088 before the rsqrt, so it presumably
# holds an un-normalised sum of squared deviations -- confirm against the
# producer of that buffer.
# xmask is computed but not used; the in_ptr0 load and out_ptr0 store are
# masked by rmask only and the per-row accesses pass None -- presumably safe
# because 43008 is a multiple of every XBLOCK the heuristic can pick; TODO
# confirm.
# The embedded module also contains the get_args()/call()/
# benchmark_all_configs() benchmark harness and a __main__ section that
# prints time and bandwidth.
# NOTE(review): documentation is kept outside the string so the compiled
# source only differs from the generated text by whitespace.
triton_per_fused__native_batch_norm_legit_functional_mean_silu_103 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import persistent_reduction
from torch._inductor.utils import instance_descriptor

from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@persistent_reduction(
    size_hints=[65536, 256],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*fp16', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp16', 6: '*fp16', 7: 'i32', 8: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7), equal_to_1=())]}
)
@triton.jit
def triton_per_fused__native_batch_norm_legit_functional_mean_silu_103(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, out_ptr0, out_ptr2, xnumel, rnumel, XBLOCK : tl.constexpr):
    xnumel = 43008
    rnumel = 196
    RBLOCK: tl.constexpr = 256
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rindex = tl.arange(0, RBLOCK)[None, :]
    rmask = rindex < rnumel
    r2 = rindex
    x3 = xindex
    x0 = xindex % 336
    tmp0 = tl.load(in_ptr0 + (r2 + (196*x3)), rmask, other=0).to(tl.float32)
    tmp2 = tl.load(in_ptr1 + (x0), None)
    tmp4 = tl.load(in_ptr2 + (x0), None)
    tmp11 = tl.load(in_ptr3 + (x0), None)
    tmp13 = tl.load(in_ptr4 + (x0), None)
    tmp1 = tmp0.to(tl.float32)
    tmp3 = tmp1 - tmp2
    tmp5 = 25088.0
    tmp6 = tmp4 / tmp5
    tmp7 = 1e-05
    tmp8 = tmp6 + tmp7
    tmp9 = tl.math.rsqrt(tmp8)
    tmp10 = tmp3 * tmp9
    tmp12 = tmp10 * tmp11
    tmp14 = tmp12 + tmp13
    tmp15 = tmp14.to(tl.float32)
    tmp16 = tmp15.to(tl.float32)
    tmp17 = tl.sigmoid(tmp16)
    tmp18 = tmp16 * tmp17
    tmp19 = tmp18.to(tl.float32)
    tmp20 = tmp19.to(tl.float32)
    tmp22 = tl.where(rmask, tmp20, 0)
    tmp23 = tl.sum(tmp22, 1)[:, None]
    tmp24 = 196.0
    tmp25 = tmp23 / tmp24
    tmp26 = tmp25.to(tl.float32)
    tl.store(out_ptr0 + (r2 + (196*x3) + tl.zeros([XBLOCK, RBLOCK], tl.int32)), tmp15, rmask)
    tl.store(out_ptr2 + (x3 + tl.zeros([XBLOCK, 1], tl.int32)), tmp26, None)


def get_args():
    arg_0 = rand_strided((128, 336, 14, 14), (65856, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((1, 336, 1, 1), (336, 1, 336, 336), device='cuda:0', dtype=torch.float32)
    arg_2 = rand_strided((1, 336, 1, 1), (336, 1, 336, 336), device='cuda:0', dtype=torch.float32)
    arg_3 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32)
    arg_4 = rand_strided((336,), (1,), device='cuda:0', dtype=torch.float32)
    arg_5 = rand_strided((128, 336, 14, 14), (65856, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_6 = rand_strided((128, 336, 1, 1), (336, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_per_fused__native_batch_norm_legit_functional_mean_silu_103.run(*args, 43008, 196, grid=grid(43008), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_per_fused__native_batch_norm_legit_functional_mean_silu_103.benchmark_all_configs(*args, 43008, 196, grid=grid(43008))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/vd/cvdkhtrpiolmsts2zro46mftpapkihzk36p6zhnrps3p4cq3krmr.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_135
#
# Pointwise kernel: copies 4704 elements from in_ptr0 to out_ptr0, one element
# per lane. The xnumel argument is immediately overwritten by the constant
# 4704, and both the load and the store are masked with `xindex < xnumel`.
# The meta signature types in_ptr0 as *fp32 and out_ptr0 as *fp16 while the
# body only calls .to(tl.float32); the narrowing to fp16 presumably happens in
# tl.store's cast to the pointer element type -- TODO confirm.
# The embedded module also carries a benchmark harness: get_args() builds
# random (14, 336, 1, 1) fp32/fp16 tensors, call() launches the kernel on
# device 0 using the stream returned by get_cuda_stream(0), and the __main__
# section prints the measured time and bandwidth.
# NOTE(review): documentation is kept outside the string so the compiled
# source only differs from the generated text by whitespace.
triton_poi_fused__to_copy_104 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor

from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[8192], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_104(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 4704
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)


def get_args():
    arg_0 = rand_strided((14, 336, 1, 1), (336, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((14, 336, 1, 1), (336, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_104.run(*args, 4704, grid=grid(4704), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_104.benchmark_all_configs(*args, 4704, grid=grid(4704))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/tr/ctr74i6h7fbsohivhhoxxpx2ftnqmpxexofhzvpjxs3qwsbp6lt2.py
# Original ATen: aten._to_copy, aten.convolution
# aten._to_copy => convert_element_type_134
# aten.convolution => convolution_51
#
# Pointwise kernel: copies 14 elements from in_ptr0 to out_ptr0, one element
# per lane. The xnumel argument is immediately overwritten by the constant 14,
# and both the load and the store are masked with `xindex < xnumel`.
# The meta signature types in_ptr0 as *fp32 and out_ptr0 as *fp16 while the
# body only calls .to(tl.float32); the narrowing to fp16 presumably happens in
# tl.store's cast to the pointer element type -- TODO confirm.
# Unlike the larger copy kernels in this file, divisible_by_16 lists only the
# two pointer arguments (0, 1), not the element count.
# The embedded module also carries a benchmark harness: get_args() builds
# random (14,) fp32/fp16 tensors, call() launches the kernel on device 0 using
# the stream returned by get_cuda_stream(0), and the __main__ section prints
# the measured time and bandwidth.
# NOTE(review): documentation is kept outside the string so the compiled
# source only differs from the generated text by whitespace.
triton_poi_fused__to_copy_convolution_105 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor

from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[16], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_convolution_105(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 14
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)


def get_args():
    arg_0 = rand_strided((14,), (1,), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((14,), (1,), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_convolution_105.run(*args, 14, grid=grid(14), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_convolution_105.benchmark_all_configs(*args, 14, grid=grid(14))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/ko/ckojvusnljgriux3ampytpmwhilvjdtesa7avk7kubgd6p562p34.py
# Original ATen: aten._to_copy, aten.convolution, aten.silu
# aten._to_copy => convert_element_type_134
# aten.convolution => convolution_51
# aten.silu => convert_element_type_136, convert_element_type_137, mul_179, sigmoid_18
#
# Pointwise kernel over 1792 elements (all three pointers are declared *fp16):
#   s              = in_out_ptr0[i] + in_ptr0[i % 14]   (both operands upcast to float32)
#   in_out_ptr0[i] = s                                  (written back in place)
#   out_ptr0[i]    = s * sigmoid(s)
# NOTE(review): in_ptr0 is presumably the convolution bias -- confirm against the caller.
# The get_args/call/benchmark_all_configs/__main__ code appended to the kernel source
# is a standalone benchmark harness that runs the kernel on random
# (128, 14, 1, 1) buffers and a (14,) vector.
triton_poi_fused__to_copy_convolution_silu_106 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[2048], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_convolution_silu_106(in_out_ptr0, in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 1792
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x2 = xindex
    x0 = xindex % 14
    tmp0 = tl.load(in_out_ptr0 + (x2), xmask).to(tl.float32)
    tmp1 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
    tmp2 = tmp0 + tmp1
    tmp3 = tmp2.to(tl.float32)
    tmp4 = tl.sigmoid(tmp3)
    tmp5 = tmp3 * tmp4
    tmp6 = tmp5.to(tl.float32)
    tl.store(in_out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp2, xmask)
    tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp6, xmask)


def get_args():
    arg_0 = rand_strided((128, 14, 1, 1), (14, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((14,), (1,), device='cuda:0', dtype=torch.float16)
    arg_2 = rand_strided((128, 14, 1, 1), (14, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1, arg_2,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_convolution_silu_106.run(*args, 1792, grid=grid(1792), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_convolution_silu_106.benchmark_all_configs(*args, 1792, grid=grid(1792))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/wd/cwdr2zuaja73on5ahrizfe4fw5rmvg66qvnjezuugxyt6mvgktuo.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_139
#
# Element-wise copy of 4704 values from in_ptr0 to out_ptr0.  The launch signature
# declares in_ptr0 as *fp32 and out_ptr0 as *fp16; the body contains no explicit
# fp16 cast (tmp0.to(tl.float32) targets float32), so the narrowing presumably
# happens inside tl.store -- confirm against the Triton docs.
# The get_args/call/benchmark_all_configs/__main__ code appended to the kernel source
# is a standalone benchmark harness that runs the kernel on random
# (336, 14, 1, 1) tensors.
triton_poi_fused__to_copy_107 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[8192], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_107(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 4704
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)


def get_args():
    arg_0 = rand_strided((336, 14, 1, 1), (14, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((336, 14, 1, 1), (14, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_107.run(*args, 4704, grid=grid(4704), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_107.benchmark_all_configs(*args, 4704, grid=grid(4704))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/i4/ci4cud6xszhpzrrtirjqfbkilqlzpxjq4f4rcbjmyxesmegdihcj.py
# Original ATen: aten.mul, aten.sigmoid, aten.silu
# aten.mul => mul_180
# aten.sigmoid => sigmoid_19
# aten.silu => convert_element_type_132, convert_element_type_133, mul_178, sigmoid_17
#
# Pointwise kernel over 8429568 elements:
#   a           = in_ptr0[i]            (upcast to float32)
#   g           = in_ptr1[i // 196]     (one value shared by each run of 196 elements)
#   out_ptr0[i] = (a * sigmoid(a)) * sigmoid(g)
# Both loads and the store pass mask=None, so xmask is computed but never used.
# The get_args/call/benchmark_all_configs/__main__ code appended to the kernel source
# is a standalone benchmark harness that runs the kernel on random
# (128, 336, 14, 14) tensors and a (128, 336, 1, 1) gate tensor.
triton_poi_fused_mul_sigmoid_silu_108 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[16777216], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3), equal_to_1=())]})
@triton.jit
def triton_poi_fused_mul_sigmoid_silu_108(in_ptr0, in_ptr1, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 8429568
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x2 = xindex
    x1 = (xindex // 196)
    tmp0 = tl.load(in_ptr0 + (x2), None).to(tl.float32)
    tmp5 = tl.load(in_ptr1 + (x1), None).to(tl.float32)
    tmp1 = tmp0.to(tl.float32)
    tmp2 = tl.sigmoid(tmp1)
    tmp3 = tmp1 * tmp2
    tmp4 = tmp3.to(tl.float32)
    tmp6 = tl.sigmoid(tmp5)
    tmp7 = tmp4 * tmp6
    tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp7, None)


def get_args():
    arg_0 = rand_strided((128, 336, 14, 14), (65856, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 336, 1, 1), (336, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    arg_2 = rand_strided((128, 336, 14, 14), (65856, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1, arg_2,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused_mul_sigmoid_silu_108.run(*args, 8429568, grid=grid(8429568), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused_mul_sigmoid_silu_108.benchmark_all_configs(*args, 8429568, grid=grid(8429568))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/am/camchutakzc7ufjmewydlcbxzwspii4jktvvytxzvdtkijmunoki.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_140
#
# Element-wise copy of 34944 values from in_ptr0 to out_ptr0.  The launch signature
# declares in_ptr0 as *fp32 and out_ptr0 as *fp16; the body contains no explicit
# fp16 cast (tmp0.to(tl.float32) targets float32), so the narrowing presumably
# happens inside tl.store -- confirm against the Triton docs.
# The get_args/call/benchmark_all_configs/__main__ code appended to the kernel source
# is a standalone benchmark harness that runs the kernel on random
# (104, 336, 1, 1) tensors.
triton_poi_fused__to_copy_109 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[65536], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_109(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 34944
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)


def get_args():
    arg_0 = rand_strided((104, 336, 1, 1), (336, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((104, 336, 1, 1), (336, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_109.run(*args, 34944, grid=grid(34944), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_109.benchmark_all_configs(*args, 34944, grid=grid(34944))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/yx/cyxdjlki3rk4s5ktbskifaxqivcqtfxhzou2fgeig544h2budbbn.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => convert_element_type_141, var_mean_23
#
# Looped reduction producing 416 partial sums, each over 6272 input elements.
# Output index x is split as x0 = x % 104 and x1 = x // 104; reduction index r
# addresses in_ptr0 at
#   196*x0 + 20384*(r // 196) + 652288*x1 + (r % 196)
# i.e. x1 selects a group of 32 consecutive outer-dimension entries
# (652288 == 32 * 20384), r // 196 the entry within that group and r % 196 the
# position inside the 196-element slice belonging to x0.
# Values are accumulated in float32 in _tmp2 across the loop and reduced once
# with tl.sum after the loop; the result is stored to out_ptr0[x].
# The get_args/call/benchmark_all_configs/__main__ code appended to the kernel source
# is a standalone benchmark harness using a random (128, 104, 14, 14) fp16 input
# and a (1, 104, 1, 1, 4) float32 output buffer.
triton_red_fused__native_batch_norm_legit_functional_110 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@reduction(
    size_hints=[512, 8192],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32', 3: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_110(in_ptr0, out_ptr0, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
    xnumel = 416
    rnumel = 6272
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rbase = tl.arange(0, RBLOCK)[None, :]
    x0 = xindex % 104
    x1 = (xindex // 104)
    _tmp2 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
    x3 = xindex
    for roffset in range(0, rnumel, RBLOCK):
        rindex = roffset + rbase
        rmask = rindex < rnumel
        r2 = rindex
        tmp0 = tl.load(in_ptr0 + ((196*x0) + (20384*(r2 // 196)) + (652288*x1) + (r2 % 196)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp1 = tmp0.to(tl.float32)
        _tmp2 = tl.where(rmask & xmask, _tmp2 + tmp1, _tmp2)
    tmp2 = tl.sum(_tmp2, 1)[:, None]
    tl.store(out_ptr0 + x3, tmp2, xmask)


def get_args():
    arg_0 = rand_strided((128, 104, 14, 14), (20384, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((1, 104, 1, 1, 4), (416, 1, 416, 416, 104), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_red_fused__native_batch_norm_legit_functional_110.run(*args, 416, 6272, grid=grid(416), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_red_fused__native_batch_norm_legit_functional_110.benchmark_all_configs(*args, 416, 6272, grid=grid(416))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/ns/cnsyz4u2t4xdpizytxz7kuybbnglpm544k5kmwi5e6mw37eoeeyp.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => add_122, convert_element_type_141, mul_182, mul_183, var_mean_23
#
# Persistent reduction over 104 rows of 4 values each (RBLOCK is fixed to 4):
#   total          = sum over r of in_ptr0[x + 104*r]
#   mean           = total / 25088.0
#   in_out_ptr0[x] = mean                          (only written here, never read)
#   out_ptr0[x]    = mean * 0.1 + in_ptr1[x] * 0.9
# NOTE(review): the 0.1 / 0.9 blend is presumably the running-mean update with
# momentum 0.1, and in_ptr0 presumably holds the partial sums written by the
# preceding looped reduction -- confirm against the caller.
# The get_args/call/benchmark_all_configs/__main__ code appended to the kernel source
# is a standalone benchmark harness that runs the kernel on random float32 buffers.
triton_per_fused__native_batch_norm_legit_functional_111 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import persistent_reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@persistent_reduction(
    size_hints=[128, 4],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*fp32', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: 'i32', 5: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3), equal_to_1=())]}
)
@triton.jit
def triton_per_fused__native_batch_norm_legit_functional_111(in_out_ptr0, in_ptr0, in_ptr1, out_ptr0, xnumel, rnumel, XBLOCK : tl.constexpr):
    xnumel = 104
    rnumel = 4
    RBLOCK: tl.constexpr = 4
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rindex = tl.arange(0, RBLOCK)[None, :]
    rmask = rindex < rnumel
    r1 = rindex
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0 + (104*r1)), rmask & xmask, other=0)
    tmp8 = tl.load(in_ptr1 + (x0), xmask)
    tmp2 = tl.where(rmask & xmask, tmp0, 0)
    tmp3 = tl.sum(tmp2, 1)[:, None]
    tmp4 = 25088.0
    tmp5 = tmp3 / tmp4
    tmp6 = 0.1
    tmp7 = tmp5 * tmp6
    tmp9 = 0.9
    tmp10 = tmp8 * tmp9
    tmp11 = tmp7 + tmp10
    tl.store(in_out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp5, xmask)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp11, xmask)


def get_args():
    arg_0 = rand_strided((1, 104, 1, 1), (104, 1, 104, 104), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((1, 104, 1, 1, 4), (416, 1, 416, 416, 104), device='cuda:0', dtype=torch.float32)
    arg_2 = rand_strided((104,), (1,), device='cuda:0', dtype=torch.float32)
    arg_3 = rand_strided((104,), (1,), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1, arg_2, arg_3,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_per_fused__native_batch_norm_legit_functional_111.run(*args, 104, 4, grid=grid(104), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_per_fused__native_batch_norm_legit_functional_111.benchmark_all_configs(*args, 104, 4, grid=grid(104))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/vx/cvx6izhz5yqxyjo6cfzkguw47brvifvvxn3jp6fmmqpamdr6rdo7.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => convert_element_type_141, var_mean_23
#
# Looped reduction producing 416 partial sums of squared deviations, each over
# 6272 input elements.  Output index x is split as x0 = x % 104 and x1 = x // 104;
# in_ptr1[x0] is loaded once per row and subtracted from every input value before
# squaring.  Reduction index r addresses in_ptr0 at
#   196*x0 + 20384*(r // 196) + 652288*x1 + (r % 196)
# (652288 == 32 * 20384, so x1 selects a group of 32 outer-dimension entries).
# Squares are accumulated in float32 in _tmp5 across the loop and reduced once
# with tl.sum after the loop; the result is stored to out_ptr0[x].
# NOTE(review): in_ptr1 is presumably the per-channel mean -- confirm against the caller.
# The get_args/call/benchmark_all_configs/__main__ code appended to the kernel source
# is a standalone benchmark harness that runs the kernel on random buffers.
triton_red_fused__native_batch_norm_legit_functional_112 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@reduction(
    size_hints=[512, 8192],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*fp16', 1: '*fp32', 2: '*fp32', 3: 'i32', 4: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_112(in_ptr0, in_ptr1, out_ptr0, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
    xnumel = 416
    rnumel = 6272
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rbase = tl.arange(0, RBLOCK)[None, :]
    x0 = xindex % 104
    x1 = (xindex // 104)
    tmp2 = tl.load(in_ptr1 + (x0), xmask)
    _tmp5 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
    x3 = xindex
    for roffset in range(0, rnumel, RBLOCK):
        rindex = roffset + rbase
        rmask = rindex < rnumel
        r2 = rindex
        tmp0 = tl.load(in_ptr0 + ((196*x0) + (20384*(r2 // 196)) + (652288*x1) + (r2 % 196)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp1 = tmp0.to(tl.float32)
        tmp3 = tmp1 - tmp2
        tmp4 = tmp3 * tmp3
        _tmp5 = tl.where(rmask & xmask, _tmp5 + tmp4, _tmp5)
    tmp5 = tl.sum(_tmp5, 1)[:, None]
    tl.store(out_ptr0 + x3, tmp5, xmask)


def get_args():
    arg_0 = rand_strided((128, 104, 14, 14), (20384, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((1, 104, 1, 1), (104, 1, 104, 104), device='cuda:0', dtype=torch.float32)
    arg_2 = rand_strided((1, 104, 1, 1, 4), (416, 1, 416, 416, 104), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1, arg_2,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_red_fused__native_batch_norm_legit_functional_112.run(*args, 416, 6272, grid=grid(416), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_red_fused__native_batch_norm_legit_functional_112.benchmark_all_configs(*args, 416, 6272, grid=grid(416))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/cy/ccy7dhumtl6nx7vjrgetby6qbjqdcsh4fen2e264olgvvilit6pl.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => add_121, add_123, convert_element_type_141, mul_184, mul_185, mul_186, rsqrt_23, squeeze_70, var_mean_23
#
# Persistent reduction over 104 rows of 4 values each (RBLOCK is fixed to 4):
#   total       = sum over r of in_ptr0[x + 104*r]
#   var         = total / 25088.0
#   out_ptr0[x] = total                                  (raw sum, not divided)
#   out_ptr1[x] = rsqrt(var + 1e-05)
#   out_ptr2[x] = var * 1.0000398612827361 * 0.1 + in_ptr1[x] * 0.9
# NOTE(review): 1.0000398612827361 is approximately 25088 / 25087, presumably the
# unbiased-variance correction, and the 0.1 / 0.9 blend presumably the running-var
# update with momentum 0.1 -- confirm against the caller.
# The get_args/call/benchmark_all_configs/__main__ code appended to the kernel source
# is a standalone benchmark harness that runs the kernel on random float32 buffers.
triton_per_fused__native_batch_norm_legit_functional_113 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import persistent_reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@persistent_reduction(
    size_hints=[128, 4],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*fp32', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: 'i32', 6: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4), equal_to_1=())]}
)
@triton.jit
def triton_per_fused__native_batch_norm_legit_functional_113(in_ptr0, in_ptr1, out_ptr0, out_ptr1, out_ptr2, xnumel, rnumel, XBLOCK : tl.constexpr):
    xnumel = 104
    rnumel = 4
    RBLOCK: tl.constexpr = 4
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rindex = tl.arange(0, RBLOCK)[None, :]
    rmask = rindex < rnumel
    r1 = rindex
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0 + (104*r1)), rmask & xmask, other=0)
    tmp13 = tl.load(in_ptr1 + (x0), xmask)
    tmp2 = tl.where(rmask & xmask, tmp0, 0)
    tmp3 = tl.sum(tmp2, 1)[:, None]
    tmp4 = 25088.0
    tmp5 = tmp3 / tmp4
    tmp6 = 1e-05
    tmp7 = tmp5 + tmp6
    tmp8 = tl.math.rsqrt(tmp7)
    tmp9 = 1.0000398612827361
    tmp10 = tmp5 * tmp9
    tmp11 = 0.1
    tmp12 = tmp10 * tmp11
    tmp14 = 0.9
    tmp15 = tmp13 * tmp14
    tmp16 = tmp12 + tmp15
    tl.store(out_ptr1 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp8, xmask)
    tl.store(out_ptr2 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp16, xmask)
    tl.store(out_ptr0 + x0, tmp3, xmask)


def get_args():
    arg_0 = rand_strided((1, 104, 1, 1, 4), (416, 1, 416, 416, 104), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((104,), (1,), device='cuda:0', dtype=torch.float32)
    arg_2 = rand_strided((1, 104, 1, 1), (104, 1, 104, 104), device='cuda:0', dtype=torch.float32)
    arg_3 = rand_strided((104,), (1,), device='cuda:0', dtype=torch.float32)
    arg_4 = rand_strided((104,), (1,), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1, arg_2, arg_3, arg_4,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_per_fused__native_batch_norm_legit_functional_113.run(*args, 104, 4, grid=grid(104), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_per_fused__native_batch_norm_legit_functional_113.benchmark_all_configs(*args, 104, 4, grid=grid(104))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/xg/cxgqusw3xzpsm5ih6qyxh7qdzmvcnjj2evihvlzncrorcyi7altv.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => add_121, add_124, convert_element_type_141, convert_element_type_142, mul_181, mul_187, rsqrt_23, sub_23, var_mean_23
#
# Pointwise kernel over 2609152 elements.  With c = (i // 196) % 104:
#   out_ptr0[i] = (in_ptr0[i] - in_ptr1[c]) * rsqrt(in_ptr2[c] / 25088.0 + 1e-05)
#                 * in_ptr3[c] + in_ptr4[c]
# in_ptr0 is upcast to float32 before the arithmetic; the launch signature
# declares in_ptr0 and out_ptr0 as *fp16 and the four per-c inputs as *fp32.
# All loads and the store pass mask=None, so xmask is computed but never used.
# NOTE(review): in_ptr1..in_ptr4 are presumably mean, summed squared deviation,
# weight and bias of the batch norm -- confirm against the caller.
# The get_args/call/benchmark_all_configs/__main__ code appended to the kernel source
# is a standalone benchmark harness that runs the kernel on random
# (128, 104, 14, 14) tensors.
triton_poi_fused__native_batch_norm_legit_functional_114 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp16', 6: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6), equal_to_1=())]})
@triton.jit
def triton_poi_fused__native_batch_norm_legit_functional_114(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 2609152
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x3 = xindex
    x1 = (xindex // 196) % 104
    tmp0 = tl.load(in_ptr0 + (x3), None).to(tl.float32)
    tmp2 = tl.load(in_ptr1 + (x1), None)
    tmp4 = tl.load(in_ptr2 + (x1), None)
    tmp11 = tl.load(in_ptr3 + (x1), None)
    tmp13 = tl.load(in_ptr4 + (x1), None)
    tmp1 = tmp0.to(tl.float32)
    tmp3 = tmp1 - tmp2
    tmp5 = 25088.0
    tmp6 = tmp4 / tmp5
    tmp7 = 1e-05
    tmp8 = tmp6 + tmp7
    tmp9 = tl.math.rsqrt(tmp8)
    tmp10 = tmp3 * tmp9
    tmp12 = tmp10 * tmp11
    tmp14 = tmp12 + tmp13
    tmp15 = tmp14.to(tl.float32)
    tl.store(out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp15, None)


def get_args():
    arg_0 = rand_strided((128, 104, 14, 14), (20384, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((1, 104, 1, 1), (104, 1, 104, 104), device='cuda:0', dtype=torch.float32)
    arg_2 = rand_strided((1, 104, 1, 1), (104, 1, 104, 104), device='cuda:0', dtype=torch.float32)
    arg_3 = rand_strided((104,), (1,), device='cuda:0', dtype=torch.float32)
    arg_4 = rand_strided((104,), (1,), device='cuda:0', dtype=torch.float32)
    arg_5 = rand_strided((128, 104, 14, 14), (20384, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__native_batch_norm_legit_functional_114.run(*args, 2609152, grid=grid(2609152), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__native_batch_norm_legit_functional_114.benchmark_all_configs(*args, 2609152, grid=grid(2609152))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/hr/chrygikuwep25dgozsipotfnih6lajyye2txuo4r7put7ywkdk3t.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_143
#
# Element-wise copy of 16224 values from in_ptr0 to out_ptr0.  The launch signature
# declares in_ptr0 as *fp32 and out_ptr0 as *fp16; the body contains no explicit
# fp16 cast (tmp0.to(tl.float32) targets float32), so the narrowing presumably
# happens inside tl.store -- confirm against the Triton docs.
# The get_args/call/benchmark_all_configs/__main__ code appended to the kernel source
# is a standalone benchmark harness that runs the kernel on random
# (312, 52, 1, 1) tensors.
triton_poi_fused__to_copy_115 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[16384], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_115(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 16224
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)


def get_args():
    arg_0 = rand_strided((312, 52, 1, 1), (52, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((312, 52, 1, 1), (52, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_115.run(*args, 16224, grid=grid(16224), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_115.benchmark_all_configs(*args, 16224, grid=grid(16224))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/ca/ccaahgmwlqiraz4ivxpktecmbqhvntosz4cdfe5ghfjm4pe6pws6.py
# Original ATen: aten.cat
# aten.cat => cat_16
#
# Strided copy of 7827456 fp16 values: element i of in_ptr0 is written to
#   out_ptr0[(i % 61152) + 122304 * (i // 61152)]
# i.e. every contiguous run of 61152 input elements lands at the start of an
# output row whose stride is 122304 (twice the run length).
# The load and the store pass mask=None, so xmask is computed but never used.
# NOTE(review): the other half of each output row is presumably filled by a
# different kernel of the same aten.cat -- confirm against the caller.
# The get_args/call/benchmark_all_configs/__main__ code appended to the kernel source
# is a standalone benchmark harness that runs the kernel on random
# (128, 312, 14, 14) tensors.
triton_poi_fused_cat_116 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[8388608], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_cat_116(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 7827456
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x2 = xindex
    x0 = xindex % 61152
    x1 = (xindex // 61152)
    tmp0 = tl.load(in_ptr0 + (x2), None).to(tl.float32)
    tl.store(out_ptr0 + (x0 + (122304*x1) + tl.zeros([XBLOCK], tl.int32)), tmp0, None)


def get_args():
    arg_0 = rand_strided((128, 312, 14, 14), (61152, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 312, 14, 14), (122304, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused_cat_116.run(*args, 7827456, grid=grid(7827456), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused_cat_116.benchmark_all_configs(*args, 7827456, grid=grid(7827456))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/j3/cj3tf4apq2n25yzonidfbwyd3cn3jiyqodrw5hjo73xovhn5czvu.py | |
# Original ATen: aten._native_batch_norm_legit_functional | |
# aten._native_batch_norm_legit_functional => add_126, add_127, add_128, convert_element_type_145, mul_189, mul_190, mul_191, mul_192, mul_193, rsqrt_24, squeeze_73, var_mean_24 | |
# Hands the Triton kernel file below (one string literal) to async_compile.triton()
# and binds the result to this name. Summary of the embedded source:
#   * Looped reduction: xnumel = 624 rows, rnumel = 25088 reduced elements per row,
#     read from in_ptr0 at r1 + 196*x0 + 122304*r2 (r1 = rindex % 196, r2 = rindex // 196).
#   * Pass 1: sum / 25088.0 is stored to in_out_ptr0 (listed in mutated_arg_names),
#     and 0.1*that + 0.9*in_ptr1[x0] is stored to out_ptr0.
#   * Pass 2: re-reads in_ptr0 and accumulates squared deviations from the pass-1
#     mean; the raw sum goes to out_ptr1, rsqrt(sum/25088.0 + 1e-05) to out_ptr2,
#     and 0.1*(sum/25088.0)*1.0000398612827361 + 0.9*in_ptr2[x0] to out_ptr3.
#   * NOTE(review): 1.0000398612827361 looks like 25088/25087 (unbiased-variance
#     correction for the running statistic) - confirm.
#   * get_args/call/benchmark_all_configs/__main__ inside the string form a
#     benchmark harness: random strided CUDA tensors, launch on device 0, then
#     print ms / GB / GB/s from do_bench.
# NOTE(review): the trailing " | |" on every line looks like page-scrape residue
# rather than generated source - confirm before executing this file.
triton_red_fused__native_batch_norm_legit_functional_117 = async_compile.triton(''' | |
import triton | |
import triton.language as tl | |
from torch._inductor.ir import ReductionHint | |
from torch._inductor.ir import TileHint | |
from torch._inductor.triton_heuristics import reduction | |
from torch._inductor.utils import instance_descriptor | |
from torch._dynamo.testing import rand_strided | |
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
import torch | |
from torch._inductor.triton_heuristics import grid | |
@reduction( | |
size_hints=[1024, 32768], | |
reduction_hint=ReductionHint.INNER, | |
filename=__file__, | |
meta={'signature': {0: '*fp32', 1: '*fp16', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: 'i32', 9: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9), equal_to_1=())]} | |
) | |
@triton.jit | |
def triton_red_fused__native_batch_norm_legit_functional_117(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, out_ptr0, out_ptr1, out_ptr2, out_ptr3, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr): | |
xnumel = 624 | |
rnumel = 25088 | |
xoffset = tl.program_id(0) * XBLOCK | |
xindex = xoffset + tl.arange(0, XBLOCK)[:, None] | |
xmask = xindex < xnumel | |
rbase = tl.arange(0, RBLOCK)[None, :] | |
x0 = xindex | |
_tmp2 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0 | |
for roffset in range(0, rnumel, RBLOCK): | |
rindex = roffset + rbase | |
rmask = rindex < rnumel | |
r1 = rindex % 196 | |
r2 = (rindex // 196) | |
tmp0 = tl.load(in_ptr0 + (r1 + (196*x0) + (122304*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32) | |
tmp1 = tmp0.to(tl.float32) | |
_tmp2 = tl.where(rmask & xmask, _tmp2 + tmp1, _tmp2) | |
tmp2 = tl.sum(_tmp2, 1)[:, None] | |
tmp7 = tl.load(in_ptr1 + (x0), xmask) | |
tmp3 = 25088.0 | |
tmp4 = tmp2 / tmp3 | |
tmp5 = 0.1 | |
tmp6 = tmp4 * tmp5 | |
tmp8 = 0.9 | |
tmp9 = tmp7 * tmp8 | |
tmp10 = tmp6 + tmp9 | |
tl.store(in_out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp4, xmask) | |
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp10, xmask) | |
_tmp15 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0 | |
for roffset in range(0, rnumel, RBLOCK): | |
rindex = roffset + rbase | |
rmask = rindex < rnumel | |
r1 = rindex % 196 | |
r2 = (rindex // 196) | |
tmp11 = tl.load(in_ptr0 + (r1 + (196*x0) + (122304*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32) | |
tmp12 = tmp11.to(tl.float32) | |
tmp13 = tmp12 - tmp4 | |
tmp14 = tmp13 * tmp13 | |
_tmp15 = tl.where(rmask & xmask, _tmp15 + tmp14, _tmp15) | |
tmp15 = tl.sum(_tmp15, 1)[:, None] | |
tl.store(out_ptr1 + x0, tmp15, xmask) | |
tmp25 = tl.load(in_ptr2 + (x0), xmask) | |
tmp16 = 25088.0 | |
tmp17 = tmp15 / tmp16 | |
tmp18 = 1e-05 | |
tmp19 = tmp17 + tmp18 | |
tmp20 = tl.math.rsqrt(tmp19) | |
tmp21 = 1.0000398612827361 | |
tmp22 = tmp17 * tmp21 | |
tmp23 = 0.1 | |
tmp24 = tmp22 * tmp23 | |
tmp26 = 0.9 | |
tmp27 = tmp25 * tmp26 | |
tmp28 = tmp24 + tmp27 | |
tl.store(out_ptr2 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp20, xmask) | |
tl.store(out_ptr3 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp28, xmask) | |
def get_args(): | |
arg_0 = rand_strided((1, 624, 1, 1), (624, 1, 624, 624), device='cuda:0', dtype=torch.float32) | |
arg_1 = rand_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda:0', dtype=torch.float16) | |
arg_2 = rand_strided((624,), (1,), device='cuda:0', dtype=torch.float32) | |
arg_3 = rand_strided((624,), (1,), device='cuda:0', dtype=torch.float32) | |
arg_4 = rand_strided((624,), (1,), device='cuda:0', dtype=torch.float32) | |
arg_5 = rand_strided((1, 624, 1, 1), (624, 1, 624, 624), device='cuda:0', dtype=torch.float32) | |
arg_6 = rand_strided((624,), (1,), device='cuda:0', dtype=torch.float32) | |
arg_7 = rand_strided((624,), (1,), device='cuda:0', dtype=torch.float32) | |
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7, | |
def call(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
stream0 = get_cuda_stream(0) | |
triton_red_fused__native_batch_norm_legit_functional_117.run(*args, 624, 25088, grid=grid(624), stream=stream0) | |
def benchmark_all_configs(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
return triton_red_fused__native_batch_norm_legit_functional_117.benchmark_all_configs(*args, 624, 25088, grid=grid(624)) | |
if __name__ == '__main__': | |
from torch._inductor.utils import get_num_bytes | |
from triton.testing import do_bench | |
args = get_args() | |
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9 | |
gb_per_s = num_gb / (ms / 1e3) | |
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
''') | |
# kernel path: /tmp/torchinductor_shunting/yk/cyk2njvgthyqkaax3hrogqqjnq7t4z4h2bdza6wc2nyqhlzwiqur.py | |
# Original ATen: aten._native_batch_norm_legit_functional, aten.add, aten.clone, aten.fill, aten.mul, aten.sigmoid, aten.sub | |
# aten._native_batch_norm_legit_functional => add_126, add_129, convert_element_type_145, convert_element_type_146, mul_188, mul_194, rsqrt_24, sub_24, var_mean_24 | |
# aten.add => add_355 | |
# aten.clone => clone_15 | |
# aten.fill => full_like_32 | |
# aten.mul => mul_907, mul_908 | |
# aten.sigmoid => sigmoid_96 | |
# aten.sub => sub_233 | |
# Hands the Triton kernel file below (one string literal) to async_compile.triton()
# and binds the result to this name. Summary of the embedded source:
#   * Pointwise kernel over xnumel = 15654912 elements; x1 = (xindex // 196) % 624
#     selects the per-x1 entries of in_ptr1..in_ptr4.
#   * t = (in_ptr0[x] - in_ptr1[x1]) * rsqrt(in_ptr2[x1]/25088.0 + 1e-05)
#         * in_ptr3[x1] + in_ptr4[x1] is stored to out_ptr0.
#   * s = sigmoid(t); s * (t*(1 - s) + 1) is stored to out_ptr1.
#     NOTE(review): this has the form of the SiLU derivative - confirm intent.
#   * All loads/stores pass mask None; xmask is computed but not used.
#   * get_args/call/benchmark_all_configs/__main__ inside the string form a
#     benchmark harness that launches on device 0 and prints ms / GB / GB/s.
# NOTE(review): the trailing " | |" on every line looks like page-scrape residue
# rather than generated source - confirm before executing this file.
triton_poi_fused__native_batch_norm_legit_functional_add_clone_fill_mul_sigmoid_sub_118 = async_compile.triton(''' | |
import triton | |
import triton.language as tl | |
from torch._inductor.ir import ReductionHint | |
from torch._inductor.ir import TileHint | |
from torch._inductor.triton_heuristics import pointwise | |
from torch._inductor.utils import instance_descriptor | |
from torch._dynamo.testing import rand_strided | |
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
import torch | |
from torch._inductor.triton_heuristics import grid | |
@pointwise(size_hints=[16777216], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp16', 6: '*fp16', 7: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7), equal_to_1=())]}) | |
@triton.jit | |
def triton_poi_fused__native_batch_norm_legit_functional_add_clone_fill_mul_sigmoid_sub_118(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, out_ptr0, out_ptr1, xnumel, XBLOCK : tl.constexpr): | |
xnumel = 15654912 | |
xoffset = tl.program_id(0) * XBLOCK | |
xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
xmask = xindex < xnumel | |
x3 = xindex | |
x1 = (xindex // 196) % 624 | |
tmp0 = tl.load(in_ptr0 + (x3), None).to(tl.float32) | |
tmp2 = tl.load(in_ptr1 + (x1), None) | |
tmp4 = tl.load(in_ptr2 + (x1), None) | |
tmp11 = tl.load(in_ptr3 + (x1), None) | |
tmp13 = tl.load(in_ptr4 + (x1), None) | |
tmp1 = tmp0.to(tl.float32) | |
tmp3 = tmp1 - tmp2 | |
tmp5 = 25088.0 | |
tmp6 = tmp4 / tmp5 | |
tmp7 = 1e-05 | |
tmp8 = tmp6 + tmp7 | |
tmp9 = tl.math.rsqrt(tmp8) | |
tmp10 = tmp3 * tmp9 | |
tmp12 = tmp10 * tmp11 | |
tmp14 = tmp12 + tmp13 | |
tmp15 = tmp14.to(tl.float32) | |
tmp16 = tl.sigmoid(tmp15) | |
tmp17 = 1.0 | |
tmp18 = tmp17 - tmp16 | |
tmp19 = tmp15 * tmp18 | |
tmp20 = tmp19 + tmp17 | |
tmp21 = tmp16 * tmp20 | |
tl.store(out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp15, None) | |
tl.store(out_ptr1 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp21, None) | |
def get_args(): | |
arg_0 = rand_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda:0', dtype=torch.float16) | |
arg_1 = rand_strided((1, 624, 1, 1), (624, 1, 624, 624), device='cuda:0', dtype=torch.float32) | |
arg_2 = rand_strided((1, 624, 1, 1), (624, 1, 624, 624), device='cuda:0', dtype=torch.float32) | |
arg_3 = rand_strided((624,), (1,), device='cuda:0', dtype=torch.float32) | |
arg_4 = rand_strided((624,), (1,), device='cuda:0', dtype=torch.float32) | |
arg_5 = rand_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda:0', dtype=torch.float16) | |
arg_6 = rand_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda:0', dtype=torch.float16) | |
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, | |
def call(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
stream0 = get_cuda_stream(0) | |
triton_poi_fused__native_batch_norm_legit_functional_add_clone_fill_mul_sigmoid_sub_118.run(*args, 15654912, grid=grid(15654912), stream=stream0) | |
def benchmark_all_configs(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
return triton_poi_fused__native_batch_norm_legit_functional_add_clone_fill_mul_sigmoid_sub_118.benchmark_all_configs(*args, 15654912, grid=grid(15654912)) | |
if __name__ == '__main__': | |
from torch._inductor.utils import get_num_bytes | |
from triton.testing import do_bench | |
args = get_args() | |
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9 | |
gb_per_s = num_gb / (ms / 1e3) | |
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
''') | |
# kernel path: /tmp/torchinductor_shunting/h4/ch4fzflvelcyupsdemcjv4coogzqeqjolfo3rcegkby4xe2z7cgp.py | |
# Original ATen: aten._to_copy | |
# aten._to_copy => convert_element_type_149 | |
# Hands the Triton kernel file below (one string literal) to async_compile.triton()
# and binds the result to this name. Summary of the embedded source:
#   * Pointwise copy of xnumel = 1404 elements: masked load from in_ptr0,
#     .to(tl.float32), masked store to out_ptr0 at the same index.
#   * The signature declares in_ptr0 as *fp32 and out_ptr0 as *fp16.
#     NOTE(review): the narrowing to fp16 presumably happens in tl.store - confirm.
#   * get_args uses (156, 1, 3, 3) tensors; the rest of the string is a benchmark
#     harness that launches on device 0 and prints ms / GB / GB/s.
# NOTE(review): the trailing " | |" on every line looks like page-scrape residue
# rather than generated source - confirm before executing this file.
triton_poi_fused__to_copy_119 = async_compile.triton(''' | |
import triton | |
import triton.language as tl | |
from torch._inductor.ir import ReductionHint | |
from torch._inductor.ir import TileHint | |
from torch._inductor.triton_heuristics import pointwise | |
from torch._inductor.utils import instance_descriptor | |
from torch._dynamo.testing import rand_strided | |
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
import torch | |
from torch._inductor.triton_heuristics import grid | |
@pointwise(size_hints=[2048], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1), equal_to_1=())]}) | |
@triton.jit | |
def triton_poi_fused__to_copy_119(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr): | |
xnumel = 1404 | |
xoffset = tl.program_id(0) * XBLOCK | |
xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
xmask = xindex < xnumel | |
x0 = xindex | |
tmp0 = tl.load(in_ptr0 + (x0), xmask) | |
tmp1 = tmp0.to(tl.float32) | |
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask) | |
def get_args(): | |
arg_0 = rand_strided((156, 1, 3, 3), (9, 9, 3, 1), device='cuda:0', dtype=torch.float32) | |
arg_1 = rand_strided((156, 1, 3, 3), (9, 9, 3, 1), device='cuda:0', dtype=torch.float16) | |
return arg_0, arg_1, | |
def call(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
stream0 = get_cuda_stream(0) | |
triton_poi_fused__to_copy_119.run(*args, 1404, grid=grid(1404), stream=stream0) | |
def benchmark_all_configs(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
return triton_poi_fused__to_copy_119.benchmark_all_configs(*args, 1404, grid=grid(1404)) | |
if __name__ == '__main__': | |
from torch._inductor.utils import get_num_bytes | |
from triton.testing import do_bench | |
args = get_args() | |
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9 | |
gb_per_s = num_gb / (ms / 1e3) | |
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
''') | |
# kernel path: /tmp/torchinductor_shunting/ee/ceehri7sldlkmfyc53ikeryr64g77qkjawr2ddxdwntxstm4glbe.py | |
# Original ATen: aten.split_with_sizes | |
# aten.split_with_sizes => getitem_146 | |
# Hands the Triton kernel file below (one string literal) to async_compile.triton()
# and binds the result to this name. Summary of the embedded source:
#   * Pointwise kernel over xnumel = 3913728 elements with
#     x0 = xindex % 30576 and x1 = xindex // 30576.
#   * Reads in_ptr0 at x0 + 122304*x1 (no extra base offset), computes
#     t * sigmoid(t), and stores the result to out_ptr0 at xindex.
#   * All loads/stores pass mask None; xmask is computed but not used.
#   * get_args pairs a (128, 624, 14, 14) input with a (128, 156, 14, 14) output;
#     the rest of the string is a benchmark harness printing ms / GB / GB/s.
# NOTE(review): the trailing " | |" on every line looks like page-scrape residue
# rather than generated source - confirm before executing this file.
triton_poi_fused_split_with_sizes_120 = async_compile.triton(''' | |
import triton | |
import triton.language as tl | |
from torch._inductor.ir import ReductionHint | |
from torch._inductor.ir import TileHint | |
from torch._inductor.triton_heuristics import pointwise | |
from torch._inductor.utils import instance_descriptor | |
from torch._dynamo.testing import rand_strided | |
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
import torch | |
from torch._inductor.triton_heuristics import grid | |
@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]}) | |
@triton.jit | |
def triton_poi_fused_split_with_sizes_120(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr): | |
xnumel = 3913728 | |
xoffset = tl.program_id(0) * XBLOCK | |
xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
xmask = xindex < xnumel | |
x0 = xindex % 30576 | |
x1 = (xindex // 30576) | |
x2 = xindex | |
tmp0 = tl.load(in_ptr0 + (x0 + (122304*x1)), None).to(tl.float32) | |
tmp1 = tmp0.to(tl.float32) | |
tmp2 = tl.sigmoid(tmp1) | |
tmp3 = tmp1 * tmp2 | |
tmp4 = tmp3.to(tl.float32) | |
tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp4, None) | |
def get_args(): | |
arg_0 = rand_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda:0', dtype=torch.float16) | |
arg_1 = rand_strided((128, 156, 14, 14), (30576, 196, 14, 1), device='cuda:0', dtype=torch.float16) | |
return arg_0, arg_1, | |
def call(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
stream0 = get_cuda_stream(0) | |
triton_poi_fused_split_with_sizes_120.run(*args, 3913728, grid=grid(3913728), stream=stream0) | |
def benchmark_all_configs(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
return triton_poi_fused_split_with_sizes_120.benchmark_all_configs(*args, 3913728, grid=grid(3913728)) | |
if __name__ == '__main__': | |
from torch._inductor.utils import get_num_bytes | |
from triton.testing import do_bench | |
args = get_args() | |
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9 | |
gb_per_s = num_gb / (ms / 1e3) | |
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
''') | |
# kernel path: /tmp/torchinductor_shunting/3q/c3qy34jaqk4nnzsuou7s5dwcbuoi5kctwalhcspkfthqj2nvhjzq.py | |
# Original ATen: aten._to_copy | |
# aten._to_copy => convert_element_type_150 | |
# Hands the Triton kernel file below (one string literal) to async_compile.triton()
# and binds the result to this name. Summary of the embedded source:
#   * Pointwise copy of xnumel = 3900 elements: masked load from in_ptr0,
#     .to(tl.float32), masked store to out_ptr0 at the same index.
#   * The signature declares in_ptr0 as *fp32 and out_ptr0 as *fp16.
#     NOTE(review): the narrowing to fp16 presumably happens in tl.store - confirm.
#   * get_args uses (156, 1, 5, 5) tensors; the rest of the string is a benchmark
#     harness that launches on device 0 and prints ms / GB / GB/s.
# NOTE(review): the trailing " | |" on every line looks like page-scrape residue
# rather than generated source - confirm before executing this file.
triton_poi_fused__to_copy_121 = async_compile.triton(''' | |
import triton | |
import triton.language as tl | |
from torch._inductor.ir import ReductionHint | |
from torch._inductor.ir import TileHint | |
from torch._inductor.triton_heuristics import pointwise | |
from torch._inductor.utils import instance_descriptor | |
from torch._dynamo.testing import rand_strided | |
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
import torch | |
from torch._inductor.triton_heuristics import grid | |
@pointwise(size_hints=[4096], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1), equal_to_1=())]}) | |
@triton.jit | |
def triton_poi_fused__to_copy_121(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr): | |
xnumel = 3900 | |
xoffset = tl.program_id(0) * XBLOCK | |
xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
xmask = xindex < xnumel | |
x0 = xindex | |
tmp0 = tl.load(in_ptr0 + (x0), xmask) | |
tmp1 = tmp0.to(tl.float32) | |
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask) | |
def get_args(): | |
arg_0 = rand_strided((156, 1, 5, 5), (25, 25, 5, 1), device='cuda:0', dtype=torch.float32) | |
arg_1 = rand_strided((156, 1, 5, 5), (25, 25, 5, 1), device='cuda:0', dtype=torch.float16) | |
return arg_0, arg_1, | |
def call(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
stream0 = get_cuda_stream(0) | |
triton_poi_fused__to_copy_121.run(*args, 3900, grid=grid(3900), stream=stream0) | |
def benchmark_all_configs(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
return triton_poi_fused__to_copy_121.benchmark_all_configs(*args, 3900, grid=grid(3900)) | |
if __name__ == '__main__': | |
from torch._inductor.utils import get_num_bytes | |
from triton.testing import do_bench | |
args = get_args() | |
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9 | |
gb_per_s = num_gb / (ms / 1e3) | |
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
''') | |
# kernel path: /tmp/torchinductor_shunting/d2/cd2noiyayhlfce4vrflvnr5n745imwlcczcywnnle3qtso27avlv.py | |
# Original ATen: aten.split_with_sizes | |
# aten.split_with_sizes => getitem_151 | |
# Hands the Triton kernel file below (one string literal) to async_compile.triton()
# and binds the result to this name. Summary of the embedded source:
#   * Pointwise kernel over xnumel = 3913728 elements with
#     x0 = xindex % 30576 and x1 = xindex // 30576.
#   * Reads in_ptr0 at 30576 + x0 + 122304*x1, computes t * sigmoid(t), and
#     stores the result to out_ptr0 at xindex.
#   * All loads/stores pass mask None; xmask is computed but not used.
#   * get_args pairs a (128, 624, 14, 14) input with a (128, 156, 14, 14) output;
#     the rest of the string is a benchmark harness printing ms / GB / GB/s.
# NOTE(review): the trailing " | |" on every line looks like page-scrape residue
# rather than generated source - confirm before executing this file.
triton_poi_fused_split_with_sizes_122 = async_compile.triton(''' | |
import triton | |
import triton.language as tl | |
from torch._inductor.ir import ReductionHint | |
from torch._inductor.ir import TileHint | |
from torch._inductor.triton_heuristics import pointwise | |
from torch._inductor.utils import instance_descriptor | |
from torch._dynamo.testing import rand_strided | |
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
import torch | |
from torch._inductor.triton_heuristics import grid | |
@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]}) | |
@triton.jit | |
def triton_poi_fused_split_with_sizes_122(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr): | |
xnumel = 3913728 | |
xoffset = tl.program_id(0) * XBLOCK | |
xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
xmask = xindex < xnumel | |
x0 = xindex % 30576 | |
x1 = (xindex // 30576) | |
x2 = xindex | |
tmp0 = tl.load(in_ptr0 + (30576 + x0 + (122304*x1)), None).to(tl.float32) | |
tmp1 = tmp0.to(tl.float32) | |
tmp2 = tl.sigmoid(tmp1) | |
tmp3 = tmp1 * tmp2 | |
tmp4 = tmp3.to(tl.float32) | |
tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp4, None) | |
def get_args(): | |
arg_0 = rand_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda:0', dtype=torch.float16) | |
arg_1 = rand_strided((128, 156, 14, 14), (30576, 196, 14, 1), device='cuda:0', dtype=torch.float16) | |
return arg_0, arg_1, | |
def call(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
stream0 = get_cuda_stream(0) | |
triton_poi_fused_split_with_sizes_122.run(*args, 3913728, grid=grid(3913728), stream=stream0) | |
def benchmark_all_configs(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
return triton_poi_fused_split_with_sizes_122.benchmark_all_configs(*args, 3913728, grid=grid(3913728)) | |
if __name__ == '__main__': | |
from torch._inductor.utils import get_num_bytes | |
from triton.testing import do_bench | |
args = get_args() | |
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9 | |
gb_per_s = num_gb / (ms / 1e3) | |
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
''') | |
# kernel path: /tmp/torchinductor_shunting/on/conrh3mxhtsmg5nvtksxdme55y6bhfvwrrnpp6mhmwmulosn5uyv.py | |
# Original ATen: aten._to_copy | |
# aten._to_copy => convert_element_type_151 | |
# Hands the Triton kernel file below (one string literal) to async_compile.triton()
# and binds the result to this name. Summary of the embedded source:
#   * Pointwise copy of xnumel = 7644 elements: masked load from in_ptr0,
#     .to(tl.float32), masked store to out_ptr0 at the same index.
#   * The signature declares in_ptr0 as *fp32 and out_ptr0 as *fp16.
#     NOTE(review): the narrowing to fp16 presumably happens in tl.store - confirm.
#   * get_args uses (156, 1, 7, 7) tensors; the rest of the string is a benchmark
#     harness that launches on device 0 and prints ms / GB / GB/s.
# NOTE(review): the trailing " | |" on every line looks like page-scrape residue
# rather than generated source - confirm before executing this file.
triton_poi_fused__to_copy_123 = async_compile.triton(''' | |
import triton | |
import triton.language as tl | |
from torch._inductor.ir import ReductionHint | |
from torch._inductor.ir import TileHint | |
from torch._inductor.triton_heuristics import pointwise | |
from torch._inductor.utils import instance_descriptor | |
from torch._dynamo.testing import rand_strided | |
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
import torch | |
from torch._inductor.triton_heuristics import grid | |
@pointwise(size_hints=[8192], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1), equal_to_1=())]}) | |
@triton.jit | |
def triton_poi_fused__to_copy_123(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr): | |
xnumel = 7644 | |
xoffset = tl.program_id(0) * XBLOCK | |
xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
xmask = xindex < xnumel | |
x0 = xindex | |
tmp0 = tl.load(in_ptr0 + (x0), xmask) | |
tmp1 = tmp0.to(tl.float32) | |
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask) | |
def get_args(): | |
arg_0 = rand_strided((156, 1, 7, 7), (49, 49, 7, 1), device='cuda:0', dtype=torch.float32) | |
arg_1 = rand_strided((156, 1, 7, 7), (49, 49, 7, 1), device='cuda:0', dtype=torch.float16) | |
return arg_0, arg_1, | |
def call(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
stream0 = get_cuda_stream(0) | |
triton_poi_fused__to_copy_123.run(*args, 7644, grid=grid(7644), stream=stream0) | |
def benchmark_all_configs(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
return triton_poi_fused__to_copy_123.benchmark_all_configs(*args, 7644, grid=grid(7644)) | |
if __name__ == '__main__': | |
from torch._inductor.utils import get_num_bytes | |
from triton.testing import do_bench | |
args = get_args() | |
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9 | |
gb_per_s = num_gb / (ms / 1e3) | |
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
''') | |
# kernel path: /tmp/torchinductor_shunting/nc/cncwy3tqtyzgd3muyfflcfni7qls4ttqyb4sac6cmg6znll2bcqu.py | |
# Original ATen: aten.split_with_sizes | |
# aten.split_with_sizes => getitem_156 | |
# Hands the Triton kernel file below (one string literal) to async_compile.triton()
# and binds the result to this name. Summary of the embedded source:
#   * Pointwise kernel over xnumel = 3913728 elements with
#     x0 = xindex % 30576 and x1 = xindex // 30576.
#   * Reads in_ptr0 at 61152 + x0 + 122304*x1, computes t * sigmoid(t), and
#     stores the result to out_ptr0 at xindex.
#   * All loads/stores pass mask None; xmask is computed but not used.
#   * get_args pairs a (128, 624, 14, 14) input with a (128, 156, 14, 14) output;
#     the rest of the string is a benchmark harness printing ms / GB / GB/s.
# NOTE(review): the trailing " | |" on every line looks like page-scrape residue
# rather than generated source - confirm before executing this file.
triton_poi_fused_split_with_sizes_124 = async_compile.triton(''' | |
import triton | |
import triton.language as tl | |
from torch._inductor.ir import ReductionHint | |
from torch._inductor.ir import TileHint | |
from torch._inductor.triton_heuristics import pointwise | |
from torch._inductor.utils import instance_descriptor | |
from torch._dynamo.testing import rand_strided | |
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
import torch | |
from torch._inductor.triton_heuristics import grid | |
@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]}) | |
@triton.jit | |
def triton_poi_fused_split_with_sizes_124(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr): | |
xnumel = 3913728 | |
xoffset = tl.program_id(0) * XBLOCK | |
xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
xmask = xindex < xnumel | |
x0 = xindex % 30576 | |
x1 = (xindex // 30576) | |
x2 = xindex | |
tmp0 = tl.load(in_ptr0 + (61152 + x0 + (122304*x1)), None).to(tl.float32) | |
tmp1 = tmp0.to(tl.float32) | |
tmp2 = tl.sigmoid(tmp1) | |
tmp3 = tmp1 * tmp2 | |
tmp4 = tmp3.to(tl.float32) | |
tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp4, None) | |
def get_args(): | |
arg_0 = rand_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda:0', dtype=torch.float16) | |
arg_1 = rand_strided((128, 156, 14, 14), (30576, 196, 14, 1), device='cuda:0', dtype=torch.float16) | |
return arg_0, arg_1, | |
def call(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
stream0 = get_cuda_stream(0) | |
triton_poi_fused_split_with_sizes_124.run(*args, 3913728, grid=grid(3913728), stream=stream0) | |
def benchmark_all_configs(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
return triton_poi_fused_split_with_sizes_124.benchmark_all_configs(*args, 3913728, grid=grid(3913728)) | |
if __name__ == '__main__': | |
from torch._inductor.utils import get_num_bytes | |
from triton.testing import do_bench | |
args = get_args() | |
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9 | |
gb_per_s = num_gb / (ms / 1e3) | |
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
''') | |
# kernel path: /tmp/torchinductor_shunting/yy/cyyxm3mybqiv5z5gofz4uhjttc6in5hwbrcbzt4xlp4z5lqyiqvr.py | |
# Original ATen: aten._to_copy | |
# aten._to_copy => convert_element_type_152 | |
# Hands the Triton kernel file below (one string literal) to async_compile.triton()
# and binds the result to this name. Summary of the embedded source:
#   * Pointwise copy of xnumel = 12636 elements: masked load from in_ptr0,
#     .to(tl.float32), masked store to out_ptr0 at the same index.
#   * The signature declares in_ptr0 as *fp32 and out_ptr0 as *fp16.
#     NOTE(review): the narrowing to fp16 presumably happens in tl.store - confirm.
#   * get_args uses (156, 1, 9, 9) tensors; the rest of the string is a benchmark
#     harness that launches on device 0 and prints ms / GB / GB/s.
# NOTE(review): the trailing " | |" on every line looks like page-scrape residue
# rather than generated source - confirm before executing this file.
triton_poi_fused__to_copy_125 = async_compile.triton(''' | |
import triton | |
import triton.language as tl | |
from torch._inductor.ir import ReductionHint | |
from torch._inductor.ir import TileHint | |
from torch._inductor.triton_heuristics import pointwise | |
from torch._inductor.utils import instance_descriptor | |
from torch._dynamo.testing import rand_strided | |
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
import torch | |
from torch._inductor.triton_heuristics import grid | |
@pointwise(size_hints=[16384], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1), equal_to_1=())]}) | |
@triton.jit | |
def triton_poi_fused__to_copy_125(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr): | |
xnumel = 12636 | |
xoffset = tl.program_id(0) * XBLOCK | |
xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
xmask = xindex < xnumel | |
x0 = xindex | |
tmp0 = tl.load(in_ptr0 + (x0), xmask) | |
tmp1 = tmp0.to(tl.float32) | |
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask) | |
def get_args(): | |
arg_0 = rand_strided((156, 1, 9, 9), (81, 81, 9, 1), device='cuda:0', dtype=torch.float32) | |
arg_1 = rand_strided((156, 1, 9, 9), (81, 81, 9, 1), device='cuda:0', dtype=torch.float16) | |
return arg_0, arg_1, | |
def call(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
stream0 = get_cuda_stream(0) | |
triton_poi_fused__to_copy_125.run(*args, 12636, grid=grid(12636), stream=stream0) | |
def benchmark_all_configs(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
return triton_poi_fused__to_copy_125.benchmark_all_configs(*args, 12636, grid=grid(12636)) | |
if __name__ == '__main__': | |
from torch._inductor.utils import get_num_bytes | |
from triton.testing import do_bench | |
args = get_args() | |
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9 | |
gb_per_s = num_gb / (ms / 1e3) | |
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
''') | |
# kernel path: /tmp/torchinductor_shunting/ta/ctar2nhvp2hnx6l3xel6gxwicmmw5nlhbt33l63xfadahaihh7qo.py | |
# Original ATen: aten.split_with_sizes | |
# aten.split_with_sizes => getitem_161 | |
# Hands the Triton kernel file below (one string literal) to async_compile.triton()
# and binds the result to this name. Summary of the embedded source:
#   * Pointwise kernel over xnumel = 3913728 elements with
#     x0 = xindex % 30576 and x1 = xindex // 30576.
#   * Reads in_ptr0 at 91728 + x0 + 122304*x1, computes t * sigmoid(t), and
#     stores the result to out_ptr0 at xindex.
#   * All loads/stores pass mask None; xmask is computed but not used.
#   * get_args pairs a (128, 624, 14, 14) input with a (128, 156, 14, 14) output;
#     the rest of the string is a benchmark harness printing ms / GB / GB/s.
# NOTE(review): the trailing " | |" on every line looks like page-scrape residue
# rather than generated source - confirm before executing this file.
triton_poi_fused_split_with_sizes_126 = async_compile.triton(''' | |
import triton | |
import triton.language as tl | |
from torch._inductor.ir import ReductionHint | |
from torch._inductor.ir import TileHint | |
from torch._inductor.triton_heuristics import pointwise | |
from torch._inductor.utils import instance_descriptor | |
from torch._dynamo.testing import rand_strided | |
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
import torch | |
from torch._inductor.triton_heuristics import grid | |
@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]}) | |
@triton.jit | |
def triton_poi_fused_split_with_sizes_126(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr): | |
xnumel = 3913728 | |
xoffset = tl.program_id(0) * XBLOCK | |
xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
xmask = xindex < xnumel | |
x0 = xindex % 30576 | |
x1 = (xindex // 30576) | |
x2 = xindex | |
tmp0 = tl.load(in_ptr0 + (91728 + x0 + (122304*x1)), None).to(tl.float32) | |
tmp1 = tmp0.to(tl.float32) | |
tmp2 = tl.sigmoid(tmp1) | |
tmp3 = tmp1 * tmp2 | |
tmp4 = tmp3.to(tl.float32) | |
tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp4, None) | |
def get_args(): | |
arg_0 = rand_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda:0', dtype=torch.float16) | |
arg_1 = rand_strided((128, 156, 14, 14), (30576, 196, 14, 1), device='cuda:0', dtype=torch.float16) | |
return arg_0, arg_1, | |
def call(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
stream0 = get_cuda_stream(0) | |
triton_poi_fused_split_with_sizes_126.run(*args, 3913728, grid=grid(3913728), stream=stream0) | |
def benchmark_all_configs(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
return triton_poi_fused_split_with_sizes_126.benchmark_all_configs(*args, 3913728, grid=grid(3913728)) | |
if __name__ == '__main__': | |
from torch._inductor.utils import get_num_bytes | |
from triton.testing import do_bench | |
args = get_args() | |
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9 | |
gb_per_s = num_gb / (ms / 1e3) | |
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
''') | |
# kernel path: /tmp/torchinductor_shunting/aa/caaej5ipewhg7u3hng5mw3omnmn72k6eam7aoq4mbk5yqdeq7wwb.py | |
# Original ATen: aten.cat | |
# aten.cat => cat_17 | |
# Hands the Triton kernel file below (one string literal) to async_compile.triton()
# and binds the result to this name. Summary of the embedded source:
#   * Pointwise copy of xnumel = 3913728 elements: reads in_ptr0 at xindex and
#     writes out_ptr0 at x0 + 122304*x1, where x0 = xindex % 30576 and
#     x1 = xindex // 30576 (contiguous source, strided destination).
#   * All loads/stores pass mask None; xmask is computed but not used.
#   * get_args gives both tensors shape (128, 156, 14, 14); the output's leading
#     stride is 122304 versus 30576 for the input.
#   * The rest of the string is a benchmark harness that launches on device 0
#     and prints ms / GB / GB/s.
# NOTE(review): the trailing " | |" on every line looks like page-scrape residue
# rather than generated source - confirm before executing this file.
triton_poi_fused_cat_127 = async_compile.triton(''' | |
import triton | |
import triton.language as tl | |
from torch._inductor.ir import ReductionHint | |
from torch._inductor.ir import TileHint | |
from torch._inductor.triton_heuristics import pointwise | |
from torch._inductor.utils import instance_descriptor | |
from torch._dynamo.testing import rand_strided | |
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
import torch | |
from torch._inductor.triton_heuristics import grid | |
@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]}) | |
@triton.jit | |
def triton_poi_fused_cat_127(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr): | |
xnumel = 3913728 | |
xoffset = tl.program_id(0) * XBLOCK | |
xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
xmask = xindex < xnumel | |
x2 = xindex | |
x0 = xindex % 30576 | |
x1 = (xindex // 30576) | |
tmp0 = tl.load(in_ptr0 + (x2), None).to(tl.float32) | |
tl.store(out_ptr0 + (x0 + (122304*x1) + tl.zeros([XBLOCK], tl.int32)), tmp0, None) | |
def get_args(): | |
arg_0 = rand_strided((128, 156, 14, 14), (30576, 196, 14, 1), device='cuda:0', dtype=torch.float16) | |
arg_1 = rand_strided((128, 156, 14, 14), (122304, 196, 14, 1), device='cuda:0', dtype=torch.float16) | |
return arg_0, arg_1, | |
def call(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
stream0 = get_cuda_stream(0) | |
triton_poi_fused_cat_127.run(*args, 3913728, grid=grid(3913728), stream=stream0) | |
def benchmark_all_configs(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
return triton_poi_fused_cat_127.benchmark_all_configs(*args, 3913728, grid=grid(3913728)) | |
if __name__ == '__main__': | |
from torch._inductor.utils import get_num_bytes | |
from triton.testing import do_bench | |
args = get_args() | |
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9 | |
gb_per_s = num_gb / (ms / 1e3) | |
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
''') | |
# kernel path: /tmp/torchinductor_shunting/xm/cxmjpqggq2cokpfycochmecokolhxsamegk6vtmem3x47oz5ss3r.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.mean, aten.silu
# aten._native_batch_norm_legit_functional => add_131, add_134, convert_element_type_153, convert_element_type_154, mul_196, mul_202, rsqrt_25, sub_25, var_mean_25
# aten.mean => mean_5
# aten.silu => convert_element_type_155, convert_element_type_156, mul_203, sigmoid_21
#
# Persistent-reduction kernel over 79872 rows (x) of 196 elements (r).
# With c = x % 624, for every element it computes
#     y = (in_ptr0[r + 196*x] - in_ptr1[c]) * rsqrt(in_ptr2[c] / 25088 + 1e-05)
#         * in_ptr3[c] + in_ptr4[c]
# stores y to out_ptr0 at the same offset, and stores
#     sum_r(y * sigmoid(y)) / 196
# to out_ptr2[x].
# NOTE(review): in_ptr1 / in_ptr2 are presumably the per-channel mean and the
# sum of squared deviations from var_mean_25 -- confirm against the graph.
#
# The embedded module also carries a standalone benchmark harness:
# get_args() builds random inputs, call() launches the kernel on device 0,
# benchmark_all_configs() benchmarks every autotune config, and the __main__
# section prints the measured time and bandwidth.
triton_per_fused__native_batch_norm_legit_functional_mean_silu_128 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import persistent_reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid


@persistent_reduction(
    size_hints=[131072, 256],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*fp16', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp16', 6: '*fp16', 7: 'i32', 8: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7), equal_to_1=())]}
)
@triton.jit
def triton_per_fused__native_batch_norm_legit_functional_mean_silu_128(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, out_ptr0, out_ptr2, xnumel, rnumel, XBLOCK : tl.constexpr):
    # The runtime sizes are overwritten with the constants baked in at codegen time.
    xnumel = 79872
    rnumel = 196
    RBLOCK: tl.constexpr = 256
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    # xmask is computed but never used: every x-indexed access below passes None.
    xmask = xindex < xnumel
    rindex = tl.arange(0, RBLOCK)[None, :]
    # RBLOCK (256) exceeds rnumel (196), so the tail lanes are masked off.
    rmask = rindex < rnumel
    r2 = rindex
    x3 = xindex
    x0 = xindex % 624
    tmp0 = tl.load(in_ptr0 + (r2 + (196*x3)), rmask, other=0).to(tl.float32)
    tmp2 = tl.load(in_ptr1 + (x0), None)
    tmp4 = tl.load(in_ptr2 + (x0), None)
    tmp11 = tl.load(in_ptr3 + (x0), None)
    tmp13 = tl.load(in_ptr4 + (x0), None)
    # Normalisation: (x - in_ptr1) * rsqrt(in_ptr2 / 25088 + 1e-05) * in_ptr3 + in_ptr4
    tmp1 = tmp0.to(tl.float32)
    tmp3 = tmp1 - tmp2
    tmp5 = 25088.0
    tmp6 = tmp4 / tmp5
    tmp7 = 1e-05
    tmp8 = tmp6 + tmp7
    tmp9 = tl.math.rsqrt(tmp8)
    tmp10 = tmp3 * tmp9
    tmp12 = tmp10 * tmp11
    tmp14 = tmp12 + tmp13
    tmp15 = tmp14.to(tl.float32)
    # y * sigmoid(y), then the masked row sum divided by 196.
    tmp16 = tmp15.to(tl.float32)
    tmp17 = tl.sigmoid(tmp16)
    tmp18 = tmp16 * tmp17
    tmp19 = tmp18.to(tl.float32)
    tmp20 = tmp19.to(tl.float32)
    # Zero the masked lanes so they do not contribute to the sum.
    tmp22 = tl.where(rmask, tmp20, 0)
    tmp23 = tl.sum(tmp22, 1)[:, None]
    tmp24 = 196.0
    tmp25 = tmp23 / tmp24
    tmp26 = tmp25.to(tl.float32)
    tl.store(out_ptr0 + (r2 + (196*x3) + tl.zeros([XBLOCK, RBLOCK], tl.int32)), tmp15, rmask)
    tl.store(out_ptr2 + (x3 + tl.zeros([XBLOCK, 1], tl.int32)), tmp26, None)


def get_args():
    arg_0 = rand_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((1, 624, 1, 1), (624, 1, 624, 624), device='cuda:0', dtype=torch.float32)
    arg_2 = rand_strided((1, 624, 1, 1), (624, 1, 624, 624), device='cuda:0', dtype=torch.float32)
    arg_3 = rand_strided((624,), (1,), device='cuda:0', dtype=torch.float32)
    arg_4 = rand_strided((624,), (1,), device='cuda:0', dtype=torch.float32)
    arg_5 = rand_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_6 = rand_strided((128, 624, 1, 1), (624, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_per_fused__native_batch_norm_legit_functional_mean_silu_128.run(*args, 79872, 196, grid=grid(79872), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_per_fused__native_batch_norm_legit_functional_mean_silu_128.benchmark_all_configs(*args, 79872, 196, grid=grid(79872))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    # NOTE(review): [0] assumes do_bench returns a sequence -- confirm for the installed Triton.
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/5n/c5nuz3wivd2kux327s5ffdifgf3i65ukistqypmzli4ipcs5fhup.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_158
#
# Pointwise copy of 16224 elements: out_ptr0[i] = in_ptr0[i]. The signature
# declares in_ptr0 as fp32 and out_ptr0 as fp16.
# NOTE(review): the narrowing to fp16 presumably happens through tl.store's
# implicit cast to the pointer element type -- confirm.
#
# The embedded module also carries a standalone benchmark harness
# (get_args / call / benchmark_all_configs / __main__).
triton_poi_fused__to_copy_129 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid


@pointwise(size_hints=[16384], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_129(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    # The runtime size is overwritten with the constant baked in at codegen time.
    xnumel = 16224
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    # Guards the tail block against out-of-range lanes.
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)


def get_args():
    arg_0 = rand_strided((26, 624, 1, 1), (624, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((26, 624, 1, 1), (624, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_129.run(*args, 16224, grid=grid(16224), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_129.benchmark_all_configs(*args, 16224, grid=grid(16224))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    # NOTE(review): [0] assumes do_bench returns a sequence -- confirm for the installed Triton.
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/7u/c7uwxkw3ipfjfdehvzykp4imfsjz53vfoikfvt4ys7uwsknqbbb2.py
# Original ATen: aten._to_copy, aten.convolution
# aten._to_copy => convert_element_type_157
# aten.convolution => convolution_60
#
# Pointwise copy of 26 elements: out_ptr0[i] = in_ptr0[i]. The signature
# declares in_ptr0 as fp32 and out_ptr0 as fp16.
# NOTE(review): the narrowing to fp16 presumably happens through tl.store's
# implicit cast to the pointer element type -- confirm.
#
# The embedded module also carries a standalone benchmark harness
# (get_args / call / benchmark_all_configs / __main__).
triton_poi_fused__to_copy_convolution_130 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid


@pointwise(size_hints=[32], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_convolution_130(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    # The runtime size is overwritten with the constant baked in at codegen time.
    xnumel = 26
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    # Guards the tail block against out-of-range lanes.
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)


def get_args():
    arg_0 = rand_strided((26,), (1,), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((26,), (1,), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_convolution_130.run(*args, 26, grid=grid(26), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_convolution_130.benchmark_all_configs(*args, 26, grid=grid(26))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    # NOTE(review): [0] assumes do_bench returns a sequence -- confirm for the installed Triton.
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/qj/cqjs4oas7ss654pge5hzaweuxapq6ge5y5fvo2ygxf7r6qylld6m.py
# Original ATen: aten._to_copy, aten.convolution, aten.silu
# aten._to_copy => convert_element_type_157
# aten.convolution => convolution_60
# aten.silu => convert_element_type_159, convert_element_type_160, mul_204, sigmoid_22
#
# Pointwise kernel over 3328 elements. For each index i it computes
#     s = in_out_ptr0[i] + in_ptr0[i % 26]
# writes s back to in_out_ptr0[i] (in-place update) and writes
# s * sigmoid(s) to out_ptr0[i].
# NOTE(review): in_ptr0 is presumably the convolution bias -- confirm.
#
# The embedded module also carries a standalone benchmark harness
# (get_args / call / benchmark_all_configs / __main__).
triton_poi_fused__to_copy_convolution_silu_131 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid


@pointwise(size_hints=[4096], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_convolution_silu_131(in_out_ptr0, in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    # The runtime size is overwritten with the constant baked in at codegen time.
    xnumel = 3328
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    # Guards the tail block against out-of-range lanes.
    xmask = xindex < xnumel
    x2 = xindex
    # in_ptr0 has 26 entries and is reused every 26 elements.
    x0 = xindex % 26
    tmp0 = tl.load(in_out_ptr0 + (x2), xmask).to(tl.float32)
    tmp1 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
    tmp2 = tmp0 + tmp1
    tmp3 = tmp2.to(tl.float32)
    tmp4 = tl.sigmoid(tmp3)
    tmp5 = tmp3 * tmp4
    tmp6 = tmp5.to(tl.float32)
    # The sum is stored in place; the activated value goes to a separate output.
    tl.store(in_out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp2, xmask)
    tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp6, xmask)


def get_args():
    arg_0 = rand_strided((128, 26, 1, 1), (26, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((26,), (1,), device='cuda:0', dtype=torch.float16)
    arg_2 = rand_strided((128, 26, 1, 1), (26, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1, arg_2,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_convolution_silu_131.run(*args, 3328, grid=grid(3328), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_convolution_silu_131.benchmark_all_configs(*args, 3328, grid=grid(3328))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    # NOTE(review): [0] assumes do_bench returns a sequence -- confirm for the installed Triton.
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    # num_in_out_args=1 presumably accounts for the single in_out_ptr0 argument -- confirm.
    num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/af/cafnwdqexuqbr3u7uonjmdp2h6gms65ppr5ersr7levxxzbxsags.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_162
#
# Pointwise copy of 16224 elements: out_ptr0[i] = in_ptr0[i]. The signature
# declares in_ptr0 as fp32 and out_ptr0 as fp16.
# NOTE(review): the narrowing to fp16 presumably happens through tl.store's
# implicit cast to the pointer element type -- confirm.
#
# The embedded module also carries a standalone benchmark harness
# (get_args / call / benchmark_all_configs / __main__).
triton_poi_fused__to_copy_132 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid


@pointwise(size_hints=[16384], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_132(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    # The runtime size is overwritten with the constant baked in at codegen time.
    xnumel = 16224
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    # Guards the tail block against out-of-range lanes.
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)


def get_args():
    arg_0 = rand_strided((624, 26, 1, 1), (26, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((624, 26, 1, 1), (26, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_132.run(*args, 16224, grid=grid(16224), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_132.benchmark_all_configs(*args, 16224, grid=grid(16224))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    # NOTE(review): [0] assumes do_bench returns a sequence -- confirm for the installed Triton.
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/qt/cqtzzxfgsd6elacosxssusc2qroqxqbum3szv62gh4gr6s5vzwmp.py
# Original ATen: aten._to_copy, aten.convolution
# aten._to_copy => convert_element_type_161
# aten.convolution => convolution_61
#
# Pointwise copy of 624 elements: out_ptr0[i] = in_ptr0[i]. The signature
# declares in_ptr0 as fp32 and out_ptr0 as fp16.
# NOTE(review): the narrowing to fp16 presumably happens through tl.store's
# implicit cast to the pointer element type -- confirm.
#
# The embedded module also carries a standalone benchmark harness
# (get_args / call / benchmark_all_configs / __main__).
triton_poi_fused__to_copy_convolution_133 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid


@pointwise(size_hints=[1024], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_convolution_133(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    # The runtime size is overwritten with the constant baked in at codegen time.
    xnumel = 624
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    # Guards the tail block against out-of-range lanes.
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)


def get_args():
    arg_0 = rand_strided((624,), (1,), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((624,), (1,), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_convolution_133.run(*args, 624, grid=grid(624), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_convolution_133.benchmark_all_configs(*args, 624, grid=grid(624))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    # NOTE(review): [0] assumes do_bench returns a sequence -- confirm for the installed Triton.
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/hd/chd7xmae5qozjrly7porkx7hggrm6fh6klnsko3rzc52ohjqiyou.py
# Original ATen: aten._to_copy, aten.convolution
# aten._to_copy => convert_element_type_161
# aten.convolution => convolution_61
#
# In-place pointwise kernel over 79872 elements:
#     in_out_ptr0[i] = in_out_ptr0[i] + in_ptr0[i % 624]
# NOTE(review): in_ptr0 is presumably the convolution bias -- confirm.
#
# The embedded module also carries a standalone benchmark harness
# (get_args / call / benchmark_all_configs / __main__).
triton_poi_fused__to_copy_convolution_134 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid


@pointwise(size_hints=[131072], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_convolution_134(in_out_ptr0, in_ptr0, xnumel, XBLOCK : tl.constexpr):
    # The runtime size is overwritten with the constant baked in at codegen time.
    xnumel = 79872
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    # xmask is computed but never used: the loads and the store below pass None.
    xmask = xindex < xnumel
    x2 = xindex
    # in_ptr0 has 624 entries and is reused every 624 elements.
    x0 = xindex % 624
    tmp0 = tl.load(in_out_ptr0 + (x2), None).to(tl.float32)
    tmp1 = tl.load(in_ptr0 + (x0), None).to(tl.float32)
    tmp2 = tmp0 + tmp1
    tl.store(in_out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp2, None)


def get_args():
    arg_0 = rand_strided((128, 624, 1, 1), (624, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((624,), (1,), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_convolution_134.run(*args, 79872, grid=grid(79872), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_convolution_134.benchmark_all_configs(*args, 79872, grid=grid(79872))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    # NOTE(review): [0] assumes do_bench returns a sequence -- confirm for the installed Triton.
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    # num_in_out_args=1 presumably accounts for the single in_out_ptr0 argument -- confirm.
    num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/mn/cmnvsfbxocmtd6hcxhi5n75ltfyzsnegxzpgv7o57n6ksfhmrkay.py
# Original ATen: aten.mul, aten.sigmoid, aten.silu
# aten.mul => mul_205
# aten.sigmoid => sigmoid_23
# aten.silu => convert_element_type_155, convert_element_type_156, mul_203, sigmoid_21
#
# Pointwise kernel over 15654912 elements. With a = in_ptr0[i] and
# g = in_ptr1[i // 196] it stores
#     out_ptr0[i] = (a * sigmoid(a)) * sigmoid(g)
# so each in_ptr1 value is shared by a run of 196 consecutive elements.
# NOTE(review): this looks like a squeeze-and-excitation style gate applied
# per (batch, channel) -- confirm against the model.
#
# The embedded module also carries a standalone benchmark harness
# (get_args / call / benchmark_all_configs / __main__).
triton_poi_fused_mul_sigmoid_silu_135 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid


@pointwise(size_hints=[16777216], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3), equal_to_1=())]})
@triton.jit
def triton_poi_fused_mul_sigmoid_silu_135(in_ptr0, in_ptr1, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    # The runtime size is overwritten with the constant baked in at codegen time.
    xnumel = 15654912
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    # xmask is computed but never used: the loads and the store below pass None.
    xmask = xindex < xnumel
    x2 = xindex
    # One in_ptr1 entry per group of 196 consecutive elements.
    x1 = (xindex // 196)
    tmp0 = tl.load(in_ptr0 + (x2), None).to(tl.float32)
    tmp5 = tl.load(in_ptr1 + (x1), None).to(tl.float32)
    tmp1 = tmp0.to(tl.float32)
    tmp2 = tl.sigmoid(tmp1)
    tmp3 = tmp1 * tmp2
    tmp4 = tmp3.to(tl.float32)
    tmp6 = tl.sigmoid(tmp5)
    tmp7 = tmp4 * tmp6
    tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp7, None)


def get_args():
    arg_0 = rand_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 624, 1, 1), (624, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    arg_2 = rand_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1, arg_2,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused_mul_sigmoid_silu_135.run(*args, 15654912, grid=grid(15654912), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused_mul_sigmoid_silu_135.benchmark_all_configs(*args, 15654912, grid=grid(15654912))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    # NOTE(review): [0] assumes do_bench returns a sequence -- confirm for the installed Triton.
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/x6/cx65e4iuztiipxmj3kt6pdmtbqk6eud6jbchpllpc6tinjqjiko7.py
# Original ATen: aten.split_with_sizes
# aten.split_with_sizes => getitem_164
#
# Strided gather over 7827456 output elements:
#     out_ptr0[i] = in_ptr0[(i % 61152) + 122304 * (i // 61152)]
# i.e. the first 61152 elements of every 122304-element input row are packed
# contiguously into the output. With the get_args() shapes this is the first
# 312 of 624 channels.
#
# The embedded module also carries a standalone benchmark harness
# (get_args / call / benchmark_all_configs / __main__).
triton_poi_fused_split_with_sizes_136 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid


@pointwise(size_hints=[8388608], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_split_with_sizes_136(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    # The runtime size is overwritten with the constant baked in at codegen time.
    xnumel = 7827456
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    # xmask is computed but never used: the load and the store below pass None.
    xmask = xindex < xnumel
    # x0: offset inside an output row; x1: row index; x2: flat output index.
    x0 = xindex % 61152
    x1 = (xindex // 61152)
    x2 = xindex
    # Input rows are 122304 elements apart; only the first 61152 of each are read.
    tmp0 = tl.load(in_ptr0 + (x0 + (122304*x1)), None).to(tl.float32)
    tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp0, None)


def get_args():
    arg_0 = rand_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 312, 14, 14), (61152, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused_split_with_sizes_136.run(*args, 7827456, grid=grid(7827456), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused_split_with_sizes_136.benchmark_all_configs(*args, 7827456, grid=grid(7827456))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    # NOTE(review): [0] assumes do_bench returns a sequence -- confirm for the installed Triton.
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/la/cla6pjsd7zhhqofhjak5ak67vzu4puphrosszybe7he2qayusdf7.py
# Original ATen: aten.split_with_sizes
# aten.split_with_sizes => getitem_165
#
# Strided gather over 7827456 output elements:
#     out_ptr0[i] = in_ptr0[61152 + (i % 61152) + 122304 * (i // 61152)]
# i.e. the second 61152 elements of every 122304-element input row are packed
# contiguously into the output. With the get_args() shapes this is the last
# 312 of 624 channels.
#
# The embedded module also carries a standalone benchmark harness
# (get_args / call / benchmark_all_configs / __main__).
triton_poi_fused_split_with_sizes_137 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid


@pointwise(size_hints=[8388608], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_split_with_sizes_137(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    # The runtime size is overwritten with the constant baked in at codegen time.
    xnumel = 7827456
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    # xmask is computed but never used: the load and the store below pass None.
    xmask = xindex < xnumel
    # x0: offset inside an output row; x1: row index; x2: flat output index.
    x0 = xindex % 61152
    x1 = (xindex // 61152)
    x2 = xindex
    # The constant 61152 skips the first half of each 122304-element input row.
    tmp0 = tl.load(in_ptr0 + (61152 + x0 + (122304*x1)), None).to(tl.float32)
    tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp0, None)


def get_args():
    arg_0 = rand_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 312, 14, 14), (61152, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused_split_with_sizes_137.run(*args, 7827456, grid=grid(7827456), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused_split_with_sizes_137.benchmark_all_configs(*args, 7827456, grid=grid(7827456))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    # NOTE(review): [0] assumes do_bench returns a sequence -- confirm for the installed Triton.
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/hz/chz3fw3o5fqe2cnzuiuggnyrnh3xzhhrw266gsgpwo3xgo4buev6.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_163
#
# Pointwise copy of 16224 elements: out_ptr0[i] = in_ptr0[i]. The signature
# declares in_ptr0 as fp32 and out_ptr0 as fp16.
# NOTE(review): the narrowing to fp16 presumably happens through tl.store's
# implicit cast to the pointer element type -- confirm.
#
# The embedded module also carries a standalone benchmark harness
# (get_args / call / benchmark_all_configs / __main__).
triton_poi_fused__to_copy_138 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid


@pointwise(size_hints=[16384], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_138(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    # The runtime size is overwritten with the constant baked in at codegen time.
    xnumel = 16224
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    # Guards the tail block against out-of-range lanes.
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)


def get_args():
    arg_0 = rand_strided((52, 312, 1, 1), (312, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((52, 312, 1, 1), (312, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_138.run(*args, 16224, grid=grid(16224), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_138.benchmark_all_configs(*args, 16224, grid=grid(16224))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    # NOTE(review): [0] assumes do_bench returns a sequence -- confirm for the installed Triton.
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/vk/cvkwpn3gas36vom5boko6m7yjnsu4h3jn3s2dj74g4puyptio4dc.py
# Original ATen: aten.cat
# aten.cat => cat_18
#
# Strided scatter over 1304576 input elements:
#     out_ptr0[(i % 10192) + 20384 * (i // 10192)] = in_ptr0[i]
# i.e. each contiguous 10192-element input row is written to the start of an
# output row whose stride is 20384.
# NOTE(review): out_ptr0 is presumably a view into the wider concatenation
# buffer (52 of 104 channels) -- confirm against the caller.
#
# The embedded module also carries a standalone benchmark harness
# (get_args / call / benchmark_all_configs / __main__).
triton_poi_fused_cat_139 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid


@pointwise(size_hints=[2097152], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_cat_139(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    # The runtime size is overwritten with the constant baked in at codegen time.
    xnumel = 1304576
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    # xmask is computed but never used: the load and the store below pass None.
    xmask = xindex < xnumel
    # x2: flat input index; x0: offset inside a row; x1: row index.
    x2 = xindex
    x0 = xindex % 10192
    x1 = (xindex // 10192)
    tmp0 = tl.load(in_ptr0 + (x2), None).to(tl.float32)
    # Output rows are 20384 elements apart, twice the input row length.
    tl.store(out_ptr0 + (x0 + (20384*x1) + tl.zeros([XBLOCK], tl.int32)), tmp0, None)


def get_args():
    arg_0 = rand_strided((128, 52, 14, 14), (10192, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 52, 14, 14), (20384, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused_cat_139.run(*args, 1304576, grid=grid(1304576), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused_cat_139.benchmark_all_configs(*args, 1304576, grid=grid(1304576))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    # NOTE(review): [0] assumes do_bench returns a sequence -- confirm for the installed Triton.
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/qp/cqptss63kr7w22hqbaitci52teddi4haqxu6zws6bzlk7k3r4p5w.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.add
# aten._native_batch_norm_legit_functional => add_136, add_139, convert_element_type_165, convert_element_type_166, mul_206, mul_212, rsqrt_26, sub_26, var_mean_26
# aten.add => add_140
#
# The string below is the source of a standalone module: one pointwise Triton
# kernel plus a benchmark harness (get_args / call / benchmark_all_configs and
# a __main__ section that prints time, GB moved and GB/s).
#
# Kernel, for every flat index x3 with c = (x3 // 196) % 104:
#   out_ptr0[x3] = (in_ptr0[x3] - in_ptr1[c])
#                  * rsqrt(in_ptr2[c] / 25088.0 + 1e-05)
#                  * in_ptr3[c] + in_ptr4[c] + in_ptr5[x3]
# - xnumel is reassigned to 2609152 inside the body, so the value passed at
#   launch does not bound the index.
# - Every load/store passes None as its mask; xmask is computed but unused.
# NOTE(review): in_ptr1 / in_ptr2 look like the per-channel mean and the
# per-channel sum of squared deviations (25088 == 128 * 14 * 14), and
# in_ptr3 / in_ptr4 like the affine weight and bias -- confirm against the
# kernels that produce them.
#
# Harness: get_args() allocates the seven tensor arguments with rand_strided
# using the shapes/strides listed there; call() launches the kernel on device
# 0 with the stream returned by get_cuda_stream(0).
triton_poi_fused__native_batch_norm_legit_functional_add_140 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp16', 6: '*fp16', 7: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7), equal_to_1=())]})
@triton.jit
def triton_poi_fused__native_batch_norm_legit_functional_add_140(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 2609152
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x3 = xindex
    x1 = (xindex // 196) % 104
    tmp0 = tl.load(in_ptr0 + (x3), None).to(tl.float32)
    tmp2 = tl.load(in_ptr1 + (x1), None)
    tmp4 = tl.load(in_ptr2 + (x1), None)
    tmp11 = tl.load(in_ptr3 + (x1), None)
    tmp13 = tl.load(in_ptr4 + (x1), None)
    tmp16 = tl.load(in_ptr5 + (x3), None).to(tl.float32)
    tmp1 = tmp0.to(tl.float32)
    tmp3 = tmp1 - tmp2
    tmp5 = 25088.0
    tmp6 = tmp4 / tmp5
    tmp7 = 1e-05
    tmp8 = tmp6 + tmp7
    tmp9 = tl.math.rsqrt(tmp8)
    tmp10 = tmp3 * tmp9
    tmp12 = tmp10 * tmp11
    tmp14 = tmp12 + tmp13
    tmp15 = tmp14.to(tl.float32)
    tmp17 = tmp15 + tmp16
    tl.store(out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp17, None)

def get_args():
    arg_0 = rand_strided((128, 104, 14, 14), (20384, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((1, 104, 1, 1), (104, 1, 104, 104), device='cuda:0', dtype=torch.float32)
    arg_2 = rand_strided((1, 104, 1, 1), (104, 1, 104, 104), device='cuda:0', dtype=torch.float32)
    arg_3 = rand_strided((104,), (1,), device='cuda:0', dtype=torch.float32)
    arg_4 = rand_strided((104,), (1,), device='cuda:0', dtype=torch.float32)
    arg_5 = rand_strided((128, 104, 14, 14), (20384, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_6 = rand_strided((128, 104, 14, 14), (20384, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6,

def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__native_batch_norm_legit_functional_add_140.run(*args, 2609152, grid=grid(2609152), stream=stream0)

def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__native_batch_norm_legit_functional_add_140.benchmark_all_configs(*args, 2609152, grid=grid(2609152))

if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/zf/czfuu6dquzuudpwuu3bwtsd76hxju2ps43qrqh5npmmaayzjihi2.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_215
#
# The string below is the source of a standalone module: one pointwise Triton
# kernel plus a benchmark harness (get_args / call / benchmark_all_configs and
# a __main__ section that prints time, GB moved and GB/s).
#
# Kernel: masked element-wise copy, out_ptr0[x0] = in_ptr0[x0] for
# x0 < 64896 (xnumel is reassigned to that constant inside the body, so the
# value passed at launch does not bound the index).  Per the meta signature
# in_ptr0 is '*fp32' and out_ptr0 is '*fp16'; the in-kernel .to(tl.float32)
# leaves the loaded value as float32, so any narrowing happens at tl.store.
#
# Harness: get_args() allocates a float32 and a float16 tensor of shape
# (624, 104, 1, 1) with rand_strided; call() launches the kernel on device 0
# with the stream returned by get_cuda_stream(0).
triton_poi_fused__to_copy_141 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[65536], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_141(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 64896
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)

def get_args():
    arg_0 = rand_strided((624, 104, 1, 1), (104, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((624, 104, 1, 1), (104, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,

def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_141.run(*args, 64896, grid=grid(64896), stream=stream0)

def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_141.benchmark_all_configs(*args, 64896, grid=grid(64896))

if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/hj/chjvw2ozl3ipjwmejxog5ggykkaa6pfst7j432f3sbhnt2kvth4p.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.add, aten.clone, aten.fill, aten.mul, aten.sigmoid, aten.silu, aten.sub
# aten._native_batch_norm_legit_functional => add_174, add_177, convert_element_type_216, convert_element_type_217, mul_263, mul_269, rsqrt_33, sub_33, var_mean_33
# aten.add => add_341
# aten.clone => clone_24
# aten.fill => full_like_23
# aten.mul => mul_787, mul_788
# aten.sigmoid => sigmoid_87
# aten.silu => convert_element_type_218, convert_element_type_219, mul_270, sigmoid_32
# aten.sub => sub_185
#
# The string below is the source of a standalone module: one pointwise Triton
# kernel plus a benchmark harness (get_args / call / benchmark_all_configs and
# a __main__ section that prints time, GB moved and GB/s).
#
# Kernel, for every flat index x3 with c = (x3 // 196) % 624:
#   y            = (in_ptr0[x3] - in_ptr1[c])
#                  * rsqrt(in_ptr2[c] / 25088.0 + 1e-05)
#                  * in_ptr3[c] + in_ptr4[c]
#   out_ptr1[x3] = y * sigmoid(y)
#   out_ptr2[x3] = sigmoid(y) * (y * (1.0 - sigmoid(y)) + 1.0)
# - xnumel is reassigned to 15654912 inside the body, so the value passed at
#   launch does not bound the index.
# - Every load/store passes None as its mask; xmask is computed but unused.
# NOTE(review): the out_ptr2 expression has the form of d/dy [y * sigmoid(y)],
# presumably saved for the backward pass -- confirm with the consumer.
# NOTE(review): in_ptr1 / in_ptr2 look like the per-channel mean and the
# per-channel sum of squared deviations (25088 == 128 * 14 * 14), and
# in_ptr3 / in_ptr4 like the affine weight and bias -- confirm against the
# kernels that produce them.
#
# Harness: get_args() allocates the seven tensor arguments with rand_strided
# using the shapes/strides listed there; call() launches the kernel on device
# 0 with the stream returned by get_cuda_stream(0).
triton_poi_fused__native_batch_norm_legit_functional_add_clone_fill_mul_sigmoid_silu_sub_142 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[16777216], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp16', 6: '*fp16', 7: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7), equal_to_1=())]})
@triton.jit
def triton_poi_fused__native_batch_norm_legit_functional_add_clone_fill_mul_sigmoid_silu_sub_142(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, out_ptr1, out_ptr2, xnumel, XBLOCK : tl.constexpr):
    xnumel = 15654912
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x3 = xindex
    x1 = (xindex // 196) % 624
    tmp0 = tl.load(in_ptr0 + (x3), None).to(tl.float32)
    tmp2 = tl.load(in_ptr1 + (x1), None)
    tmp4 = tl.load(in_ptr2 + (x1), None)
    tmp11 = tl.load(in_ptr3 + (x1), None)
    tmp13 = tl.load(in_ptr4 + (x1), None)
    tmp1 = tmp0.to(tl.float32)
    tmp3 = tmp1 - tmp2
    tmp5 = 25088.0
    tmp6 = tmp4 / tmp5
    tmp7 = 1e-05
    tmp8 = tmp6 + tmp7
    tmp9 = tl.math.rsqrt(tmp8)
    tmp10 = tmp3 * tmp9
    tmp12 = tmp10 * tmp11
    tmp14 = tmp12 + tmp13
    tmp15 = tmp14.to(tl.float32)
    tmp16 = tmp15.to(tl.float32)
    tmp17 = tl.sigmoid(tmp16)
    tmp18 = tmp16 * tmp17
    tmp19 = tmp18.to(tl.float32)
    tmp20 = tl.sigmoid(tmp15)
    tmp21 = 1.0
    tmp22 = tmp21 - tmp20
    tmp23 = tmp15 * tmp22
    tmp24 = tmp23 + tmp21
    tmp25 = tmp20 * tmp24
    tl.store(out_ptr1 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp19, None)
    tl.store(out_ptr2 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp25, None)

def get_args():
    arg_0 = rand_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((1, 624, 1, 1), (624, 1, 624, 624), device='cuda:0', dtype=torch.float32)
    arg_2 = rand_strided((1, 624, 1, 1), (624, 1, 624, 624), device='cuda:0', dtype=torch.float32)
    arg_3 = rand_strided((624,), (1,), device='cuda:0', dtype=torch.float32)
    arg_4 = rand_strided((624,), (1,), device='cuda:0', dtype=torch.float32)
    arg_5 = rand_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_6 = rand_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6,

def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__native_batch_norm_legit_functional_add_clone_fill_mul_sigmoid_silu_sub_142.run(*args, 15654912, grid=grid(15654912), stream=stream0)

def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__native_batch_norm_legit_functional_add_clone_fill_mul_sigmoid_silu_sub_142.benchmark_all_configs(*args, 15654912, grid=grid(15654912))

if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/by/cbyy7pyyeky2s6spdugmwrwqdvrjfdr7qwcxwapwzhywwhng5ujh.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_220
#
# The string below is the source of a standalone module: one pointwise Triton
# kernel plus a benchmark harness (get_args / call / benchmark_all_configs and
# a __main__ section that prints time, GB moved and GB/s).
#
# Kernel: masked element-wise copy, out_ptr0[x0] = in_ptr0[x0] for
# x0 < 5616 (xnumel is reassigned to that constant inside the body, so the
# value passed at launch does not bound the index).  Per the meta signature
# in_ptr0 is '*fp32' and out_ptr0 is '*fp16'; the in-kernel .to(tl.float32)
# leaves the loaded value as float32, so any narrowing happens at tl.store.
#
# Harness: get_args() allocates a float32 and a float16 tensor of shape
# (624, 1, 3, 3) with rand_strided; call() launches the kernel on device 0
# with the stream returned by get_cuda_stream(0).
triton_poi_fused__to_copy_143 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[8192], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_143(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 5616
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)

def get_args():
    arg_0 = rand_strided((624, 1, 3, 3), (9, 9, 3, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((624, 1, 3, 3), (9, 9, 3, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,

def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_143.run(*args, 5616, grid=grid(5616), stream=stream0)

def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_143.benchmark_all_configs(*args, 5616, grid=grid(5616))

if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/ji/cjiljcgh2wc2yighvqz76tkwzti3zjjxpno5dng2ammsgvfhwvv7.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_226
#
# The string below is the source of a standalone module: one pointwise Triton
# kernel plus a benchmark harness (get_args / call / benchmark_all_configs and
# a __main__ section that prints time, GB moved and GB/s).
#
# Kernel: masked element-wise copy, out_ptr0[x0] = in_ptr0[x0] for
# x0 < 32448 (xnumel is reassigned to that constant inside the body, so the
# value passed at launch does not bound the index).  Per the meta signature
# in_ptr0 is '*fp32' and out_ptr0 is '*fp16'; the in-kernel .to(tl.float32)
# leaves the loaded value as float32, so any narrowing happens at tl.store.
#
# Harness: get_args() allocates a float32 and a float16 tensor of shape
# (52, 624, 1, 1) with rand_strided; call() launches the kernel on device 0
# with the stream returned by get_cuda_stream(0).
triton_poi_fused__to_copy_144 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[32768], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_144(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 32448
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)

def get_args():
    arg_0 = rand_strided((52, 624, 1, 1), (624, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((52, 624, 1, 1), (624, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,

def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_144.run(*args, 32448, grid=grid(32448), stream=stream0)

def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_144.benchmark_all_configs(*args, 32448, grid=grid(32448))

if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/tv/ctvzn4xpxjd5g7oyd7nntvpnuyovmntivt2fyxf53c3mblh6dshr.py
# Original ATen: aten._to_copy, aten.convolution
# aten._to_copy => convert_element_type_225
# aten.convolution => convolution_86
#
# The string below is the source of a standalone module: one pointwise Triton
# kernel plus a benchmark harness (get_args / call / benchmark_all_configs and
# a __main__ section that prints time, GB moved and GB/s).
#
# Kernel: masked element-wise copy, out_ptr0[x0] = in_ptr0[x0] for x0 < 52
# (xnumel is reassigned to that constant inside the body, so the value passed
# at launch does not bound the index).  Per the meta signature in_ptr0 is
# '*fp32' and out_ptr0 is '*fp16'; the in-kernel .to(tl.float32) leaves the
# loaded value as float32, so any narrowing happens at tl.store.
# Unlike the larger copy kernels, divisible_by_16 lists only the two pointer
# arguments (52 is not a multiple of 16).
#
# Harness: get_args() allocates a float32 and a float16 tensor of shape (52,)
# with rand_strided; call() launches the kernel on device 0 with the stream
# returned by get_cuda_stream(0).
triton_poi_fused__to_copy_convolution_145 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[64], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_convolution_145(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 52
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)

def get_args():
    arg_0 = rand_strided((52,), (1,), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((52,), (1,), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,

def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_convolution_145.run(*args, 52, grid=grid(52), stream=stream0)

def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_convolution_145.benchmark_all_configs(*args, 52, grid=grid(52))

if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/sf/csfcbn5kzqvvv6krkr7772b7z66pr6xsuxyhtbgglpb26gkfc5h3.py
# Original ATen: aten._to_copy, aten.convolution, aten.silu
# aten._to_copy => convert_element_type_225
# aten.convolution => convolution_86
# aten.silu => convert_element_type_227, convert_element_type_228, mul_279, sigmoid_34
#
# The string below is the source of a standalone module: one pointwise Triton
# kernel plus a benchmark harness (get_args / call / benchmark_all_configs and
# a __main__ section that prints time, GB moved and GB/s).
#
# Kernel, for every flat index x2 < 6656 with x0 = x2 % 52:
#   s               = in_out_ptr0[x2] + in_ptr0[x0]
#   in_out_ptr0[x2] = s                 (updated in place)
#   out_ptr0[x2]    = s * sigmoid(s)
# - in_out_ptr0 is both read and written, matching
#   'mutated_arg_names': ['in_out_ptr0'] and num_in_out_args=1 in __main__.
# - xnumel is reassigned to 6656 inside the body, so the value passed at
#   launch does not bound the index.
# NOTE(review): in_ptr0 (52 values, broadcast over dim 0 of the (128, 52, 1, 1)
# buffer) is presumably the convolution bias -- confirm with the caller.
#
# Harness: get_args() allocates the three float16 arguments with rand_strided;
# call() launches the kernel on device 0 with the stream returned by
# get_cuda_stream(0).
triton_poi_fused__to_copy_convolution_silu_146 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[8192], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_convolution_silu_146(in_out_ptr0, in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 6656
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x2 = xindex
    x0 = xindex % 52
    tmp0 = tl.load(in_out_ptr0 + (x2), xmask).to(tl.float32)
    tmp1 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
    tmp2 = tmp0 + tmp1
    tmp3 = tmp2.to(tl.float32)
    tmp4 = tl.sigmoid(tmp3)
    tmp5 = tmp3 * tmp4
    tmp6 = tmp5.to(tl.float32)
    tl.store(in_out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp2, xmask)
    tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp6, xmask)

def get_args():
    arg_0 = rand_strided((128, 52, 1, 1), (52, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((52,), (1,), device='cuda:0', dtype=torch.float16)
    arg_2 = rand_strided((128, 52, 1, 1), (52, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1, arg_2,

def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_convolution_silu_146.run(*args, 6656, grid=grid(6656), stream=stream0)

def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_convolution_silu_146.benchmark_all_configs(*args, 6656, grid=grid(6656))

if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/ik/cikbdagjlpyedk3af4it4ary3yx6g4gpfifz25vxc7r25q224d43.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_230
#
# The string below is the source of a standalone module: one pointwise Triton
# kernel plus a benchmark harness (get_args / call / benchmark_all_configs and
# a __main__ section that prints time, GB moved and GB/s).
#
# Kernel: masked element-wise copy, out_ptr0[x0] = in_ptr0[x0] for
# x0 < 32448 (xnumel is reassigned to that constant inside the body, so the
# value passed at launch does not bound the index).  Per the meta signature
# in_ptr0 is '*fp32' and out_ptr0 is '*fp16'; the in-kernel .to(tl.float32)
# leaves the loaded value as float32, so any narrowing happens at tl.store.
#
# Harness: get_args() allocates a float32 and a float16 tensor of shape
# (624, 52, 1, 1) with rand_strided; call() launches the kernel on device 0
# with the stream returned by get_cuda_stream(0).
triton_poi_fused__to_copy_147 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[32768], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_147(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 32448
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)

def get_args():
    arg_0 = rand_strided((624, 52, 1, 1), (52, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((624, 52, 1, 1), (52, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,

def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_147.run(*args, 32448, grid=grid(32448), stream=stream0)

def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_147.benchmark_all_configs(*args, 32448, grid=grid(32448))

if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/xb/cxb5poqew2wpvurnixpesahtvwzfiadakefnqud2hzokvcgnemog.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_231
#
# The string below is the source of a standalone module: one pointwise Triton
# kernel plus a benchmark harness (get_args / call / benchmark_all_configs and
# a __main__ section that prints time, GB moved and GB/s).
#
# Kernel: masked element-wise copy, out_ptr0[x0] = in_ptr0[x0] for
# x0 < 99840 (xnumel is reassigned to that constant inside the body, so the
# value passed at launch does not bound the index).  Per the meta signature
# in_ptr0 is '*fp32' and out_ptr0 is '*fp16'; the in-kernel .to(tl.float32)
# leaves the loaded value as float32, so any narrowing happens at tl.store.
#
# Harness: get_args() allocates a float32 and a float16 tensor of shape
# (160, 624, 1, 1) with rand_strided; call() launches the kernel on device 0
# with the stream returned by get_cuda_stream(0).
triton_poi_fused__to_copy_148 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[131072], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_148(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 99840
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)

def get_args():
    arg_0 = rand_strided((160, 624, 1, 1), (624, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((160, 624, 1, 1), (624, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,

def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_148.run(*args, 99840, grid=grid(99840), stream=stream0)

def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_148.benchmark_all_configs(*args, 99840, grid=grid(99840))

if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/z7/cz7tf27fdbr7sfmkjktkou2imnr2mnlt7hicdhvs4chclhrw6j37.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => convert_element_type_232, var_mean_35
#
# The string below is the source of a standalone module: one looped-reduction
# Triton kernel plus a benchmark harness (get_args / call /
# benchmark_all_configs and a __main__ section that prints time, GB moved and
# GB/s).
#
# Kernel: produces 640 partial sums.  Output index x3 splits into
# x0 = x3 % 160 and x1 = x3 // 160; for each r2 in [0, 6272) it reads
#   in_ptr0[196*x0 + 31360*(r2 // 196) + 1003520*x1 + (r2 % 196)]
# and accumulates the float32 values, so
#   out_ptr0[x3] = sum over r2 of that element.
# With the (128, 160, 14, 14) / (31360, 196, 14, 1) layout from get_args(),
# 31360 is the dim-0 stride, 196 the dim-1 stride, 6272 == 32 * 196 and
# 1003520 == 32 * 31360: each output covers 32 consecutive dim-0 entries of
# one dim-1 slice, and x1 selects which group of 32.
# - The reduction runs RBLOCK columns at a time into an [XBLOCK, RBLOCK]
#   accumulator that is only updated where rmask & xmask holds, then tl.sum
#   collapses axis 1.
# - xnumel / rnumel are reassigned to 640 / 6272 inside the body, so the
#   values passed at launch do not bound the indices.
#
# Harness: get_args() allocates the float16 input and the float32
# (1, 160, 1, 1, 4) output with rand_strided; call() launches the kernel on
# device 0 with the stream returned by get_cuda_stream(0).
triton_red_fused__native_batch_norm_legit_functional_149 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@reduction(
    size_hints=[1024, 8192],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*fp16', 1: '*fp32', 2: 'i32', 3: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_149(in_ptr0, out_ptr0, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
    xnumel = 640
    rnumel = 6272
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rbase = tl.arange(0, RBLOCK)[None, :]
    x0 = xindex % 160
    x1 = (xindex // 160)
    _tmp2 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
    x3 = xindex
    for roffset in range(0, rnumel, RBLOCK):
        rindex = roffset + rbase
        rmask = rindex < rnumel
        r2 = rindex
        tmp0 = tl.load(in_ptr0 + ((196*x0) + (31360*(r2 // 196)) + (1003520*x1) + (r2 % 196)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp1 = tmp0.to(tl.float32)
        _tmp2 = tl.where(rmask & xmask, _tmp2 + tmp1, _tmp2)
    tmp2 = tl.sum(_tmp2, 1)[:, None]
    tl.store(out_ptr0 + x3, tmp2, xmask)

def get_args():
    arg_0 = rand_strided((128, 160, 14, 14), (31360, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((1, 160, 1, 1, 4), (640, 1, 640, 640, 160), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1,

def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_red_fused__native_batch_norm_legit_functional_149.run(*args, 640, 6272, grid=grid(640), stream=stream0)

def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_red_fused__native_batch_norm_legit_functional_149.benchmark_all_configs(*args, 640, 6272, grid=grid(640))

if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/hi/chiuub4omrsfimb4p75xk4vlbwv5mhljjinvhtfc3kqhmfnsyq4q.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => add_185, convert_element_type_232, mul_282, mul_283, var_mean_35
#
# The string below is the source of a standalone module: one persistent
# reduction Triton kernel plus a benchmark harness (get_args / call /
# benchmark_all_configs and a __main__ section that prints time, GB moved and
# GB/s).
#
# Kernel, for every x0 < 160 (RBLOCK is fixed at 4, so the whole reduction is
# done in one step without a loop):
#   total           = sum over r1 in [0, 4) of in_ptr0[x0 + 160*r1]
#   m               = total / 25088.0
#   in_out_ptr0[x0] = m
#   out_ptr0[x0]    = m * 0.1 + in_ptr1[x0] * 0.9
# - in_out_ptr0 is listed in 'mutated_arg_names' but is only stored to here,
#   never loaded.
# - xnumel / rnumel are reassigned to 160 / 4 inside the body, so the values
#   passed at launch do not bound the indices.
# NOTE(review): in_ptr0 has the (1, 160, 1, 1, 4) layout of 640 partial sums,
# so m is presumably the per-channel batch mean (25088 == 128 * 14 * 14) and
# out_ptr0 a running-mean update with momentum 0.1 -- confirm with the caller.
#
# Harness: get_args() allocates the four float32 arguments with rand_strided;
# call() launches the kernel on device 0 with the stream returned by
# get_cuda_stream(0).
triton_per_fused__native_batch_norm_legit_functional_150 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import persistent_reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@persistent_reduction(
    size_hints=[256, 4],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*fp32', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: 'i32', 5: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4), equal_to_1=())]}
)
@triton.jit
def triton_per_fused__native_batch_norm_legit_functional_150(in_out_ptr0, in_ptr0, in_ptr1, out_ptr0, xnumel, rnumel, XBLOCK : tl.constexpr):
    xnumel = 160
    rnumel = 4
    RBLOCK: tl.constexpr = 4
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rindex = tl.arange(0, RBLOCK)[None, :]
    rmask = rindex < rnumel
    r1 = rindex
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0 + (160*r1)), rmask & xmask, other=0)
    tmp8 = tl.load(in_ptr1 + (x0), xmask)
    tmp2 = tl.where(rmask & xmask, tmp0, 0)
    tmp3 = tl.sum(tmp2, 1)[:, None]
    tmp4 = 25088.0
    tmp5 = tmp3 / tmp4
    tmp6 = 0.1
    tmp7 = tmp5 * tmp6
    tmp9 = 0.9
    tmp10 = tmp8 * tmp9
    tmp11 = tmp7 + tmp10
    tl.store(in_out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp5, xmask)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp11, xmask)

def get_args():
    arg_0 = rand_strided((1, 160, 1, 1), (160, 1, 160, 160), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((1, 160, 1, 1, 4), (640, 1, 640, 640, 160), device='cuda:0', dtype=torch.float32)
    arg_2 = rand_strided((160,), (1,), device='cuda:0', dtype=torch.float32)
    arg_3 = rand_strided((160,), (1,), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1, arg_2, arg_3,

def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_per_fused__native_batch_norm_legit_functional_150.run(*args, 160, 4, grid=grid(160), stream=stream0)

def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_per_fused__native_batch_norm_legit_functional_150.benchmark_all_configs(*args, 160, 4, grid=grid(160))

if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/4b/c4bvtzglf3eflssbrfazd6mjdgmrhoujx4xq6fve2tfmkwyeseru.py | |
# Original ATen: aten._native_batch_norm_legit_functional | |
# aten._native_batch_norm_legit_functional => convert_element_type_232, var_mean_35 | |
# Looped Triton reduction kernel registered through async_compile.triton.
# For each of 640 outputs (160 channels x 4 chunks) it accumulates
# sum((in_ptr0 - in_ptr1[channel]) ** 2) over 6272 elements and stores the
# partial sum in out_ptr0. in_ptr1 is presumably the per-channel mean produced
# by an earlier kernel -- verify against the caller.
# The embedded source also carries a small benchmark harness (get_args / call /
# benchmark_all_configs / __main__) used when the kernel file is run directly.
triton_red_fused__native_batch_norm_legit_functional_151 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor

from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@reduction(
    size_hints=[1024, 8192],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*fp16', 1: '*fp32', 2: '*fp32', 3: 'i32', 4: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_151(in_ptr0, in_ptr1, out_ptr0, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
    # The problem sizes are baked in; the runtime xnumel/rnumel arguments are overwritten.
    xnumel = 640
    rnumel = 6272
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rbase = tl.arange(0, RBLOCK)[None, :]
    # x0: channel (0..159); x1: chunk (0..3) of the reduced range.
    x0 = xindex % 160
    x1 = (xindex // 160)
    tmp2 = tl.load(in_ptr1 + (x0), xmask)
    _tmp5 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
    x3 = xindex
    for roffset in range(0, rnumel, RBLOCK):
        rindex = roffset + rbase
        rmask = rindex < rnumel
        r2 = rindex
        # r2 // 196 steps through 32 samples of chunk x1 (1003520 = 32 * 31360),
        # r2 % 196 through the 196 contiguous positions of one channel plane.
        tmp0 = tl.load(in_ptr0 + ((196*x0) + (31360*(r2 // 196)) + (1003520*x1) + (r2 % 196)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp1 = tmp0.to(tl.float32)
        tmp3 = tmp1 - tmp2
        tmp4 = tmp3 * tmp3
        # Only in-range lanes contribute to the running accumulator.
        _tmp5 = tl.where(rmask & xmask, _tmp5 + tmp4, _tmp5)
    tmp5 = tl.sum(_tmp5, 1)[:, None]
    tl.store(out_ptr0 + x3, tmp5, xmask)


def get_args():
    arg_0 = rand_strided((128, 160, 14, 14), (31360, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((1, 160, 1, 1), (160, 1, 160, 160), device='cuda:0', dtype=torch.float32)
    arg_2 = rand_strided((1, 160, 1, 1, 4), (640, 1, 640, 640, 160), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1, arg_2,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_red_fused__native_batch_norm_legit_functional_151.run(*args, 640, 6272, grid=grid(640), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_red_fused__native_batch_norm_legit_functional_151.benchmark_all_configs(*args, 640, 6272, grid=grid(640))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/vo/cvo4hmhbgzgrpmknfykmtsywf2s2dd4n7xkbf6ll6hxoj5qx3zny.py | |
# Original ATen: aten._native_batch_norm_legit_functional | |
# aten._native_batch_norm_legit_functional => add_184, add_186, convert_element_type_232, mul_284, mul_285, mul_286, rsqrt_35, squeeze_106, var_mean_35 | |
# Persistent (single-pass) Triton reduction registered through async_compile.triton.
# For each of 160 channels it sums the 4 partial values in_ptr0[c + 160 * r] and
# emits three results:
#   out_ptr0[c] = the summed value
#   out_ptr1[c] = rsqrt(sum / 25088 + 1e-05)
#   out_ptr2[c] = (sum / 25088) * 1.0000398612827361 * 0.1 + in_ptr1[c] * 0.9
# in_ptr1 is presumably the previous running variance -- verify against the caller.
triton_per_fused__native_batch_norm_legit_functional_152 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import persistent_reduction
from torch._inductor.utils import instance_descriptor

from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@persistent_reduction(
    size_hints=[256, 4],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*fp32', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: 'i32', 6: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5), equal_to_1=())]}
)
@triton.jit
def triton_per_fused__native_batch_norm_legit_functional_152(in_ptr0, in_ptr1, out_ptr0, out_ptr1, out_ptr2, xnumel, rnumel, XBLOCK : tl.constexpr):
    # The problem sizes are baked in; the runtime xnumel/rnumel arguments are overwritten.
    xnumel = 160
    rnumel = 4
    # The whole reduction axis fits in one block, so no loop over r is needed.
    RBLOCK: tl.constexpr = 4
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rindex = tl.arange(0, RBLOCK)[None, :]
    rmask = rindex < rnumel
    r1 = rindex
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0 + (160*r1)), rmask & xmask, other=0)
    tmp13 = tl.load(in_ptr1 + (x0), xmask)
    tmp2 = tl.where(rmask & xmask, tmp0, 0)
    tmp3 = tl.sum(tmp2, 1)[:, None]
    tmp4 = 25088.0
    tmp5 = tmp3 / tmp4
    tmp6 = 1e-05
    tmp7 = tmp5 + tmp6
    tmp8 = tl.math.rsqrt(tmp7)
    # 1.0000398612827361 is approximately 25088 / 25087, i.e. an n / (n - 1) rescale.
    tmp9 = 1.0000398612827361
    tmp10 = tmp5 * tmp9
    # Blend 0.1 * new value with 0.9 * the value already stored in in_ptr1.
    tmp11 = 0.1
    tmp12 = tmp10 * tmp11
    tmp14 = 0.9
    tmp15 = tmp13 * tmp14
    tmp16 = tmp12 + tmp15
    tl.store(out_ptr1 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp8, xmask)
    tl.store(out_ptr2 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp16, xmask)
    tl.store(out_ptr0 + x0, tmp3, xmask)


def get_args():
    arg_0 = rand_strided((1, 160, 1, 1, 4), (640, 1, 640, 640, 160), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((160,), (1,), device='cuda:0', dtype=torch.float32)
    arg_2 = rand_strided((1, 160, 1, 1), (160, 1, 160, 160), device='cuda:0', dtype=torch.float32)
    arg_3 = rand_strided((160,), (1,), device='cuda:0', dtype=torch.float32)
    arg_4 = rand_strided((160,), (1,), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1, arg_2, arg_3, arg_4,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_per_fused__native_batch_norm_legit_functional_152.run(*args, 160, 4, grid=grid(160), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_per_fused__native_batch_norm_legit_functional_152.benchmark_all_configs(*args, 160, 4, grid=grid(160))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/7r/c7rznphkbqtbupvosh2bugs6qz6cx4gcbzlpwzvb44okdkqoc7am.py | |
# Original ATen: aten._native_batch_norm_legit_functional | |
# aten._native_batch_norm_legit_functional => add_184, add_187, convert_element_type_232, convert_element_type_233, mul_281, mul_287, rsqrt_35, sub_35, var_mean_35 | |
# Pointwise Triton kernel registered through async_compile.triton.
# For every one of 4014080 elements, with channel c = (index // 196) % 160:
#   out_ptr0[i] = (in_ptr0[i] - in_ptr1[c]) * rsqrt(in_ptr2[c] / 25088 + 1e-05)
#                 * in_ptr3[c] + in_ptr4[c]
# in_ptr1..in_ptr4 are presumably mean, summed squared deviation, weight and
# bias of the batch norm -- verify against the caller.
triton_poi_fused__native_batch_norm_legit_functional_153 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor

from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp16', 6: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6), equal_to_1=())]})
@triton.jit
def triton_poi_fused__native_batch_norm_legit_functional_153(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    # The problem size is baked in; the runtime xnumel argument is overwritten.
    xnumel = 4014080
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    # xmask is computed but every load/store below passes None as its mask.
    xmask = xindex < xnumel
    x3 = xindex
    # x1: channel index, 196 contiguous elements per channel plane.
    x1 = (xindex // 196) % 160
    tmp0 = tl.load(in_ptr0 + (x3), None).to(tl.float32)
    tmp2 = tl.load(in_ptr1 + (x1), None)
    tmp4 = tl.load(in_ptr2 + (x1), None)
    tmp11 = tl.load(in_ptr3 + (x1), None)
    tmp13 = tl.load(in_ptr4 + (x1), None)
    tmp1 = tmp0.to(tl.float32)
    tmp3 = tmp1 - tmp2
    tmp5 = 25088.0
    tmp6 = tmp4 / tmp5
    tmp7 = 1e-05
    tmp8 = tmp6 + tmp7
    tmp9 = tl.math.rsqrt(tmp8)
    tmp10 = tmp3 * tmp9
    tmp12 = tmp10 * tmp11
    tmp14 = tmp12 + tmp13
    # Value stays fp32 here; out_ptr0 is declared *fp16 in the signature, so the
    # store presumably narrows it -- confirm against the Triton tl.store semantics.
    tmp15 = tmp14.to(tl.float32)
    tl.store(out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp15, None)


def get_args():
    arg_0 = rand_strided((128, 160, 14, 14), (31360, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((1, 160, 1, 1), (160, 1, 160, 160), device='cuda:0', dtype=torch.float32)
    arg_2 = rand_strided((1, 160, 1, 1), (160, 1, 160, 160), device='cuda:0', dtype=torch.float32)
    arg_3 = rand_strided((160,), (1,), device='cuda:0', dtype=torch.float32)
    arg_4 = rand_strided((160,), (1,), device='cuda:0', dtype=torch.float32)
    arg_5 = rand_strided((128, 160, 14, 14), (31360, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__native_batch_norm_legit_functional_153.run(*args, 4014080, grid=grid(4014080), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__native_batch_norm_legit_functional_153.benchmark_all_configs(*args, 4014080, grid=grid(4014080))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/25/c25hctynyjlhient6ceczvttfs4npfgnfoty5zhk6pn4ckhqm3ka.py | |
# Original ATen: aten._to_copy | |
# aten._to_copy => convert_element_type_234 | |
# Pointwise Triton kernel registered through async_compile.triton.
# Copies 19200 elements from in_ptr0 (declared *fp32) to out_ptr0 (declared
# *fp16) under a bounds mask; the dtype change presumably happens in the store,
# since the in-kernel cast targets fp32 -- confirm against tl.store semantics.
triton_poi_fused__to_copy_154 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor

from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[32768], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_154(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    # The problem size is baked in; the runtime xnumel argument is overwritten.
    xnumel = 19200
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    # Lanes past the end of the tensor are masked off for both load and store.
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)


def get_args():
    arg_0 = rand_strided((240, 80, 1, 1), (80, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((240, 80, 1, 1), (80, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_154.run(*args, 19200, grid=grid(19200), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_154.benchmark_all_configs(*args, 19200, grid=grid(19200))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/yz/cyzjinfa2nwvymmtkqoovwkfypkotg6ek2sunwsuf3vfd35r7l3k.py | |
# Original ATen: aten.cat | |
# aten.cat => cat_25 | |
# Pointwise Triton kernel registered through async_compile.triton.
# Copies 6021120 contiguous input elements into a destination whose outermost
# stride is 94080 instead of 47040: element (x1, x0) of the input goes to
# out_ptr0[x0 + 94080 * x1]. This looks like writing one operand of a
# concatenation into its slot of the wider output -- confirm at the call site.
triton_poi_fused_cat_155 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor

from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[8388608], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_cat_155(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    # The problem size is baked in; the runtime xnumel argument is overwritten.
    xnumel = 6021120
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    # xmask is computed but the load/store below pass None as their mask.
    xmask = xindex < xnumel
    x2 = xindex
    # x0: offset inside one 47040-element slab; x1: slab index.
    x0 = xindex % 47040
    x1 = (xindex // 47040)
    tmp0 = tl.load(in_ptr0 + (x2), None).to(tl.float32)
    tl.store(out_ptr0 + (x0 + (94080*x1) + tl.zeros([XBLOCK], tl.int32)), tmp0, None)


def get_args():
    arg_0 = rand_strided((128, 240, 14, 14), (47040, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 240, 14, 14), (94080, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused_cat_155.run(*args, 6021120, grid=grid(6021120), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused_cat_155.benchmark_all_configs(*args, 6021120, grid=grid(6021120))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/ks/cksikkrsggj4vc4g42rphhvyb3ox3tanqawsqpy2rpo5qbqc3tin.py | |
# Original ATen: aten._native_batch_norm_legit_functional | |
# aten._native_batch_norm_legit_functional => add_189, add_190, add_191, convert_element_type_236, mul_289, mul_290, mul_291, mul_292, mul_293, rsqrt_36, squeeze_109, var_mean_36 | |
# Two-pass looped Triton reduction registered through async_compile.triton.
# For each of 480 channels it reduces 25088 elements of in_ptr0 twice:
#   pass 1: mean = sum(x) / 25088
#           in_out_ptr0[c] = mean
#           out_ptr0[c]    = mean * 0.1 + in_ptr1[c] * 0.9
#   pass 2: ssd = sum((x - mean) ** 2)
#           out_ptr1[c]    = ssd
#           out_ptr2[c]    = rsqrt(ssd / 25088 + 1e-05)
#           out_ptr3[c]    = (ssd / 25088) * 1.0000398612827361 * 0.1 + in_ptr2[c] * 0.9
# in_ptr1 / in_ptr2 are presumably the previous running mean / running variance
# -- verify against the caller.
triton_red_fused__native_batch_norm_legit_functional_156 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor

from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@reduction(
    size_hints=[512, 32768],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*fp32', 1: '*fp16', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: 'i32', 9: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_156(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, out_ptr0, out_ptr1, out_ptr2, out_ptr3, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
    # The problem sizes are baked in; the runtime xnumel/rnumel arguments are overwritten.
    xnumel = 480
    rnumel = 25088
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rbase = tl.arange(0, RBLOCK)[None, :]
    x0 = xindex
    # Pass 1: accumulate the plain sum of the channel.
    _tmp2 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
    for roffset in range(0, rnumel, RBLOCK):
        rindex = roffset + rbase
        rmask = rindex < rnumel
        # r1: position inside a 196-element plane; r2: which plane (stride 94080).
        r1 = rindex % 196
        r2 = (rindex // 196)
        tmp0 = tl.load(in_ptr0 + (r1 + (196*x0) + (94080*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp1 = tmp0.to(tl.float32)
        _tmp2 = tl.where(rmask & xmask, _tmp2 + tmp1, _tmp2)
    tmp2 = tl.sum(_tmp2, 1)[:, None]
    tmp7 = tl.load(in_ptr1 + (x0), xmask)
    tmp3 = 25088.0
    tmp4 = tmp2 / tmp3
    tmp5 = 0.1
    tmp6 = tmp4 * tmp5
    tmp8 = 0.9
    tmp9 = tmp7 * tmp8
    tmp10 = tmp6 + tmp9
    tl.store(in_out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp4, xmask)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp10, xmask)
    # Pass 2: re-read the same elements and accumulate squared deviations from tmp4.
    _tmp15 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
    for roffset in range(0, rnumel, RBLOCK):
        rindex = roffset + rbase
        rmask = rindex < rnumel
        r1 = rindex % 196
        r2 = (rindex // 196)
        tmp11 = tl.load(in_ptr0 + (r1 + (196*x0) + (94080*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp12 = tmp11.to(tl.float32)
        tmp13 = tmp12 - tmp4
        tmp14 = tmp13 * tmp13
        _tmp15 = tl.where(rmask & xmask, _tmp15 + tmp14, _tmp15)
    tmp15 = tl.sum(_tmp15, 1)[:, None]
    tl.store(out_ptr1 + x0, tmp15, xmask)
    tmp25 = tl.load(in_ptr2 + (x0), xmask)
    tmp16 = 25088.0
    tmp17 = tmp15 / tmp16
    tmp18 = 1e-05
    tmp19 = tmp17 + tmp18
    tmp20 = tl.math.rsqrt(tmp19)
    # 1.0000398612827361 is approximately 25088 / 25087, i.e. an n / (n - 1) rescale.
    tmp21 = 1.0000398612827361
    tmp22 = tmp17 * tmp21
    tmp23 = 0.1
    tmp24 = tmp22 * tmp23
    tmp26 = 0.9
    tmp27 = tmp25 * tmp26
    tmp28 = tmp24 + tmp27
    tl.store(out_ptr2 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp20, xmask)
    tl.store(out_ptr3 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp28, xmask)


def get_args():
    arg_0 = rand_strided((1, 480, 1, 1), (480, 1, 480, 480), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((128, 480, 14, 14), (94080, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_2 = rand_strided((480,), (1,), device='cuda:0', dtype=torch.float32)
    arg_3 = rand_strided((480,), (1,), device='cuda:0', dtype=torch.float32)
    arg_4 = rand_strided((480,), (1,), device='cuda:0', dtype=torch.float32)
    arg_5 = rand_strided((1, 480, 1, 1), (480, 1, 480, 480), device='cuda:0', dtype=torch.float32)
    arg_6 = rand_strided((480,), (1,), device='cuda:0', dtype=torch.float32)
    arg_7 = rand_strided((480,), (1,), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_red_fused__native_batch_norm_legit_functional_156.run(*args, 480, 25088, grid=grid(480), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_red_fused__native_batch_norm_legit_functional_156.benchmark_all_configs(*args, 480, 25088, grid=grid(480))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/uq/cuq6a4x4agmkmfpm3znliwswlpgc26f3bnjc5ylnl5l6aeuugh6f.py | |
# Original ATen: aten._native_batch_norm_legit_functional, aten.add, aten.clone, aten.fill, aten.mul, aten.sigmoid, aten.sub | |
# aten._native_batch_norm_legit_functional => add_189, add_192, convert_element_type_236, convert_element_type_237, mul_288, mul_294, rsqrt_36, sub_36, var_mean_36 | |
# aten.add => add_336 | |
# aten.clone => clone_27 | |
# aten.fill => full_like_20 | |
# aten.mul => mul_747, mul_748 | |
# aten.sigmoid => sigmoid_84 | |
# aten.sub => sub_169 | |
# Pointwise Triton kernel registered through async_compile.triton.
# For every one of 12042240 elements, with channel c = (index // 196) % 480:
#   y = (in_ptr0[i] - in_ptr1[c]) * rsqrt(in_ptr2[c] / 25088 + 1e-05)
#       * in_ptr3[c] + in_ptr4[c]
#   s = sigmoid(y)
#   out_ptr0[i] = y
#   out_ptr1[i] = s * (y * (1 - s) + 1)
# The second output equals d/dy [y * sigmoid(y)]; it is presumably kept for the
# backward pass -- confirm against the graph that consumes it.
triton_poi_fused__native_batch_norm_legit_functional_add_clone_fill_mul_sigmoid_sub_157 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor

from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[16777216], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp16', 6: '*fp16', 7: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7), equal_to_1=())]})
@triton.jit
def triton_poi_fused__native_batch_norm_legit_functional_add_clone_fill_mul_sigmoid_sub_157(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, out_ptr0, out_ptr1, xnumel, XBLOCK : tl.constexpr):
    # The problem size is baked in; the runtime xnumel argument is overwritten.
    xnumel = 12042240
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    # xmask is computed but every load/store below passes None as its mask.
    xmask = xindex < xnumel
    x3 = xindex
    # x1: channel index, 196 contiguous elements per channel plane.
    x1 = (xindex // 196) % 480
    tmp0 = tl.load(in_ptr0 + (x3), None).to(tl.float32)
    tmp2 = tl.load(in_ptr1 + (x1), None)
    tmp4 = tl.load(in_ptr2 + (x1), None)
    tmp11 = tl.load(in_ptr3 + (x1), None)
    tmp13 = tl.load(in_ptr4 + (x1), None)
    tmp1 = tmp0.to(tl.float32)
    tmp3 = tmp1 - tmp2
    tmp5 = 25088.0
    tmp6 = tmp4 / tmp5
    tmp7 = 1e-05
    tmp8 = tmp6 + tmp7
    tmp9 = tl.math.rsqrt(tmp8)
    tmp10 = tmp3 * tmp9
    tmp12 = tmp10 * tmp11
    tmp14 = tmp12 + tmp13
    tmp15 = tmp14.to(tl.float32)
    # tmp21 = s * (y * (1 - s) + 1) with s = sigmoid(y), y = tmp15.
    tmp16 = tl.sigmoid(tmp15)
    tmp17 = 1.0
    tmp18 = tmp17 - tmp16
    tmp19 = tmp15 * tmp18
    tmp20 = tmp19 + tmp17
    tmp21 = tmp16 * tmp20
    tl.store(out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp15, None)
    tl.store(out_ptr1 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp21, None)


def get_args():
    arg_0 = rand_strided((128, 480, 14, 14), (94080, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((1, 480, 1, 1), (480, 1, 480, 480), device='cuda:0', dtype=torch.float32)
    arg_2 = rand_strided((1, 480, 1, 1), (480, 1, 480, 480), device='cuda:0', dtype=torch.float32)
    arg_3 = rand_strided((480,), (1,), device='cuda:0', dtype=torch.float32)
    arg_4 = rand_strided((480,), (1,), device='cuda:0', dtype=torch.float32)
    arg_5 = rand_strided((128, 480, 14, 14), (94080, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_6 = rand_strided((128, 480, 14, 14), (94080, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__native_batch_norm_legit_functional_add_clone_fill_mul_sigmoid_sub_157.run(*args, 12042240, grid=grid(12042240), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__native_batch_norm_legit_functional_add_clone_fill_mul_sigmoid_sub_157.benchmark_all_configs(*args, 12042240, grid=grid(12042240))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/hb/chbvxlmers2jwf43nczkvjub2nxsnhvu5csbgr2u6jwg4rlcdpzl.py | |
# Original ATen: aten.split_with_sizes | |
# aten.split_with_sizes => getitem_242 | |
# Pointwise Triton kernel registered through async_compile.triton.
# Reads a 23520-element slice out of every 94080-element row of in_ptr0
# (slice offset 0), computes v * sigmoid(v) and writes the 3010560 results
# contiguously to out_ptr0.
triton_poi_fused_split_with_sizes_158 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor

from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_split_with_sizes_158(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    # The problem size is baked in; the runtime xnumel argument is overwritten.
    xnumel = 3010560
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    # xmask is computed but the load/store below pass None as their mask.
    xmask = xindex < xnumel
    # x0: offset inside the 23520-element slice; x1: row index (row stride 94080).
    x0 = xindex % 23520
    x1 = (xindex // 23520)
    x2 = xindex
    tmp0 = tl.load(in_ptr0 + (x0 + (94080*x1)), None).to(tl.float32)
    tmp1 = tmp0.to(tl.float32)
    tmp2 = tl.sigmoid(tmp1)
    tmp3 = tmp1 * tmp2
    tmp4 = tmp3.to(tl.float32)
    tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp4, None)


def get_args():
    arg_0 = rand_strided((128, 480, 14, 14), (94080, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 120, 14, 14), (23520, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused_split_with_sizes_158.run(*args, 3010560, grid=grid(3010560), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused_split_with_sizes_158.benchmark_all_configs(*args, 3010560, grid=grid(3010560))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/cu/ccuyxkbjwe7ff4mqowk3q7to3cbjg3o7g2sh6doygm5y2bnlvlfi.py | |
# Original ATen: aten._to_copy | |
# aten._to_copy => convert_element_type_241 | |
# Pointwise Triton kernel registered through async_compile.triton.
# Copies 3000 elements from in_ptr0 (declared *fp32) to out_ptr0 (declared
# *fp16) under a bounds mask; the dtype change presumably happens in the store,
# since the in-kernel cast targets fp32 -- confirm against tl.store semantics.
triton_poi_fused__to_copy_159 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor

from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[4096], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_159(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    # The problem size is baked in; the runtime xnumel argument is overwritten.
    xnumel = 3000
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    # Lanes past the end of the tensor are masked off for both load and store.
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)


def get_args():
    arg_0 = rand_strided((120, 1, 5, 5), (25, 25, 5, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((120, 1, 5, 5), (25, 25, 5, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_159.run(*args, 3000, grid=grid(3000), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_159.benchmark_all_configs(*args, 3000, grid=grid(3000))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/dg/cdglxs23pt7h2wls62uy6ewinjtjctcradtpw76t3qcrvnhso4kr.py | |
# Original ATen: aten.split_with_sizes | |
# aten.split_with_sizes => getitem_247 | |
# Pointwise Triton kernel registered through async_compile.triton.
# Reads a 23520-element slice out of every 94080-element row of in_ptr0,
# starting 23520 elements into the row, computes v * sigmoid(v) and writes the
# 3010560 results contiguously to out_ptr0.
triton_poi_fused_split_with_sizes_160 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor

from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_split_with_sizes_160(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    # The problem size is baked in; the runtime xnumel argument is overwritten.
    xnumel = 3010560
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    # xmask is computed but the load/store below pass None as their mask.
    xmask = xindex < xnumel
    # x0: offset inside the 23520-element slice; x1: row index (row stride 94080).
    x0 = xindex % 23520
    x1 = (xindex // 23520)
    x2 = xindex
    # The constant 23520 selects the second slice of each row.
    tmp0 = tl.load(in_ptr0 + (23520 + x0 + (94080*x1)), None).to(tl.float32)
    tmp1 = tmp0.to(tl.float32)
    tmp2 = tl.sigmoid(tmp1)
    tmp3 = tmp1 * tmp2
    tmp4 = tmp3.to(tl.float32)
    tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp4, None)


def get_args():
    arg_0 = rand_strided((128, 480, 14, 14), (94080, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 120, 14, 14), (23520, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused_split_with_sizes_160.run(*args, 3010560, grid=grid(3010560), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused_split_with_sizes_160.benchmark_all_configs(*args, 3010560, grid=grid(3010560))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/v4/cv4uf7ojdb6edtvnqbvbxjvi624yvygrldwzdl6uy2gdzz3sghqp.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_242
#
# Pointwise copy over 5880 elements. Each program loads a masked XBLOCK-wide
# tile from in_ptr0 (declared '*fp32' in the signature) and stores it at the
# same offsets of out_ptr0 (declared '*fp16'). The in-kernel
# `.to(tl.float32)` does not narrow the value; the narrowing to fp16 is left
# to tl.store, which casts to the destination pointer's element type.
# `xnumel` is overwritten with the literal 5880 inside the kernel, so the value
# passed at launch is ignored by the kernel body.
# The remaining definitions in the embedded source (get_args / call /
# benchmark_all_configs / __main__) form a standalone benchmark harness that
# runs the kernel on random (120, 1, 7, 7) tensors.
triton_poi_fused__to_copy_161 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[8192], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_161(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 5880
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)


def get_args():
    arg_0 = rand_strided((120, 1, 7, 7), (49, 49, 7, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((120, 1, 7, 7), (49, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_161.run(*args, 5880, grid=grid(5880), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_161.benchmark_all_configs(*args, 5880, grid=grid(5880))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/bn/cbnnjypiavc2wmccrce2li6mllqpp2n4naaetozpnboypgpizsqn.py
# Original ATen: aten.split_with_sizes
# aten.split_with_sizes => getitem_252
#
# Pointwise kernel over 3010560 (= 128 * 23520) output elements. For output
# index i it reads in_ptr0[47040 + (i % 23520) + 94080 * (i // 23520)], i.e. a
# 23520-element slice starting at offset 47040 of every 94080-element row,
# computes x * sigmoid(x) in fp32 and writes the result contiguously to
# out_ptr0. With the benchmark shapes in get_args ((128, 480, 14, 14) input,
# (128, 120, 14, 14) output) that slice is channels 240..359.
# Loads and stores pass mask=None; `xmask` is computed but unused.
# NOTE(review): the unmasked access presumably relies on xnumel being a
# multiple of every candidate XBLOCK -- confirm against the pointwise heuristic.
# `xnumel` is overwritten with a literal inside the kernel, so the launch-time
# value is ignored by the kernel body.
triton_poi_fused_split_with_sizes_162 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_split_with_sizes_162(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 3010560
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex % 23520
    x1 = (xindex // 23520)
    x2 = xindex
    tmp0 = tl.load(in_ptr0 + (47040 + x0 + (94080*x1)), None).to(tl.float32)
    tmp1 = tmp0.to(tl.float32)
    tmp2 = tl.sigmoid(tmp1)
    tmp3 = tmp1 * tmp2
    tmp4 = tmp3.to(tl.float32)
    tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp4, None)


def get_args():
    arg_0 = rand_strided((128, 480, 14, 14), (94080, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 120, 14, 14), (23520, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused_split_with_sizes_162.run(*args, 3010560, grid=grid(3010560), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused_split_with_sizes_162.benchmark_all_configs(*args, 3010560, grid=grid(3010560))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/4m/c4mgwvuur32h35cj53cazh6flbatygb5baks32qmyo4zqc7drw5k.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_243
#
# Pointwise copy over 9720 elements. Each program loads a masked XBLOCK-wide
# tile from in_ptr0 (declared '*fp32' in the signature) and stores it at the
# same offsets of out_ptr0 (declared '*fp16'). The in-kernel
# `.to(tl.float32)` does not narrow the value; the narrowing to fp16 is left
# to tl.store, which casts to the destination pointer's element type.
# `xnumel` is overwritten with the literal 9720 inside the kernel, so the value
# passed at launch is ignored by the kernel body.
# The benchmark harness in the embedded source uses random (120, 1, 9, 9)
# tensors.
triton_poi_fused__to_copy_163 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[16384], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_163(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 9720
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)


def get_args():
    arg_0 = rand_strided((120, 1, 9, 9), (81, 81, 9, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((120, 1, 9, 9), (81, 81, 9, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_163.run(*args, 9720, grid=grid(9720), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_163.benchmark_all_configs(*args, 9720, grid=grid(9720))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/4m/c4mepa4uirogjqorspxpl4t5fx2rzmjdzatpwzpg64qrcyuvg7m4.py
# Original ATen: aten.split_with_sizes
# aten.split_with_sizes => getitem_257
#
# Pointwise kernel over 3010560 (= 128 * 23520) output elements. For output
# index i it reads in_ptr0[70560 + (i % 23520) + 94080 * (i // 23520)], i.e. a
# 23520-element slice starting at offset 70560 of every 94080-element row,
# computes x * sigmoid(x) in fp32 and writes the result contiguously to
# out_ptr0. With the benchmark shapes in get_args ((128, 480, 14, 14) input,
# (128, 120, 14, 14) output) that slice is channels 360..479.
# Loads and stores pass mask=None; `xmask` is computed but unused.
# NOTE(review): the unmasked access presumably relies on xnumel being a
# multiple of every candidate XBLOCK -- confirm against the pointwise heuristic.
# `xnumel` is overwritten with a literal inside the kernel, so the launch-time
# value is ignored by the kernel body.
triton_poi_fused_split_with_sizes_164 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_split_with_sizes_164(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 3010560
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex % 23520
    x1 = (xindex // 23520)
    x2 = xindex
    tmp0 = tl.load(in_ptr0 + (70560 + x0 + (94080*x1)), None).to(tl.float32)
    tmp1 = tmp0.to(tl.float32)
    tmp2 = tl.sigmoid(tmp1)
    tmp3 = tmp1 * tmp2
    tmp4 = tmp3.to(tl.float32)
    tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp4, None)


def get_args():
    arg_0 = rand_strided((128, 480, 14, 14), (94080, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 120, 14, 14), (23520, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused_split_with_sizes_164.run(*args, 3010560, grid=grid(3010560), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused_split_with_sizes_164.benchmark_all_configs(*args, 3010560, grid=grid(3010560))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/qv/cqvo24surldsdqwccrhisp4chcnkg2otiq5qykrtymzijwmytnb7.py
# Original ATen: aten.cat
# aten.cat => cat_26
#
# Pointwise strided copy over 3010560 (= 128 * 23520) elements. Element i of
# the contiguous input in_ptr0 is written to
# out_ptr0[(i % 23520) + 94080 * (i // 23520)], so each 23520-element input row
# lands at the start of a 94080-element output row. In get_args the output is
# modelled as a (128, 120, 14, 14) tensor with batch stride 94080.
# NOTE(review): the output pointer presumably addresses one channel slice of a
# larger concatenation buffer -- confirm at the call site.
# Loads and stores pass mask=None; `xmask` is computed but unused.
# `xnumel` is overwritten with a literal inside the kernel, so the launch-time
# value is ignored by the kernel body.
triton_poi_fused_cat_165 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_cat_165(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 3010560
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x2 = xindex
    x0 = xindex % 23520
    x1 = (xindex // 23520)
    tmp0 = tl.load(in_ptr0 + (x2), None).to(tl.float32)
    tl.store(out_ptr0 + (x0 + (94080*x1) + tl.zeros([XBLOCK], tl.int32)), tmp0, None)


def get_args():
    arg_0 = rand_strided((128, 120, 14, 14), (23520, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 120, 14, 14), (94080, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused_cat_165.run(*args, 3010560, grid=grid(3010560), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused_cat_165.benchmark_all_configs(*args, 3010560, grid=grid(3010560))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/25/c25wuycbavajx7kwcvwpcba6kxp6j5oofnjkbplrrxckdmxy3shz.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.mean, aten.silu
# aten._native_batch_norm_legit_functional => add_194, add_197, convert_element_type_244, convert_element_type_245, mul_296, mul_302, rsqrt_37, sub_37, var_mean_37
# aten.mean => mean_9
# aten.silu => convert_element_type_246, convert_element_type_247, mul_303, sigmoid_37
#
# Persistent reduction over 61440 rows (x) of 196 elements (r), with RBLOCK
# fixed at 256 and the 60 surplus lanes masked off by `rmask`.
# For every element of a row, with c = x % 480:
#   y = (in_ptr0[r + 196*x] - in_ptr1[c]) * rsqrt(in_ptr2[c] / 25088 + 1e-05)
#       * in_ptr3[c] + in_ptr4[c]
# y is stored to out_ptr0 at the same offset. Then y * sigmoid(y) is summed
# over the row (masked lanes contribute 0), divided by 196 and stored to
# out_ptr2[x].
# With the benchmark shapes in get_args, the rows are the (batch, channel)
# pairs of a (128, 480, 14, 14) tensor and 25088 = 128 * 196.
# NOTE(review): in_ptr1 / in_ptr2 look like the per-channel mean and an
# un-normalised variance sum, in_ptr3 / in_ptr4 like the affine weight and
# bias -- confirm against the producing kernels.
# Only `rmask` is applied; `xmask` is computed but unused, and the per-channel
# loads and the out_ptr2 store pass mask=None.
# `xnumel` and `rnumel` are overwritten with literals inside the kernel, so the
# launch-time values are ignored by the kernel body.
triton_per_fused__native_batch_norm_legit_functional_mean_silu_166 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import persistent_reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@persistent_reduction(
    size_hints=[65536, 256],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*fp16', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp16', 6: '*fp16', 7: 'i32', 8: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7), equal_to_1=())]}
)
@triton.jit
def triton_per_fused__native_batch_norm_legit_functional_mean_silu_166(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, out_ptr0, out_ptr2, xnumel, rnumel, XBLOCK : tl.constexpr):
    xnumel = 61440
    rnumel = 196
    RBLOCK: tl.constexpr = 256
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rindex = tl.arange(0, RBLOCK)[None, :]
    rmask = rindex < rnumel
    r2 = rindex
    x3 = xindex
    x0 = xindex % 480
    tmp0 = tl.load(in_ptr0 + (r2 + (196*x3)), rmask, other=0).to(tl.float32)
    tmp2 = tl.load(in_ptr1 + (x0), None)
    tmp4 = tl.load(in_ptr2 + (x0), None)
    tmp11 = tl.load(in_ptr3 + (x0), None)
    tmp13 = tl.load(in_ptr4 + (x0), None)
    tmp1 = tmp0.to(tl.float32)
    tmp3 = tmp1 - tmp2
    tmp5 = 25088.0
    tmp6 = tmp4 / tmp5
    tmp7 = 1e-05
    tmp8 = tmp6 + tmp7
    tmp9 = tl.math.rsqrt(tmp8)
    tmp10 = tmp3 * tmp9
    tmp12 = tmp10 * tmp11
    tmp14 = tmp12 + tmp13
    tmp15 = tmp14.to(tl.float32)
    tmp16 = tmp15.to(tl.float32)
    tmp17 = tl.sigmoid(tmp16)
    tmp18 = tmp16 * tmp17
    tmp19 = tmp18.to(tl.float32)
    tmp20 = tmp19.to(tl.float32)
    tmp22 = tl.where(rmask, tmp20, 0)
    tmp23 = tl.sum(tmp22, 1)[:, None]
    tmp24 = 196.0
    tmp25 = tmp23 / tmp24
    tmp26 = tmp25.to(tl.float32)
    tl.store(out_ptr0 + (r2 + (196*x3) + tl.zeros([XBLOCK, RBLOCK], tl.int32)), tmp15, rmask)
    tl.store(out_ptr2 + (x3 + tl.zeros([XBLOCK, 1], tl.int32)), tmp26, None)


def get_args():
    arg_0 = rand_strided((128, 480, 14, 14), (94080, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((1, 480, 1, 1), (480, 1, 480, 480), device='cuda:0', dtype=torch.float32)
    arg_2 = rand_strided((1, 480, 1, 1), (480, 1, 480, 480), device='cuda:0', dtype=torch.float32)
    arg_3 = rand_strided((480,), (1,), device='cuda:0', dtype=torch.float32)
    arg_4 = rand_strided((480,), (1,), device='cuda:0', dtype=torch.float32)
    arg_5 = rand_strided((128, 480, 14, 14), (94080, 196, 14, 1), device='cuda:0', dtype=torch.float16)
    arg_6 = rand_strided((128, 480, 1, 1), (480, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_per_fused__native_batch_norm_legit_functional_mean_silu_166.run(*args, 61440, 196, grid=grid(61440), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_per_fused__native_batch_norm_legit_functional_mean_silu_166.benchmark_all_configs(*args, 61440, 196, grid=grid(61440))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/vh/cvhn4ymz4o7zw5xdmhfhtnvjkm7lfirplpypnuvceox4nfhcy7wo.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_249
#
# Pointwise copy over 38400 elements. Each program loads a masked XBLOCK-wide
# tile from in_ptr0 (declared '*fp32' in the signature) and stores it at the
# same offsets of out_ptr0 (declared '*fp16'). The in-kernel
# `.to(tl.float32)` does not narrow the value; the narrowing to fp16 is left
# to tl.store, which casts to the destination pointer's element type.
# `xnumel` is overwritten with the literal 38400 inside the kernel, so the
# value passed at launch is ignored by the kernel body.
# The benchmark harness in the embedded source uses random (80, 480, 1, 1)
# tensors.
triton_poi_fused__to_copy_167 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[65536], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_167(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 38400
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)


def get_args():
    arg_0 = rand_strided((80, 480, 1, 1), (480, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((80, 480, 1, 1), (480, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_167.run(*args, 38400, grid=grid(38400), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_167.benchmark_all_configs(*args, 38400, grid=grid(38400))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/uc/cuc5yc2wu6mlfc426p45ysrpjdvcf67n4fnoghr4zijaa4qc4jub.py
# Original ATen: aten._to_copy, aten.convolution
# aten._to_copy => convert_element_type_248
# aten.convolution => convolution_95
#
# Pointwise copy over 80 elements. Each program loads a masked XBLOCK-wide
# tile from in_ptr0 (declared '*fp32' in the signature) and stores it at the
# same offsets of out_ptr0 (declared '*fp16'). The in-kernel
# `.to(tl.float32)` does not narrow the value; the narrowing to fp16 is left
# to tl.store, which casts to the destination pointer's element type.
# `xnumel` is overwritten with the literal 80 inside the kernel, so the value
# passed at launch is ignored by the kernel body.
# NOTE(review): the 1-D (80,) shape in get_args suggests this converts a
# convolution bias vector -- confirm at the call site.
triton_poi_fused__to_copy_convolution_168 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[128], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_convolution_168(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 80
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)


def get_args():
    arg_0 = rand_strided((80,), (1,), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((80,), (1,), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_convolution_168.run(*args, 80, grid=grid(80), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_convolution_168.benchmark_all_configs(*args, 80, grid=grid(80))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/w4/cw4m6y7egnkzpotuw7yanoiegenydq7w76ehn3uqkz73dfsxrsvq.py
# Original ATen: aten._to_copy, aten.convolution, aten.silu
# aten._to_copy => convert_element_type_248
# aten.convolution => convolution_95
# aten.silu => convert_element_type_250, convert_element_type_251, mul_304, sigmoid_38
#
# Pointwise kernel over 10240 (= 128 * 80) elements that produces two results:
#   s = in_out_ptr0[i] + in_ptr0[i % 80]
#   in_out_ptr0[i] = s                  (written back in place)
#   out_ptr0[i]    = s * sigmoid(s)
# in_out_ptr0 is listed in 'mutated_arg_names', and the benchmark harness
# passes num_in_out_args=1 to get_num_bytes for that argument.
# NOTE(review): in_ptr0 looks like a per-channel convolution bias broadcast
# over the batch -- confirm at the call site.
# Loads and stores pass mask=None; `xmask` is computed but unused.
# `xnumel` is overwritten with a literal inside the kernel, so the launch-time
# value is ignored by the kernel body.
triton_poi_fused__to_copy_convolution_silu_169 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[16384], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_convolution_silu_169(in_out_ptr0, in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 10240
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x2 = xindex
    x0 = xindex % 80
    tmp0 = tl.load(in_out_ptr0 + (x2), None).to(tl.float32)
    tmp1 = tl.load(in_ptr0 + (x0), None).to(tl.float32)
    tmp2 = tmp0 + tmp1
    tmp3 = tmp2.to(tl.float32)
    tmp4 = tl.sigmoid(tmp3)
    tmp5 = tmp3 * tmp4
    tmp6 = tmp5.to(tl.float32)
    tl.store(in_out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp2, None)
    tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp6, None)


def get_args():
    arg_0 = rand_strided((128, 80, 1, 1), (80, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((80,), (1,), device='cuda:0', dtype=torch.float16)
    arg_2 = rand_strided((128, 80, 1, 1), (80, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1, arg_2,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_convolution_silu_169.run(*args, 10240, grid=grid(10240), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_convolution_silu_169.benchmark_all_configs(*args, 10240, grid=grid(10240))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/5w/c5wu2byqoqpdpilokyxdylwqvzl3vlutvi3qed6ouxeny7v3ma4k.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_253
#
# Pointwise copy over 38400 elements. Each program loads a masked XBLOCK-wide
# tile from in_ptr0 (declared '*fp32' in the signature) and stores it at the
# same offsets of out_ptr0 (declared '*fp16'). The in-kernel
# `.to(tl.float32)` does not narrow the value; the narrowing to fp16 is left
# to tl.store, which casts to the destination pointer's element type.
# `xnumel` is overwritten with the literal 38400 inside the kernel, so the
# value passed at launch is ignored by the kernel body.
# The benchmark harness in the embedded source uses random (480, 80, 1, 1)
# tensors.
triton_poi_fused__to_copy_170 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[65536], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_170(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 38400
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)


def get_args():
    arg_0 = rand_strided((480, 80, 1, 1), (80, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((480, 80, 1, 1), (80, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_170.run(*args, 38400, grid=grid(38400), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_170.benchmark_all_configs(*args, 38400, grid=grid(38400))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/t6/ct65ncuugfm3kp7u2z2xcftkozxdb7egjj3feed7rrbxlnc25mxn.py
# Original ATen: aten._to_copy, aten.convolution
# aten._to_copy => convert_element_type_252
# aten.convolution => convolution_96
#
# Pointwise copy over 480 elements. Each program loads a masked XBLOCK-wide
# tile from in_ptr0 (declared '*fp32' in the signature) and stores it at the
# same offsets of out_ptr0 (declared '*fp16'). The in-kernel
# `.to(tl.float32)` does not narrow the value; the narrowing to fp16 is left
# to tl.store, which casts to the destination pointer's element type.
# `xnumel` is overwritten with the literal 480 inside the kernel, so the value
# passed at launch is ignored by the kernel body.
# NOTE(review): the 1-D (480,) shape in get_args suggests this converts a
# convolution bias vector -- confirm at the call site.
triton_poi_fused__to_copy_convolution_171 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[512], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_convolution_171(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 480
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)


def get_args():
    arg_0 = rand_strided((480,), (1,), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((480,), (1,), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_convolution_171.run(*args, 480, grid=grid(480), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_convolution_171.benchmark_all_configs(*args, 480, grid=grid(480))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/q3/cq36nnpdlb4xwdvwsiktoah4r62ugwqlfeikgt4twhaobbsdqnku.py
# Original ATen: aten._to_copy, aten.convolution
# aten._to_copy => convert_element_type_252
# aten.convolution => convolution_96
#
# Pointwise in-place add over 61440 (= 128 * 480) elements:
#   in_out_ptr0[i] = in_out_ptr0[i] + in_ptr0[i % 480]
# in_out_ptr0 is listed in 'mutated_arg_names', and the benchmark harness
# passes num_in_out_args=1 to get_num_bytes for that argument.
# NOTE(review): in_ptr0 looks like a per-channel convolution bias broadcast
# over the batch -- confirm at the call site.
# Loads and stores pass mask=None; `xmask` is computed but unused.
# `xnumel` is overwritten with a literal inside the kernel, so the launch-time
# value is ignored by the kernel body.
triton_poi_fused__to_copy_convolution_172 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[65536], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_convolution_172(in_out_ptr0, in_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 61440
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x2 = xindex
    x0 = xindex % 480
    tmp0 = tl.load(in_out_ptr0 + (x2), None).to(tl.float32)
    tmp1 = tl.load(in_ptr0 + (x0), None).to(tl.float32)
    tmp2 = tmp0 + tmp1
    tl.store(in_out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp2, None)


def get_args():
    arg_0 = rand_strided((128, 480, 1, 1), (480, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((480,), (1,), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_convolution_172.run(*args, 61440, grid=grid(61440), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_convolution_172.benchmark_all_configs(*args, 61440, grid=grid(61440))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/ce/cce2z44226346eg4knvdzt24sfsujmrpjne4vcofj37miaugwnek.py | |
# Original ATen: aten.mul, aten.sigmoid, aten.silu | |
# aten.mul => mul_305 | |
# aten.sigmoid => sigmoid_39 | |
# aten.silu => convert_element_type_246, convert_element_type_247, mul_303, sigmoid_37 | |
# Registers the Triton source below with `async_compile.triton` and binds the
# result to `triton_poi_fused_mul_sigmoid_silu_173`.
#
# Embedded pointwise kernel, for each flat index x of a hard-coded
# 12042240-element range:
#     a           = in_ptr0[x]
#     out_ptr0[x] = (a * sigmoid(a)) * sigmoid(in_ptr1[x // 196])
# i.e. a SiLU of the first input multiplied by a sigmoid gate that is shared
# by each run of 196 consecutive elements. The decorator signature declares
# all three pointers as fp16; loaded values are cast to float32 before the
# arithmetic. `xmask` is computed but every load/store passes `None` as mask.
#
# The string also carries a standalone benchmark harness: `get_args` builds
# random fp16 tensors, `call` launches the kernel on device 0, and the
# `__main__` section prints time (ms), traffic (GB) and bandwidth (GB/s).
#
# NOTE(review): the trailing " | |" on every line looks like table residue
# from a web capture rather than generator output - confirm before executing.
triton_poi_fused_mul_sigmoid_silu_173 = async_compile.triton(''' | |
import triton | |
import triton.language as tl | |
from torch._inductor.ir import ReductionHint | |
from torch._inductor.ir import TileHint | |
from torch._inductor.triton_heuristics import pointwise | |
from torch._inductor.utils import instance_descriptor | |
from torch._dynamo.testing import rand_strided | |
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
import torch | |
from torch._inductor.triton_heuristics import grid | |
@pointwise(size_hints=[16777216], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3), equal_to_1=())]}) | |
@triton.jit | |
def triton_poi_fused_mul_sigmoid_silu_173(in_ptr0, in_ptr1, out_ptr0, xnumel, XBLOCK : tl.constexpr): | |
xnumel = 12042240 | |
xoffset = tl.program_id(0) * XBLOCK | |
xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
xmask = xindex < xnumel | |
x2 = xindex | |
x1 = (xindex // 196) | |
tmp0 = tl.load(in_ptr0 + (x2), None).to(tl.float32) | |
tmp5 = tl.load(in_ptr1 + (x1), None).to(tl.float32) | |
tmp1 = tmp0.to(tl.float32) | |
tmp2 = tl.sigmoid(tmp1) | |
tmp3 = tmp1 * tmp2 | |
tmp4 = tmp3.to(tl.float32) | |
tmp6 = tl.sigmoid(tmp5) | |
tmp7 = tmp4 * tmp6 | |
tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp7, None) | |
def get_args(): | |
arg_0 = rand_strided((128, 480, 14, 14), (94080, 196, 14, 1), device='cuda:0', dtype=torch.float16) | |
arg_1 = rand_strided((128, 480, 1, 1), (480, 1, 1, 1), device='cuda:0', dtype=torch.float16) | |
arg_2 = rand_strided((128, 480, 14, 14), (94080, 196, 14, 1), device='cuda:0', dtype=torch.float16) | |
return arg_0, arg_1, arg_2, | |
def call(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
stream0 = get_cuda_stream(0) | |
triton_poi_fused_mul_sigmoid_silu_173.run(*args, 12042240, grid=grid(12042240), stream=stream0) | |
def benchmark_all_configs(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
return triton_poi_fused_mul_sigmoid_silu_173.benchmark_all_configs(*args, 12042240, grid=grid(12042240)) | |
if __name__ == '__main__': | |
from torch._inductor.utils import get_num_bytes | |
from triton.testing import do_bench | |
args = get_args() | |
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9 | |
gb_per_s = num_gb / (ms / 1e3) | |
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
''') | |
# kernel path: /tmp/torchinductor_shunting/jt/cjtarvyynr3tkfvm37qgtcwaksnjyb62otlxal5l5thrlfpz4yoz.py | |
# Original ATen: aten.split_with_sizes | |
# aten.split_with_sizes => getitem_260 | |
# Registers the Triton source below with `async_compile.triton` and binds the
# result to `triton_poi_fused_split_with_sizes_174`.
#
# Embedded pointwise kernel, for each flat index x of a hard-coded
# 6021120-element range, with x0 = x % 47040 and x1 = x // 47040:
#     out_ptr0[x] = in_ptr0[x0 + 94080 * x1]
# i.e. a strided gather of the first 47040 elements out of every
# 94080-element row into a densely packed output. With the sample shapes in
# `get_args` (input (128, 480, 14, 14), output (128, 240, 14, 14)) that is
# the first 240 of 480 channels per batch entry. Both pointers are declared
# fp16; the loaded value is cast to float32 before being stored. `xmask` is
# computed but the load/store pass `None` as mask.
#
# The string also carries a standalone benchmark harness (`get_args`, `call`,
# `benchmark_all_configs`, and a `__main__` section printing ms / GB / GB/s).
#
# NOTE(review): the trailing " | |" on every line looks like table residue
# from a web capture rather than generator output - confirm before executing.
triton_poi_fused_split_with_sizes_174 = async_compile.triton(''' | |
import triton | |
import triton.language as tl | |
from torch._inductor.ir import ReductionHint | |
from torch._inductor.ir import TileHint | |
from torch._inductor.triton_heuristics import pointwise | |
from torch._inductor.utils import instance_descriptor | |
from torch._dynamo.testing import rand_strided | |
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
import torch | |
from torch._inductor.triton_heuristics import grid | |
@pointwise(size_hints=[8388608], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]}) | |
@triton.jit | |
def triton_poi_fused_split_with_sizes_174(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr): | |
xnumel = 6021120 | |
xoffset = tl.program_id(0) * XBLOCK | |
xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
xmask = xindex < xnumel | |
x0 = xindex % 47040 | |
x1 = (xindex // 47040) | |
x2 = xindex | |
tmp0 = tl.load(in_ptr0 + (x0 + (94080*x1)), None).to(tl.float32) | |
tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp0, None) | |
def get_args(): | |
arg_0 = rand_strided((128, 480, 14, 14), (94080, 196, 14, 1), device='cuda:0', dtype=torch.float16) | |
arg_1 = rand_strided((128, 240, 14, 14), (47040, 196, 14, 1), device='cuda:0', dtype=torch.float16) | |
return arg_0, arg_1, | |
def call(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
stream0 = get_cuda_stream(0) | |
triton_poi_fused_split_with_sizes_174.run(*args, 6021120, grid=grid(6021120), stream=stream0) | |
def benchmark_all_configs(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
return triton_poi_fused_split_with_sizes_174.benchmark_all_configs(*args, 6021120, grid=grid(6021120)) | |
if __name__ == '__main__': | |
from torch._inductor.utils import get_num_bytes | |
from triton.testing import do_bench | |
args = get_args() | |
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9 | |
gb_per_s = num_gb / (ms / 1e3) | |
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
''') | |
# kernel path: /tmp/torchinductor_shunting/zb/czbuvo3mdbqgu4vhbh2rqqgxj7l4q2oef5prb5xafk4aclfa4wgv.py | |
# Original ATen: aten.split_with_sizes | |
# aten.split_with_sizes => getitem_261 | |
# Registers the Triton source below with `async_compile.triton` and binds the
# result to `triton_poi_fused_split_with_sizes_175`.
#
# Embedded pointwise kernel, for each flat index x of a hard-coded
# 6021120-element range, with x0 = x % 47040 and x1 = x // 47040:
#     out_ptr0[x] = in_ptr0[47040 + x0 + 94080 * x1]
# i.e. a strided gather of the second 47040 elements out of every
# 94080-element row into a densely packed output. With the sample shapes in
# `get_args` (input (128, 480, 14, 14), output (128, 240, 14, 14)) that is
# channels 240..479 per batch entry. Both pointers are declared fp16; the
# loaded value is cast to float32 before being stored. `xmask` is computed
# but the load/store pass `None` as mask.
#
# The string also carries a standalone benchmark harness (`get_args`, `call`,
# `benchmark_all_configs`, and a `__main__` section printing ms / GB / GB/s).
#
# NOTE(review): the trailing " | |" on every line looks like table residue
# from a web capture rather than generator output - confirm before executing.
triton_poi_fused_split_with_sizes_175 = async_compile.triton(''' | |
import triton | |
import triton.language as tl | |
from torch._inductor.ir import ReductionHint | |
from torch._inductor.ir import TileHint | |
from torch._inductor.triton_heuristics import pointwise | |
from torch._inductor.utils import instance_descriptor | |
from torch._dynamo.testing import rand_strided | |
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
import torch | |
from torch._inductor.triton_heuristics import grid | |
@pointwise(size_hints=[8388608], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]}) | |
@triton.jit | |
def triton_poi_fused_split_with_sizes_175(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr): | |
xnumel = 6021120 | |
xoffset = tl.program_id(0) * XBLOCK | |
xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
xmask = xindex < xnumel | |
x0 = xindex % 47040 | |
x1 = (xindex // 47040) | |
x2 = xindex | |
tmp0 = tl.load(in_ptr0 + (47040 + x0 + (94080*x1)), None).to(tl.float32) | |
tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp0, None) | |
def get_args(): | |
arg_0 = rand_strided((128, 480, 14, 14), (94080, 196, 14, 1), device='cuda:0', dtype=torch.float16) | |
arg_1 = rand_strided((128, 240, 14, 14), (47040, 196, 14, 1), device='cuda:0', dtype=torch.float16) | |
return arg_0, arg_1, | |
def call(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
stream0 = get_cuda_stream(0) | |
triton_poi_fused_split_with_sizes_175.run(*args, 6021120, grid=grid(6021120), stream=stream0) | |
def benchmark_all_configs(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
return triton_poi_fused_split_with_sizes_175.benchmark_all_configs(*args, 6021120, grid=grid(6021120)) | |
if __name__ == '__main__': | |
from torch._inductor.utils import get_num_bytes | |
from triton.testing import do_bench | |
args = get_args() | |
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9 | |
gb_per_s = num_gb / (ms / 1e3) | |
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
''') | |
# kernel path: /tmp/torchinductor_shunting/fo/cfobovxlpvp6vyrqqsfdz6jlgrpfwpemctlxrrgsykw3zih3en3k.py | |
# Original ATen: aten._to_copy | |
# aten._to_copy => convert_element_type_254 | |
# Registers the Triton source below with `async_compile.triton` and binds the
# result to `triton_poi_fused__to_copy_176`.
#
# Embedded pointwise kernel: an element-for-element copy of a hard-coded
# 19200-element range from `in_ptr0` (declared fp32 in the signature) to
# `out_ptr0` (declared fp16). Unlike the unmasked kernels in this file, the
# load and the store are both guarded by `xmask = xindex < xnumel`.
# NOTE(review): the only explicit cast is `.to(tl.float32)`; the narrowing to
# fp16 presumably happens at the store to the fp16 pointer - confirm against
# Triton's `tl.store` semantics.
#
# The string also carries a standalone benchmark harness (`get_args`, `call`,
# `benchmark_all_configs`, and a `__main__` section printing ms / GB / GB/s);
# its sample tensors are (80, 240, 1, 1), fp32 in and fp16 out.
#
# NOTE(review): the trailing " | |" on every line looks like table residue
# from a web capture rather than generator output - confirm before executing.
triton_poi_fused__to_copy_176 = async_compile.triton(''' | |
import triton | |
import triton.language as tl | |
from torch._inductor.ir import ReductionHint | |
from torch._inductor.ir import TileHint | |
from torch._inductor.triton_heuristics import pointwise | |
from torch._inductor.utils import instance_descriptor | |
from torch._dynamo.testing import rand_strided | |
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
import torch | |
from torch._inductor.triton_heuristics import grid | |
@pointwise(size_hints=[32768], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]}) | |
@triton.jit | |
def triton_poi_fused__to_copy_176(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr): | |
xnumel = 19200 | |
xoffset = tl.program_id(0) * XBLOCK | |
xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
xmask = xindex < xnumel | |
x0 = xindex | |
tmp0 = tl.load(in_ptr0 + (x0), xmask) | |
tmp1 = tmp0.to(tl.float32) | |
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask) | |
def get_args(): | |
arg_0 = rand_strided((80, 240, 1, 1), (240, 1, 1, 1), device='cuda:0', dtype=torch.float32) | |
arg_1 = rand_strided((80, 240, 1, 1), (240, 1, 1, 1), device='cuda:0', dtype=torch.float16) | |
return arg_0, arg_1, | |
def call(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
stream0 = get_cuda_stream(0) | |
triton_poi_fused__to_copy_176.run(*args, 19200, grid=grid(19200), stream=stream0) | |
def benchmark_all_configs(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
return triton_poi_fused__to_copy_176.benchmark_all_configs(*args, 19200, grid=grid(19200)) | |
if __name__ == '__main__': | |
from torch._inductor.utils import get_num_bytes | |
from triton.testing import do_bench | |
args = get_args() | |
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9 | |
gb_per_s = num_gb / (ms / 1e3) | |
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
''') | |
# kernel path: /tmp/torchinductor_shunting/66/c66wd4retmi6azmfzfxmll5pmv3btijpcst56cykpuynmmhzhq23.py | |
# Original ATen: aten.cat | |
# aten.cat => cat_27 | |
# Registers the Triton source below with `async_compile.triton` and binds the
# result to `triton_poi_fused_cat_177`.
#
# Embedded pointwise kernel, for each flat index x of a hard-coded
# 2007040-element range, with x0 = x % 15680 and x1 = x // 15680:
#     out_ptr0[x0 + 31360 * x1] = in_ptr0[x]
# i.e. a scatter of densely packed 15680-element rows into an output whose
# rows are 31360 elements apart. With the sample shapes in `get_args` the
# output is an (128, 80, 14, 14) view with batch stride 31360.
# NOTE(review): presumably this fills one half of a concatenated buffer, as
# the provenance comment names `aten.cat` - the other half is not written
# here; verify against the caller.
# Both pointers are declared fp16; the loaded value is cast to float32 before
# being stored. `xmask` is computed but the load/store pass `None` as mask.
#
# The string also carries a standalone benchmark harness (`get_args`, `call`,
# `benchmark_all_configs`, and a `__main__` section printing ms / GB / GB/s).
#
# NOTE(review): the trailing " | |" on every line looks like table residue
# from a web capture rather than generator output - confirm before executing.
triton_poi_fused_cat_177 = async_compile.triton(''' | |
import triton | |
import triton.language as tl | |
from torch._inductor.ir import ReductionHint | |
from torch._inductor.ir import TileHint | |
from torch._inductor.triton_heuristics import pointwise | |
from torch._inductor.utils import instance_descriptor | |
from torch._dynamo.testing import rand_strided | |
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
import torch | |
from torch._inductor.triton_heuristics import grid | |
@pointwise(size_hints=[2097152], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]}) | |
@triton.jit | |
def triton_poi_fused_cat_177(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr): | |
xnumel = 2007040 | |
xoffset = tl.program_id(0) * XBLOCK | |
xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
xmask = xindex < xnumel | |
x2 = xindex | |
x0 = xindex % 15680 | |
x1 = (xindex // 15680) | |
tmp0 = tl.load(in_ptr0 + (x2), None).to(tl.float32) | |
tl.store(out_ptr0 + (x0 + (31360*x1) + tl.zeros([XBLOCK], tl.int32)), tmp0, None) | |
def get_args(): | |
arg_0 = rand_strided((128, 80, 14, 14), (15680, 196, 14, 1), device='cuda:0', dtype=torch.float16) | |
arg_1 = rand_strided((128, 80, 14, 14), (31360, 196, 14, 1), device='cuda:0', dtype=torch.float16) | |
return arg_0, arg_1, | |
def call(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
stream0 = get_cuda_stream(0) | |
triton_poi_fused_cat_177.run(*args, 2007040, grid=grid(2007040), stream=stream0) | |
def benchmark_all_configs(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
return triton_poi_fused_cat_177.benchmark_all_configs(*args, 2007040, grid=grid(2007040)) | |
if __name__ == '__main__': | |
from torch._inductor.utils import get_num_bytes | |
from triton.testing import do_bench | |
args = get_args() | |
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9 | |
gb_per_s = num_gb / (ms / 1e3) | |
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
''') | |
# kernel path: /tmp/torchinductor_shunting/4x/c4xzenlwjgchxhkagftgwbnakluheaf3fbwuqeozpqdbo7czlfp4.py | |
# Original ATen: aten._native_batch_norm_legit_functional, aten.add | |
# aten._native_batch_norm_legit_functional => add_199, add_202, convert_element_type_256, convert_element_type_257, mul_306, mul_312, rsqrt_38, sub_38, var_mean_38 | |
# aten.add => add_203 | |
# Registers the Triton source below with `async_compile.triton` and binds the
# result to `triton_poi_fused__native_batch_norm_legit_functional_add_178`.
#
# Embedded pointwise kernel, for each flat index x of a hard-coded
# 4014080-element range, with c = (x // 196) % 160:
#     y           = (in_ptr0[x] - in_ptr1[c]) * rsqrt(in_ptr2[c] / 25088.0 + 1e-05)
#     out_ptr0[x] = (y * in_ptr3[c] + in_ptr4[c]) + in_ptr5[x]
# i.e. a per-channel normalise / scale / shift followed by an elementwise add
# of a second full-size tensor.
# NOTE(review): presumably in_ptr1 is the batch mean, in_ptr2 the sum of
# squared deviations (hence the division by 25088 = 128 * 14 * 14), and
# in_ptr3 / in_ptr4 the affine weight / bias - the names are not visible
# here; verify against the caller.
# Per the signature, in_ptr0, in_ptr5 and out_ptr0 are fp16 and in_ptr1..4
# are fp32; the fp16 loads are cast to float32 before the arithmetic.
# `xmask` is computed but every load/store passes `None` as mask.
#
# The string also carries a standalone benchmark harness (`get_args`, `call`,
# `benchmark_all_configs`, and a `__main__` section printing ms / GB / GB/s).
#
# NOTE(review): the trailing " | |" on every line looks like table residue
# from a web capture rather than generator output - confirm before executing.
triton_poi_fused__native_batch_norm_legit_functional_add_178 = async_compile.triton(''' | |
import triton | |
import triton.language as tl | |
from torch._inductor.ir import ReductionHint | |
from torch._inductor.ir import TileHint | |
from torch._inductor.triton_heuristics import pointwise | |
from torch._inductor.utils import instance_descriptor | |
from torch._dynamo.testing import rand_strided | |
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
import torch | |
from torch._inductor.triton_heuristics import grid | |
@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp16', 6: '*fp16', 7: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7), equal_to_1=())]}) | |
@triton.jit | |
def triton_poi_fused__native_batch_norm_legit_functional_add_178(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, out_ptr0, xnumel, XBLOCK : tl.constexpr): | |
xnumel = 4014080 | |
xoffset = tl.program_id(0) * XBLOCK | |
xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
xmask = xindex < xnumel | |
x3 = xindex | |
x1 = (xindex // 196) % 160 | |
tmp0 = tl.load(in_ptr0 + (x3), None).to(tl.float32) | |
tmp2 = tl.load(in_ptr1 + (x1), None) | |
tmp4 = tl.load(in_ptr2 + (x1), None) | |
tmp11 = tl.load(in_ptr3 + (x1), None) | |
tmp13 = tl.load(in_ptr4 + (x1), None) | |
tmp16 = tl.load(in_ptr5 + (x3), None).to(tl.float32) | |
tmp1 = tmp0.to(tl.float32) | |
tmp3 = tmp1 - tmp2 | |
tmp5 = 25088.0 | |
tmp6 = tmp4 / tmp5 | |
tmp7 = 1e-05 | |
tmp8 = tmp6 + tmp7 | |
tmp9 = tl.math.rsqrt(tmp8) | |
tmp10 = tmp3 * tmp9 | |
tmp12 = tmp10 * tmp11 | |
tmp14 = tmp12 + tmp13 | |
tmp15 = tmp14.to(tl.float32) | |
tmp17 = tmp15 + tmp16 | |
tl.store(out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp17, None) | |
def get_args(): | |
arg_0 = rand_strided((128, 160, 14, 14), (31360, 196, 14, 1), device='cuda:0', dtype=torch.float16) | |
arg_1 = rand_strided((1, 160, 1, 1), (160, 1, 160, 160), device='cuda:0', dtype=torch.float32) | |
arg_2 = rand_strided((1, 160, 1, 1), (160, 1, 160, 160), device='cuda:0', dtype=torch.float32) | |
arg_3 = rand_strided((160,), (1,), device='cuda:0', dtype=torch.float32) | |
arg_4 = rand_strided((160,), (1,), device='cuda:0', dtype=torch.float32) | |
arg_5 = rand_strided((128, 160, 14, 14), (31360, 196, 14, 1), device='cuda:0', dtype=torch.float16) | |
arg_6 = rand_strided((128, 160, 14, 14), (31360, 196, 14, 1), device='cuda:0', dtype=torch.float16) | |
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, | |
def call(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
stream0 = get_cuda_stream(0) | |
triton_poi_fused__native_batch_norm_legit_functional_add_178.run(*args, 4014080, grid=grid(4014080), stream=stream0) | |
def benchmark_all_configs(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
return triton_poi_fused__native_batch_norm_legit_functional_add_178.benchmark_all_configs(*args, 4014080, grid=grid(4014080)) | |
if __name__ == '__main__': | |
from torch._inductor.utils import get_num_bytes | |
from triton.testing import do_bench | |
args = get_args() | |
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9 | |
gb_per_s = num_gb / (ms / 1e3) | |
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
''') | |
# kernel path: /tmp/torchinductor_shunting/xt/cxt5mn4i2rohwchtz75fbvbykzhjdnjrf3qopbwsyyqyu5jqlzr6.py | |
# Original ATen: aten._to_copy | |
# aten._to_copy => convert_element_type_306 | |
# Registers the Triton source below with `async_compile.triton` and binds the
# result to `triton_poi_fused__to_copy_179`.
#
# Embedded pointwise kernel: an element-for-element copy of a hard-coded
# 153600-element range from `in_ptr0` (declared fp32 in the signature) to
# `out_ptr0` (declared fp16). `xmask` is computed but the load/store pass
# `None` as mask.
# NOTE(review): the only explicit cast is `.to(tl.float32)`; the narrowing to
# fp16 presumably happens at the store to the fp16 pointer - confirm against
# Triton's `tl.store` semantics.
#
# The string also carries a standalone benchmark harness (`get_args`, `call`,
# `benchmark_all_configs`, and a `__main__` section printing ms / GB / GB/s);
# its sample tensors are (960, 160, 1, 1), fp32 in and fp16 out.
#
# NOTE(review): the trailing " | |" on every line looks like table residue
# from a web capture rather than generator output - confirm before executing.
triton_poi_fused__to_copy_179 = async_compile.triton(''' | |
import triton | |
import triton.language as tl | |
from torch._inductor.ir import ReductionHint | |
from torch._inductor.ir import TileHint | |
from torch._inductor.triton_heuristics import pointwise | |
from torch._inductor.utils import instance_descriptor | |
from torch._dynamo.testing import rand_strided | |
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
import torch | |
from torch._inductor.triton_heuristics import grid | |
@pointwise(size_hints=[262144], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]}) | |
@triton.jit | |
def triton_poi_fused__to_copy_179(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr): | |
xnumel = 153600 | |
xoffset = tl.program_id(0) * XBLOCK | |
xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
xmask = xindex < xnumel | |
x0 = xindex | |
tmp0 = tl.load(in_ptr0 + (x0), None) | |
tmp1 = tmp0.to(tl.float32) | |
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, None) | |
def get_args(): | |
arg_0 = rand_strided((960, 160, 1, 1), (160, 1, 1, 1), device='cuda:0', dtype=torch.float32) | |
arg_1 = rand_strided((960, 160, 1, 1), (160, 1, 1, 1), device='cuda:0', dtype=torch.float16) | |
return arg_0, arg_1, | |
def call(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
stream0 = get_cuda_stream(0) | |
triton_poi_fused__to_copy_179.run(*args, 153600, grid=grid(153600), stream=stream0) | |
def benchmark_all_configs(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
return triton_poi_fused__to_copy_179.benchmark_all_configs(*args, 153600, grid=grid(153600)) | |
if __name__ == '__main__': | |
from torch._inductor.utils import get_num_bytes | |
from triton.testing import do_bench | |
args = get_args() | |
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9 | |
gb_per_s = num_gb / (ms / 1e3) | |
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
''') | |
# kernel path: /tmp/torchinductor_shunting/g5/cg527ugqaap3ndd26uaeekojscsqxex2xmpgnu55nr6zwyzt6pfr.py | |
# Original ATen: aten._native_batch_norm_legit_functional | |
# aten._native_batch_norm_legit_functional => add_237, add_238, add_239, convert_element_type_307, mul_364, mul_365, mul_366, mul_367, mul_368, rsqrt_45, squeeze_136, var_mean_45 | |
# Registers the Triton source below with `async_compile.triton` and binds the
# result to `triton_red_fused__native_batch_norm_legit_functional_180`.
#
# Embedded reduction kernel. For each of 960 rows x0 it reduces 25088
# elements of `in_ptr0` (fp16, cast to float32), addressed as
# in_ptr0[r1 + 196 * x0 + 188160 * r2] with r1 = r % 196 and r2 = r // 196.
# It makes two passes over the same data:
#   pass 1: mean = sum(v) / 25088.0
#           in_out_ptr0[x0] = mean
#           out_ptr0[x0]    = 0.1 * mean + 0.9 * in_ptr1[x0]
#   pass 2: ssd  = sum((v - mean) ** 2)
#           out_ptr1[x0]    = ssd                      (stored undivided)
#           var             = ssd / 25088.0
#           out_ptr2[x0]    = rsqrt(var + 1e-05)
#           out_ptr3[x0]    = 0.1 * (var * 1.0000398612827361) + 0.9 * in_ptr2[x0]
# Loads use `rmask & xmask` with `other=0`, and the accumulators are updated
# through `tl.where` so masked-off lanes do not contribute to the sums.
# NOTE(review): 1.0000398612827361 is numerically ~25088/25087, presumably
# the unbiased-variance correction, and 0.1 / 0.9 presumably the running-stat
# momentum blend with in_ptr1 / in_ptr2 as the old running mean / variance -
# verify against the caller.
# `in_out_ptr0` is listed under `mutated_arg_names`, and the benchmark
# accounts for it with `num_in_out_args=1`.
#
# The string also carries a standalone benchmark harness (`get_args`, `call`,
# `benchmark_all_configs`, and a `__main__` section printing ms / GB / GB/s).
#
# NOTE(review): the trailing " | |" on every line looks like table residue
# from a web capture rather than generator output - confirm before executing.
triton_red_fused__native_batch_norm_legit_functional_180 = async_compile.triton(''' | |
import triton | |
import triton.language as tl | |
from torch._inductor.ir import ReductionHint | |
from torch._inductor.ir import TileHint | |
from torch._inductor.triton_heuristics import reduction | |
from torch._inductor.utils import instance_descriptor | |
from torch._dynamo.testing import rand_strided | |
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
import torch | |
from torch._inductor.triton_heuristics import grid | |
@reduction( | |
size_hints=[1024, 32768], | |
reduction_hint=ReductionHint.INNER, | |
filename=__file__, | |
meta={'signature': {0: '*fp32', 1: '*fp16', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: 'i32', 9: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9), equal_to_1=())]} | |
) | |
@triton.jit | |
def triton_red_fused__native_batch_norm_legit_functional_180(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, out_ptr0, out_ptr1, out_ptr2, out_ptr3, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr): | |
xnumel = 960 | |
rnumel = 25088 | |
xoffset = tl.program_id(0) * XBLOCK | |
xindex = xoffset + tl.arange(0, XBLOCK)[:, None] | |
xmask = xindex < xnumel | |
rbase = tl.arange(0, RBLOCK)[None, :] | |
x0 = xindex | |
_tmp2 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0 | |
for roffset in range(0, rnumel, RBLOCK): | |
rindex = roffset + rbase | |
rmask = rindex < rnumel | |
r1 = rindex % 196 | |
r2 = (rindex // 196) | |
tmp0 = tl.load(in_ptr0 + (r1 + (196*x0) + (188160*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32) | |
tmp1 = tmp0.to(tl.float32) | |
_tmp2 = tl.where(rmask & xmask, _tmp2 + tmp1, _tmp2) | |
tmp2 = tl.sum(_tmp2, 1)[:, None] | |
tmp7 = tl.load(in_ptr1 + (x0), xmask) | |
tmp3 = 25088.0 | |
tmp4 = tmp2 / tmp3 | |
tmp5 = 0.1 | |
tmp6 = tmp4 * tmp5 | |
tmp8 = 0.9 | |
tmp9 = tmp7 * tmp8 | |
tmp10 = tmp6 + tmp9 | |
tl.store(in_out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp4, xmask) | |
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp10, xmask) | |
_tmp15 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0 | |
for roffset in range(0, rnumel, RBLOCK): | |
rindex = roffset + rbase | |
rmask = rindex < rnumel | |
r1 = rindex % 196 | |
r2 = (rindex // 196) | |
tmp11 = tl.load(in_ptr0 + (r1 + (196*x0) + (188160*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32) | |
tmp12 = tmp11.to(tl.float32) | |
tmp13 = tmp12 - tmp4 | |
tmp14 = tmp13 * tmp13 | |
_tmp15 = tl.where(rmask & xmask, _tmp15 + tmp14, _tmp15) | |
tmp15 = tl.sum(_tmp15, 1)[:, None] | |
tl.store(out_ptr1 + x0, tmp15, xmask) | |
tmp25 = tl.load(in_ptr2 + (x0), xmask) | |
tmp16 = 25088.0 | |
tmp17 = tmp15 / tmp16 | |
tmp18 = 1e-05 | |
tmp19 = tmp17 + tmp18 | |
tmp20 = tl.math.rsqrt(tmp19) | |
tmp21 = 1.0000398612827361 | |
tmp22 = tmp17 * tmp21 | |
tmp23 = 0.1 | |
tmp24 = tmp22 * tmp23 | |
tmp26 = 0.9 | |
tmp27 = tmp25 * tmp26 | |
tmp28 = tmp24 + tmp27 | |
tl.store(out_ptr2 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp20, xmask) | |
tl.store(out_ptr3 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp28, xmask) | |
def get_args(): | |
arg_0 = rand_strided((1, 960, 1, 1), (960, 1, 960, 960), device='cuda:0', dtype=torch.float32) | |
arg_1 = rand_strided((128, 960, 14, 14), (188160, 196, 14, 1), device='cuda:0', dtype=torch.float16) | |
arg_2 = rand_strided((960,), (1,), device='cuda:0', dtype=torch.float32) | |
arg_3 = rand_strided((960,), (1,), device='cuda:0', dtype=torch.float32) | |
arg_4 = rand_strided((960,), (1,), device='cuda:0', dtype=torch.float32) | |
arg_5 = rand_strided((1, 960, 1, 1), (960, 1, 960, 960), device='cuda:0', dtype=torch.float32) | |
arg_6 = rand_strided((960,), (1,), device='cuda:0', dtype=torch.float32) | |
arg_7 = rand_strided((960,), (1,), device='cuda:0', dtype=torch.float32) | |
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7, | |
def call(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
stream0 = get_cuda_stream(0) | |
triton_red_fused__native_batch_norm_legit_functional_180.run(*args, 960, 25088, grid=grid(960), stream=stream0) | |
def benchmark_all_configs(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
return triton_red_fused__native_batch_norm_legit_functional_180.benchmark_all_configs(*args, 960, 25088, grid=grid(960)) | |
if __name__ == '__main__': | |
from torch._inductor.utils import get_num_bytes | |
from triton.testing import do_bench | |
args = get_args() | |
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9 | |
gb_per_s = num_gb / (ms / 1e3) | |
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
''') | |
# kernel path: /tmp/torchinductor_shunting/dy/cdyo4fykv4n564s3r3eahdxt2kdz67oeovon5bt5bw2lnq6iiowx.py | |
# Original ATen: aten._native_batch_norm_legit_functional, aten.add, aten.clone, aten.fill, aten.mul, aten.sigmoid, aten.sub | |
# aten._native_batch_norm_legit_functional => add_237, add_240, convert_element_type_307, convert_element_type_308, mul_363, mul_369, rsqrt_45, sub_45, var_mean_45 | |
# aten.add => add_322 | |
# aten.clone => clone_36 | |
# aten.fill => full_like_11 | |
# aten.mul => mul_627, mul_628 | |
# aten.sigmoid => sigmoid_75 | |
# aten.sub => sub_121 | |
# Registers the Triton source below with `async_compile.triton` and binds the
# result to
# `triton_poi_fused__native_batch_norm_legit_functional_add_clone_fill_mul_sigmoid_sub_181`.
#
# Embedded pointwise kernel, for each flat index x of a hard-coded
# 24084480-element range, with c = (x // 196) % 960:
#     y           = (in_ptr0[x] - in_ptr1[c]) * rsqrt(in_ptr2[c] / 25088.0 + 1e-05)
#                   * in_ptr3[c] + in_ptr4[c]
#     s           = sigmoid(y)
#     out_ptr0[x] = y
#     out_ptr1[x] = s * (y * (1.0 - s) + 1.0)
# The second output is algebraically the derivative of y * sigmoid(y) with
# respect to y.
# NOTE(review): presumably out_ptr1 is precomputed for a later backward pass
# and in_ptr1..4 are mean / sum of squared deviations / weight / bias - the
# names are not visible here; verify against the caller.
# Per the signature, in_ptr0 and both outputs are fp16 and in_ptr1..4 are
# fp32; the fp16 load is cast to float32 before the arithmetic. `xmask` is
# computed but every load/store passes `None` as mask.
#
# The string also carries a standalone benchmark harness (`get_args`, `call`,
# `benchmark_all_configs`, and a `__main__` section printing ms / GB / GB/s).
#
# NOTE(review): the trailing " | |" on every line looks like table residue
# from a web capture rather than generator output - confirm before executing.
triton_poi_fused__native_batch_norm_legit_functional_add_clone_fill_mul_sigmoid_sub_181 = async_compile.triton(''' | |
import triton | |
import triton.language as tl | |
from torch._inductor.ir import ReductionHint | |
from torch._inductor.ir import TileHint | |
from torch._inductor.triton_heuristics import pointwise | |
from torch._inductor.utils import instance_descriptor | |
from torch._dynamo.testing import rand_strided | |
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
import torch | |
from torch._inductor.triton_heuristics import grid | |
@pointwise(size_hints=[33554432], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp16', 6: '*fp16', 7: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7), equal_to_1=())]}) | |
@triton.jit | |
def triton_poi_fused__native_batch_norm_legit_functional_add_clone_fill_mul_sigmoid_sub_181(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, out_ptr0, out_ptr1, xnumel, XBLOCK : tl.constexpr): | |
xnumel = 24084480 | |
xoffset = tl.program_id(0) * XBLOCK | |
xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
xmask = xindex < xnumel | |
x3 = xindex | |
x1 = (xindex // 196) % 960 | |
tmp0 = tl.load(in_ptr0 + (x3), None).to(tl.float32) | |
tmp2 = tl.load(in_ptr1 + (x1), None) | |
tmp4 = tl.load(in_ptr2 + (x1), None) | |
tmp11 = tl.load(in_ptr3 + (x1), None) | |
tmp13 = tl.load(in_ptr4 + (x1), None) | |
tmp1 = tmp0.to(tl.float32) | |
tmp3 = tmp1 - tmp2 | |
tmp5 = 25088.0 | |
tmp6 = tmp4 / tmp5 | |
tmp7 = 1e-05 | |
tmp8 = tmp6 + tmp7 | |
tmp9 = tl.math.rsqrt(tmp8) | |
tmp10 = tmp3 * tmp9 | |
tmp12 = tmp10 * tmp11 | |
tmp14 = tmp12 + tmp13 | |
tmp15 = tmp14.to(tl.float32) | |
tmp16 = tl.sigmoid(tmp15) | |
tmp17 = 1.0 | |
tmp18 = tmp17 - tmp16 | |
tmp19 = tmp15 * tmp18 | |
tmp20 = tmp19 + tmp17 | |
tmp21 = tmp16 * tmp20 | |
tl.store(out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp15, None) | |
tl.store(out_ptr1 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp21, None) | |
def get_args(): | |
arg_0 = rand_strided((128, 960, 14, 14), (188160, 196, 14, 1), device='cuda:0', dtype=torch.float16) | |
arg_1 = rand_strided((1, 960, 1, 1), (960, 1, 960, 960), device='cuda:0', dtype=torch.float32) | |
arg_2 = rand_strided((1, 960, 1, 1), (960, 1, 960, 960), device='cuda:0', dtype=torch.float32) | |
arg_3 = rand_strided((960,), (1,), device='cuda:0', dtype=torch.float32) | |
arg_4 = rand_strided((960,), (1,), device='cuda:0', dtype=torch.float32) | |
arg_5 = rand_strided((128, 960, 14, 14), (188160, 196, 14, 1), device='cuda:0', dtype=torch.float16) | |
arg_6 = rand_strided((128, 960, 14, 14), (188160, 196, 14, 1), device='cuda:0', dtype=torch.float16) | |
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, | |
def call(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
stream0 = get_cuda_stream(0) | |
triton_poi_fused__native_batch_norm_legit_functional_add_clone_fill_mul_sigmoid_sub_181.run(*args, 24084480, grid=grid(24084480), stream=stream0) | |
def benchmark_all_configs(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
return triton_poi_fused__native_batch_norm_legit_functional_add_clone_fill_mul_sigmoid_sub_181.benchmark_all_configs(*args, 24084480, grid=grid(24084480)) | |
if __name__ == '__main__': | |
from torch._inductor.utils import get_num_bytes | |
from triton.testing import do_bench | |
args = get_args() | |
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9 | |
gb_per_s = num_gb / (ms / 1e3) | |
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
''') | |
# kernel path: /tmp/torchinductor_shunting/3n/c3n35ckjp4qebx5kiwvfxztqxmlk7uwm2vsaphijnyencaajb4gh.py | |
# Original ATen: aten._to_copy | |
# aten._to_copy => convert_element_type_311 | |
# Registers the Triton source below with `async_compile.triton` and binds the
# result to `triton_poi_fused__to_copy_182`.
#
# Embedded pointwise kernel: an element-for-element copy of a hard-coded
# 2160-element range from `in_ptr0` (declared fp32 in the signature) to
# `out_ptr0` (declared fp16). The load and the store are both guarded by
# `xmask = xindex < xnumel`.
# NOTE(review): the only explicit cast is `.to(tl.float32)`; the narrowing to
# fp16 presumably happens at the store to the fp16 pointer - confirm against
# Triton's `tl.store` semantics.
#
# The string also carries a standalone benchmark harness (`get_args`, `call`,
# `benchmark_all_configs`, and a `__main__` section printing ms / GB / GB/s);
# its sample tensors are (240, 1, 3, 3), fp32 in and fp16 out.
#
# NOTE(review): the trailing " | |" on every line looks like table residue
# from a web capture rather than generator output - confirm before executing.
triton_poi_fused__to_copy_182 = async_compile.triton(''' | |
import triton | |
import triton.language as tl | |
from torch._inductor.ir import ReductionHint | |
from torch._inductor.ir import TileHint | |
from torch._inductor.triton_heuristics import pointwise | |
from torch._inductor.utils import instance_descriptor | |
from torch._dynamo.testing import rand_strided | |
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
import torch | |
from torch._inductor.triton_heuristics import grid | |
@pointwise(size_hints=[4096], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]}) | |
@triton.jit | |
def triton_poi_fused__to_copy_182(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr): | |
xnumel = 2160 | |
xoffset = tl.program_id(0) * XBLOCK | |
xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
xmask = xindex < xnumel | |
x0 = xindex | |
tmp0 = tl.load(in_ptr0 + (x0), xmask) | |
tmp1 = tmp0.to(tl.float32) | |
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask) | |
def get_args(): | |
arg_0 = rand_strided((240, 1, 3, 3), (9, 9, 3, 1), device='cuda:0', dtype=torch.float32) | |
arg_1 = rand_strided((240, 1, 3, 3), (9, 9, 3, 1), device='cuda:0', dtype=torch.float16) | |
return arg_0, arg_1, | |
def call(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
stream0 = get_cuda_stream(0) | |
triton_poi_fused__to_copy_182.run(*args, 2160, grid=grid(2160), stream=stream0) | |
def benchmark_all_configs(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
return triton_poi_fused__to_copy_182.benchmark_all_configs(*args, 2160, grid=grid(2160)) | |
if __name__ == '__main__': | |
from torch._inductor.utils import get_num_bytes | |
from triton.testing import do_bench | |
args = get_args() | |
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9 | |
gb_per_s = num_gb / (ms / 1e3) | |
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
''') | |
# kernel path: /tmp/torchinductor_shunting/ow/cowqeeeuzgoqgwjjram7xczyaz465plzun33c7elszbys2k2zrzn.py | |
# Original ATen: aten.split_with_sizes | |
# aten.split_with_sizes => getitem_330 | |
# Registers the Triton source below with `async_compile.triton` and binds the
# result to `triton_poi_fused_split_with_sizes_183`.
#
# Embedded pointwise kernel, for each flat index x of a hard-coded
# 6021120-element range, with x0 = x % 47040 and x1 = x // 47040:
#     a           = in_ptr0[x0 + 188160 * x1]
#     out_ptr0[x] = a * sigmoid(a)
# i.e. despite the `split_with_sizes` name it also applies a SiLU while
# gathering the first 47040 elements out of every 188160-element row into a
# densely packed output. With the sample shapes in `get_args` (input
# (128, 960, 14, 14), output (128, 240, 14, 14)) that is the first 240 of
# 960 channels per batch entry. Both pointers are declared fp16; the loaded
# value is cast to float32 before the arithmetic. `xmask` is computed but the
# load/store pass `None` as mask.
#
# The string also carries a standalone benchmark harness (`get_args`, `call`,
# `benchmark_all_configs`, and a `__main__` section printing ms / GB / GB/s).
#
# NOTE(review): the trailing " | |" on every line looks like table residue
# from a web capture rather than generator output - confirm before executing.
triton_poi_fused_split_with_sizes_183 = async_compile.triton(''' | |
import triton | |
import triton.language as tl | |
from torch._inductor.ir import ReductionHint | |
from torch._inductor.ir import TileHint | |
from torch._inductor.triton_heuristics import pointwise | |
from torch._inductor.utils import instance_descriptor | |
from torch._dynamo.testing import rand_strided | |
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
import torch | |
from torch._inductor.triton_heuristics import grid | |
@pointwise(size_hints=[8388608], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]}) | |
@triton.jit | |
def triton_poi_fused_split_with_sizes_183(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr): | |
xnumel = 6021120 | |
xoffset = tl.program_id(0) * XBLOCK | |
xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
xmask = xindex < xnumel | |
x0 = xindex % 47040 | |
x1 = (xindex // 47040) | |
x2 = xindex | |
tmp0 = tl.load(in_ptr0 + (x0 + (188160*x1)), None).to(tl.float32) | |
tmp1 = tmp0.to(tl.float32) | |
tmp2 = tl.sigmoid(tmp1) | |
tmp3 = tmp1 * tmp2 | |
tmp4 = tmp3.to(tl.float32) | |
tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp4, None) | |
def get_args(): | |
arg_0 = rand_strided((128, 960, 14, 14), (188160, 196, 14, 1), device='cuda:0', dtype=torch.float16) | |
arg_1 = rand_strided((128, 240, 14, 14), (47040, 196, 14, 1), device='cuda:0', dtype=torch.float16) | |
return arg_0, arg_1, | |
def call(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
stream0 = get_cuda_stream(0) | |
triton_poi_fused_split_with_sizes_183.run(*args, 6021120, grid=grid(6021120), stream=stream0) | |
def benchmark_all_configs(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
return triton_poi_fused_split_with_sizes_183.benchmark_all_configs(*args, 6021120, grid=grid(6021120)) | |
if __name__ == '__main__': | |
from torch._inductor.utils import get_num_bytes | |
from triton.testing import do_bench | |
args = get_args() | |
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9 | |
gb_per_s = num_gb / (ms / 1e3) | |
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
''') | |
# kernel path: /tmp/torchinductor_shunting/qx/cqxoadvgndjsqu4bltg3sq7l3fij2isfiihadwqnkb3ibmloxfyz.py | |
# Original ATen: aten._to_copy | |
# aten._to_copy => convert_element_type_312 | |
# Pointwise copy kernel kept as Triton source text and handed to async_compile.triton.
# It loads in_ptr0[x0] and stores it to out_ptr0[x0] for x0 < 6000, masking
# out-of-range lanes with xmask. The signature declares a '*fp32' input and a
# '*fp16' output; get_args() builds matching (240, 1, 5, 5) buffers.
# NOTE(review): the only explicit cast is .to(tl.float32); the narrowing to fp16
# presumably happens inside tl.store to the fp16 pointer - confirm.
# The embedded source also defines get_args/call/benchmark_all_configs and a
# __main__ benchmark that prints elapsed ms, GB and GB/s.
triton_poi_fused__to_copy_184 = async_compile.triton(''' | |
import triton | |
import triton.language as tl | |
from torch._inductor.ir import ReductionHint | |
from torch._inductor.ir import TileHint | |
from torch._inductor.triton_heuristics import pointwise | |
from torch._inductor.utils import instance_descriptor | |
from torch._dynamo.testing import rand_strided | |
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
import torch | |
from torch._inductor.triton_heuristics import grid | |
@pointwise(size_hints=[8192], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]}) | |
@triton.jit | |
def triton_poi_fused__to_copy_184(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr): | |
xnumel = 6000 | |
xoffset = tl.program_id(0) * XBLOCK | |
xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
xmask = xindex < xnumel | |
x0 = xindex | |
tmp0 = tl.load(in_ptr0 + (x0), xmask) | |
tmp1 = tmp0.to(tl.float32) | |
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask) | |
def get_args(): | |
arg_0 = rand_strided((240, 1, 5, 5), (25, 25, 5, 1), device='cuda:0', dtype=torch.float32) | |
arg_1 = rand_strided((240, 1, 5, 5), (25, 25, 5, 1), device='cuda:0', dtype=torch.float16) | |
return arg_0, arg_1, | |
def call(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
stream0 = get_cuda_stream(0) | |
triton_poi_fused__to_copy_184.run(*args, 6000, grid=grid(6000), stream=stream0) | |
def benchmark_all_configs(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
return triton_poi_fused__to_copy_184.benchmark_all_configs(*args, 6000, grid=grid(6000)) | |
if __name__ == '__main__': | |
from torch._inductor.utils import get_num_bytes | |
from triton.testing import do_bench | |
args = get_args() | |
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9 | |
gb_per_s = num_gb / (ms / 1e3) | |
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
''') | |
# kernel path: /tmp/torchinductor_shunting/vk/cvkedu5ynhqgwwdblwjx7rst7rz42xs7fl3vaof4iorofkc5cxam.py | |
# Original ATen: aten.split_with_sizes | |
# aten.split_with_sizes => getitem_335 | |
# Pointwise kernel kept as Triton source text and handed to async_compile.triton.
# For each xindex it reads in_ptr0[47040 + x0 + 188160*x1] (x0 = xindex % 47040,
# x1 = xindex // 47040), computes x * sigmoid(x) in float32 and stores the result
# at out_ptr0[xindex]. xnumel is overwritten with the constant 6021120 inside the kernel.
# With the get_args() layout this reads the 47040 elements starting at offset 47040
# of each 188160-element batch row of a (128, 960, 14, 14) fp16 input and writes a
# contiguous (128, 240, 14, 14) fp16 output.
# NOTE(review): xmask is computed but never used; load and store pass None as the
# mask - confirm this is safe for every XBLOCK the pointwise heuristic may pick.
# The embedded source also defines get_args/call/benchmark_all_configs and a
# __main__ benchmark that prints elapsed ms, GB and GB/s.
triton_poi_fused_split_with_sizes_185 = async_compile.triton(''' | |
import triton | |
import triton.language as tl | |
from torch._inductor.ir import ReductionHint | |
from torch._inductor.ir import TileHint | |
from torch._inductor.triton_heuristics import pointwise | |
from torch._inductor.utils import instance_descriptor | |
from torch._dynamo.testing import rand_strided | |
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
import torch | |
from torch._inductor.triton_heuristics import grid | |
@pointwise(size_hints=[8388608], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]}) | |
@triton.jit | |
def triton_poi_fused_split_with_sizes_185(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr): | |
xnumel = 6021120 | |
xoffset = tl.program_id(0) * XBLOCK | |
xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
xmask = xindex < xnumel | |
x0 = xindex % 47040 | |
x1 = (xindex // 47040) | |
x2 = xindex | |
tmp0 = tl.load(in_ptr0 + (47040 + x0 + (188160*x1)), None).to(tl.float32) | |
tmp1 = tmp0.to(tl.float32) | |
tmp2 = tl.sigmoid(tmp1) | |
tmp3 = tmp1 * tmp2 | |
tmp4 = tmp3.to(tl.float32) | |
tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp4, None) | |
def get_args(): | |
arg_0 = rand_strided((128, 960, 14, 14), (188160, 196, 14, 1), device='cuda:0', dtype=torch.float16) | |
arg_1 = rand_strided((128, 240, 14, 14), (47040, 196, 14, 1), device='cuda:0', dtype=torch.float16) | |
return arg_0, arg_1, | |
def call(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
stream0 = get_cuda_stream(0) | |
triton_poi_fused_split_with_sizes_185.run(*args, 6021120, grid=grid(6021120), stream=stream0) | |
def benchmark_all_configs(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
return triton_poi_fused_split_with_sizes_185.benchmark_all_configs(*args, 6021120, grid=grid(6021120)) | |
if __name__ == '__main__': | |
from torch._inductor.utils import get_num_bytes | |
from triton.testing import do_bench | |
args = get_args() | |
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9 | |
gb_per_s = num_gb / (ms / 1e3) | |
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
''') | |
# kernel path: /tmp/torchinductor_shunting/nv/cnv3es7i7p4naxsxsc56c2w7slsaxha5zh6gcrrgizefpo7zps5s.py | |
# Original ATen: aten._to_copy | |
# aten._to_copy => convert_element_type_313 | |
# Pointwise copy kernel kept as Triton source text and handed to async_compile.triton.
# It loads in_ptr0[x0] and stores it to out_ptr0[x0] for x0 < 11760, masking
# out-of-range lanes with xmask. The signature declares a '*fp32' input and a
# '*fp16' output; get_args() builds matching (240, 1, 7, 7) buffers.
# NOTE(review): the only explicit cast is .to(tl.float32); the narrowing to fp16
# presumably happens inside tl.store to the fp16 pointer - confirm.
# The embedded source also defines get_args/call/benchmark_all_configs and a
# __main__ benchmark that prints elapsed ms, GB and GB/s.
triton_poi_fused__to_copy_186 = async_compile.triton(''' | |
import triton | |
import triton.language as tl | |
from torch._inductor.ir import ReductionHint | |
from torch._inductor.ir import TileHint | |
from torch._inductor.triton_heuristics import pointwise | |
from torch._inductor.utils import instance_descriptor | |
from torch._dynamo.testing import rand_strided | |
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
import torch | |
from torch._inductor.triton_heuristics import grid | |
@pointwise(size_hints=[16384], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]}) | |
@triton.jit | |
def triton_poi_fused__to_copy_186(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr): | |
xnumel = 11760 | |
xoffset = tl.program_id(0) * XBLOCK | |
xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
xmask = xindex < xnumel | |
x0 = xindex | |
tmp0 = tl.load(in_ptr0 + (x0), xmask) | |
tmp1 = tmp0.to(tl.float32) | |
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask) | |
def get_args(): | |
arg_0 = rand_strided((240, 1, 7, 7), (49, 49, 7, 1), device='cuda:0', dtype=torch.float32) | |
arg_1 = rand_strided((240, 1, 7, 7), (49, 49, 7, 1), device='cuda:0', dtype=torch.float16) | |
return arg_0, arg_1, | |
def call(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
stream0 = get_cuda_stream(0) | |
triton_poi_fused__to_copy_186.run(*args, 11760, grid=grid(11760), stream=stream0) | |
def benchmark_all_configs(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
return triton_poi_fused__to_copy_186.benchmark_all_configs(*args, 11760, grid=grid(11760)) | |
if __name__ == '__main__': | |
from torch._inductor.utils import get_num_bytes | |
from triton.testing import do_bench | |
args = get_args() | |
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9 | |
gb_per_s = num_gb / (ms / 1e3) | |
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
''') | |
# kernel path: /tmp/torchinductor_shunting/3q/c3qkjjd4hxw6ylyfib2kl5k6u5atp2kcpplda3gzr6acmswhzitj.py | |
# Original ATen: aten.split_with_sizes | |
# aten.split_with_sizes => getitem_340 | |
# Pointwise kernel kept as Triton source text and handed to async_compile.triton.
# For each xindex it reads in_ptr0[94080 + x0 + 188160*x1] (x0 = xindex % 47040,
# x1 = xindex // 47040), computes x * sigmoid(x) in float32 and stores the result
# at out_ptr0[xindex]. xnumel is overwritten with the constant 6021120 inside the kernel.
# With the get_args() layout this reads the 47040 elements starting at offset 94080
# of each 188160-element batch row of a (128, 960, 14, 14) fp16 input and writes a
# contiguous (128, 240, 14, 14) fp16 output.
# NOTE(review): xmask is computed but never used; load and store pass None as the
# mask - confirm this is safe for every XBLOCK the pointwise heuristic may pick.
# The embedded source also defines get_args/call/benchmark_all_configs and a
# __main__ benchmark that prints elapsed ms, GB and GB/s.
triton_poi_fused_split_with_sizes_187 = async_compile.triton(''' | |
import triton | |
import triton.language as tl | |
from torch._inductor.ir import ReductionHint | |
from torch._inductor.ir import TileHint | |
from torch._inductor.triton_heuristics import pointwise | |
from torch._inductor.utils import instance_descriptor | |
from torch._dynamo.testing import rand_strided | |
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
import torch | |
from torch._inductor.triton_heuristics import grid | |
@pointwise(size_hints=[8388608], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]}) | |
@triton.jit | |
def triton_poi_fused_split_with_sizes_187(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr): | |
xnumel = 6021120 | |
xoffset = tl.program_id(0) * XBLOCK | |
xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
xmask = xindex < xnumel | |
x0 = xindex % 47040 | |
x1 = (xindex // 47040) | |
x2 = xindex | |
tmp0 = tl.load(in_ptr0 + (94080 + x0 + (188160*x1)), None).to(tl.float32) | |
tmp1 = tmp0.to(tl.float32) | |
tmp2 = tl.sigmoid(tmp1) | |
tmp3 = tmp1 * tmp2 | |
tmp4 = tmp3.to(tl.float32) | |
tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp4, None) | |
def get_args(): | |
arg_0 = rand_strided((128, 960, 14, 14), (188160, 196, 14, 1), device='cuda:0', dtype=torch.float16) | |
arg_1 = rand_strided((128, 240, 14, 14), (47040, 196, 14, 1), device='cuda:0', dtype=torch.float16) | |
return arg_0, arg_1, | |
def call(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
stream0 = get_cuda_stream(0) | |
triton_poi_fused_split_with_sizes_187.run(*args, 6021120, grid=grid(6021120), stream=stream0) | |
def benchmark_all_configs(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
return triton_poi_fused_split_with_sizes_187.benchmark_all_configs(*args, 6021120, grid=grid(6021120)) | |
if __name__ == '__main__': | |
from torch._inductor.utils import get_num_bytes | |
from triton.testing import do_bench | |
args = get_args() | |
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9 | |
gb_per_s = num_gb / (ms / 1e3) | |
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
''') | |
# kernel path: /tmp/torchinductor_shunting/l7/cl7v5hh5jck72nsc2u3t2ieraqxarpadxdpy5nt2hkbbqrgcinn7.py | |
# Original ATen: aten._to_copy | |
# aten._to_copy => convert_element_type_314 | |
# Pointwise copy kernel kept as Triton source text and handed to async_compile.triton.
# It loads in_ptr0[x0] and stores it to out_ptr0[x0] for x0 < 19440, masking
# out-of-range lanes with xmask. The signature declares a '*fp32' input and a
# '*fp16' output; get_args() builds matching (240, 1, 9, 9) buffers.
# NOTE(review): the only explicit cast is .to(tl.float32); the narrowing to fp16
# presumably happens inside tl.store to the fp16 pointer - confirm.
# The embedded source also defines get_args/call/benchmark_all_configs and a
# __main__ benchmark that prints elapsed ms, GB and GB/s.
triton_poi_fused__to_copy_188 = async_compile.triton(''' | |
import triton | |
import triton.language as tl | |
from torch._inductor.ir import ReductionHint | |
from torch._inductor.ir import TileHint | |
from torch._inductor.triton_heuristics import pointwise | |
from torch._inductor.utils import instance_descriptor | |
from torch._dynamo.testing import rand_strided | |
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
import torch | |
from torch._inductor.triton_heuristics import grid | |
@pointwise(size_hints=[32768], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]}) | |
@triton.jit | |
def triton_poi_fused__to_copy_188(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr): | |
xnumel = 19440 | |
xoffset = tl.program_id(0) * XBLOCK | |
xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
xmask = xindex < xnumel | |
x0 = xindex | |
tmp0 = tl.load(in_ptr0 + (x0), xmask) | |
tmp1 = tmp0.to(tl.float32) | |
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask) | |
def get_args(): | |
arg_0 = rand_strided((240, 1, 9, 9), (81, 81, 9, 1), device='cuda:0', dtype=torch.float32) | |
arg_1 = rand_strided((240, 1, 9, 9), (81, 81, 9, 1), device='cuda:0', dtype=torch.float16) | |
return arg_0, arg_1, | |
def call(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
stream0 = get_cuda_stream(0) | |
triton_poi_fused__to_copy_188.run(*args, 19440, grid=grid(19440), stream=stream0) | |
def benchmark_all_configs(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
return triton_poi_fused__to_copy_188.benchmark_all_configs(*args, 19440, grid=grid(19440)) | |
if __name__ == '__main__': | |
from torch._inductor.utils import get_num_bytes | |
from triton.testing import do_bench | |
args = get_args() | |
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9 | |
gb_per_s = num_gb / (ms / 1e3) | |
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
''') | |
# kernel path: /tmp/torchinductor_shunting/xm/cxm7drxzaido67j4aqf4vqqzkyw7ffgx32ygj3ddi3kmzigoauhg.py | |
# Original ATen: aten.split_with_sizes | |
# aten.split_with_sizes => getitem_345 | |
# Pointwise kernel kept as Triton source text and handed to async_compile.triton.
# For each xindex it reads in_ptr0[141120 + x0 + 188160*x1] (x0 = xindex % 47040,
# x1 = xindex // 47040), computes x * sigmoid(x) in float32 and stores the result
# at out_ptr0[xindex]. xnumel is overwritten with the constant 6021120 inside the kernel.
# With the get_args() layout this reads the 47040 elements starting at offset 141120
# of each 188160-element batch row of a (128, 960, 14, 14) fp16 input and writes a
# contiguous (128, 240, 14, 14) fp16 output.
# NOTE(review): xmask is computed but never used; load and store pass None as the
# mask - confirm this is safe for every XBLOCK the pointwise heuristic may pick.
# The embedded source also defines get_args/call/benchmark_all_configs and a
# __main__ benchmark that prints elapsed ms, GB and GB/s.
triton_poi_fused_split_with_sizes_189 = async_compile.triton(''' | |
import triton | |
import triton.language as tl | |
from torch._inductor.ir import ReductionHint | |
from torch._inductor.ir import TileHint | |
from torch._inductor.triton_heuristics import pointwise | |
from torch._inductor.utils import instance_descriptor | |
from torch._dynamo.testing import rand_strided | |
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
import torch | |
from torch._inductor.triton_heuristics import grid | |
@pointwise(size_hints=[8388608], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]}) | |
@triton.jit | |
def triton_poi_fused_split_with_sizes_189(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr): | |
xnumel = 6021120 | |
xoffset = tl.program_id(0) * XBLOCK | |
xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
xmask = xindex < xnumel | |
x0 = xindex % 47040 | |
x1 = (xindex // 47040) | |
x2 = xindex | |
tmp0 = tl.load(in_ptr0 + (141120 + x0 + (188160*x1)), None).to(tl.float32) | |
tmp1 = tmp0.to(tl.float32) | |
tmp2 = tl.sigmoid(tmp1) | |
tmp3 = tmp1 * tmp2 | |
tmp4 = tmp3.to(tl.float32) | |
tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp4, None) | |
def get_args(): | |
arg_0 = rand_strided((128, 960, 14, 14), (188160, 196, 14, 1), device='cuda:0', dtype=torch.float16) | |
arg_1 = rand_strided((128, 240, 14, 14), (47040, 196, 14, 1), device='cuda:0', dtype=torch.float16) | |
return arg_0, arg_1, | |
def call(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
stream0 = get_cuda_stream(0) | |
triton_poi_fused_split_with_sizes_189.run(*args, 6021120, grid=grid(6021120), stream=stream0) | |
def benchmark_all_configs(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
return triton_poi_fused_split_with_sizes_189.benchmark_all_configs(*args, 6021120, grid=grid(6021120)) | |
if __name__ == '__main__': | |
from torch._inductor.utils import get_num_bytes | |
from triton.testing import do_bench | |
args = get_args() | |
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9 | |
gb_per_s = num_gb / (ms / 1e3) | |
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
''') | |
# kernel path: /tmp/torchinductor_shunting/ny/cnyr36xeex2s2grdckurzoliwfqkdakrsvrmgx5fd6rzjncjxedb.py | |
# Original ATen: aten.cat | |
# aten.cat => cat_34 | |
# Pointwise copy kernel kept as Triton source text and handed to async_compile.triton.
# For each xindex it reads in_ptr0[xindex] and writes it to
# out_ptr0[x0 + 47040*x1] with x0 = xindex % 11760 and x1 = xindex // 11760, i.e.
# it re-strides rows of 11760 contiguous elements to a row pitch of 47040.
# get_args() models this as a contiguous (128, 240, 7, 7) fp16 input and an output
# of the same shape whose batch stride is 47040.
# NOTE(review): presumably the output is one 240-channel slot of a wider
# concatenation buffer; the slot's base offset is not visible here - confirm at the call site.
# NOTE(review): xmask is computed but never used; load and store pass None as the mask.
# The embedded source also defines get_args/call/benchmark_all_configs and a
# __main__ benchmark that prints elapsed ms, GB and GB/s.
triton_poi_fused_cat_190 = async_compile.triton(''' | |
import triton | |
import triton.language as tl | |
from torch._inductor.ir import ReductionHint | |
from torch._inductor.ir import TileHint | |
from torch._inductor.triton_heuristics import pointwise | |
from torch._inductor.utils import instance_descriptor | |
from torch._dynamo.testing import rand_strided | |
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
import torch | |
from torch._inductor.triton_heuristics import grid | |
@pointwise(size_hints=[2097152], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]}) | |
@triton.jit | |
def triton_poi_fused_cat_190(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr): | |
xnumel = 1505280 | |
xoffset = tl.program_id(0) * XBLOCK | |
xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
xmask = xindex < xnumel | |
x2 = xindex | |
x0 = xindex % 11760 | |
x1 = (xindex // 11760) | |
tmp0 = tl.load(in_ptr0 + (x2), None).to(tl.float32) | |
tl.store(out_ptr0 + (x0 + (47040*x1) + tl.zeros([XBLOCK], tl.int32)), tmp0, None) | |
def get_args(): | |
arg_0 = rand_strided((128, 240, 7, 7), (11760, 49, 7, 1), device='cuda:0', dtype=torch.float16) | |
arg_1 = rand_strided((128, 240, 7, 7), (47040, 49, 7, 1), device='cuda:0', dtype=torch.float16) | |
return arg_0, arg_1, | |
def call(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
stream0 = get_cuda_stream(0) | |
triton_poi_fused_cat_190.run(*args, 1505280, grid=grid(1505280), stream=stream0) | |
def benchmark_all_configs(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
return triton_poi_fused_cat_190.benchmark_all_configs(*args, 1505280, grid=grid(1505280)) | |
if __name__ == '__main__': | |
from torch._inductor.utils import get_num_bytes | |
from triton.testing import do_bench | |
args = get_args() | |
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9 | |
gb_per_s = num_gb / (ms / 1e3) | |
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
''') | |
# kernel path: /tmp/torchinductor_shunting/z6/cz6ieqk3sfzltam3yeijfwdqrtao7w4zqdsqj32cpu6zoxg3irdq.py | |
# Original ATen: aten._native_batch_norm_legit_functional | |
# aten._native_batch_norm_legit_functional => add_242, add_243, add_244, convert_element_type_315, mul_372, mul_373, mul_374, mul_375, mul_376, rsqrt_46, squeeze_139, var_mean_46 | |
# Looped reduction kernel kept as Triton source text and handed to async_compile.triton.
# For each of xnumel = 960 rows x0 it reduces rnumel = 6272 values addressed as
# in_ptr0[r1 + 49*x0 + 47040*r2] (r1 = rindex % 49, r2 = rindex // 49), in two passes:
#   pass 1: sum -> tmp4 = sum / 6272.0, stored to in_out_ptr0;
#           0.1*tmp4 + 0.9*in_ptr1[x0] stored to out_ptr0.
#   pass 2: sum of (value - tmp4)^2 -> tmp15, stored raw to out_ptr1;
#           tmp17 = tmp15 / 6272.0; rsqrt(tmp17 + 1e-05) stored to out_ptr2;
#           0.1*(tmp17 * 1.0001594642002871) + 0.9*in_ptr2[x0] stored to out_ptr3.
# Positional mapping per get_args(): arg_0 -> in_out_ptr0, arg_1 -> in_ptr0
# ((128, 960, 7, 7) fp16), arg_2/arg_3 -> in_ptr1/in_ptr2, arg_4..arg_7 -> out_ptr0..out_ptr3.
# NOTE(review): the 0.1/0.9 blends look like running-mean/variance updates with
# momentum 0.1, and 1.0001594642002871 presumably equals 6272/6271 (unbiased-variance
# correction) - confirm against the originating batch-norm call.
# NOTE(review): the out_ptr1 store indexes with plain x0, without the tl.zeros
# broadcast term the other stores use.
# The embedded source also defines get_args/call/benchmark_all_configs and a
# __main__ benchmark (num_in_out_args=1) that prints elapsed ms, GB and GB/s.
triton_red_fused__native_batch_norm_legit_functional_191 = async_compile.triton(''' | |
import triton | |
import triton.language as tl | |
from torch._inductor.ir import ReductionHint | |
from torch._inductor.ir import TileHint | |
from torch._inductor.triton_heuristics import reduction | |
from torch._inductor.utils import instance_descriptor | |
from torch._dynamo.testing import rand_strided | |
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
import torch | |
from torch._inductor.triton_heuristics import grid | |
@reduction( | |
size_hints=[1024, 8192], | |
reduction_hint=ReductionHint.INNER, | |
filename=__file__, | |
meta={'signature': {0: '*fp32', 1: '*fp16', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: 'i32', 9: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9), equal_to_1=())]} | |
) | |
@triton.jit | |
def triton_red_fused__native_batch_norm_legit_functional_191(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, out_ptr0, out_ptr1, out_ptr2, out_ptr3, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr): | |
xnumel = 960 | |
rnumel = 6272 | |
xoffset = tl.program_id(0) * XBLOCK | |
xindex = xoffset + tl.arange(0, XBLOCK)[:, None] | |
xmask = xindex < xnumel | |
rbase = tl.arange(0, RBLOCK)[None, :] | |
x0 = xindex | |
_tmp2 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0 | |
for roffset in range(0, rnumel, RBLOCK): | |
rindex = roffset + rbase | |
rmask = rindex < rnumel | |
r1 = rindex % 49 | |
r2 = (rindex // 49) | |
tmp0 = tl.load(in_ptr0 + (r1 + (49*x0) + (47040*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32) | |
tmp1 = tmp0.to(tl.float32) | |
_tmp2 = tl.where(rmask & xmask, _tmp2 + tmp1, _tmp2) | |
tmp2 = tl.sum(_tmp2, 1)[:, None] | |
tmp7 = tl.load(in_ptr1 + (x0), xmask) | |
tmp3 = 6272.0 | |
tmp4 = tmp2 / tmp3 | |
tmp5 = 0.1 | |
tmp6 = tmp4 * tmp5 | |
tmp8 = 0.9 | |
tmp9 = tmp7 * tmp8 | |
tmp10 = tmp6 + tmp9 | |
tl.store(in_out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp4, xmask) | |
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp10, xmask) | |
_tmp15 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0 | |
for roffset in range(0, rnumel, RBLOCK): | |
rindex = roffset + rbase | |
rmask = rindex < rnumel | |
r1 = rindex % 49 | |
r2 = (rindex // 49) | |
tmp11 = tl.load(in_ptr0 + (r1 + (49*x0) + (47040*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32) | |
tmp12 = tmp11.to(tl.float32) | |
tmp13 = tmp12 - tmp4 | |
tmp14 = tmp13 * tmp13 | |
_tmp15 = tl.where(rmask & xmask, _tmp15 + tmp14, _tmp15) | |
tmp15 = tl.sum(_tmp15, 1)[:, None] | |
tl.store(out_ptr1 + x0, tmp15, xmask) | |
tmp25 = tl.load(in_ptr2 + (x0), xmask) | |
tmp16 = 6272.0 | |
tmp17 = tmp15 / tmp16 | |
tmp18 = 1e-05 | |
tmp19 = tmp17 + tmp18 | |
tmp20 = tl.math.rsqrt(tmp19) | |
tmp21 = 1.0001594642002871 | |
tmp22 = tmp17 * tmp21 | |
tmp23 = 0.1 | |
tmp24 = tmp22 * tmp23 | |
tmp26 = 0.9 | |
tmp27 = tmp25 * tmp26 | |
tmp28 = tmp24 + tmp27 | |
tl.store(out_ptr2 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp20, xmask) | |
tl.store(out_ptr3 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp28, xmask) | |
def get_args(): | |
arg_0 = rand_strided((1, 960, 1, 1), (960, 1, 960, 960), device='cuda:0', dtype=torch.float32) | |
arg_1 = rand_strided((128, 960, 7, 7), (47040, 49, 7, 1), device='cuda:0', dtype=torch.float16) | |
arg_2 = rand_strided((960,), (1,), device='cuda:0', dtype=torch.float32) | |
arg_3 = rand_strided((960,), (1,), device='cuda:0', dtype=torch.float32) | |
arg_4 = rand_strided((960,), (1,), device='cuda:0', dtype=torch.float32) | |
arg_5 = rand_strided((1, 960, 1, 1), (960, 1, 960, 960), device='cuda:0', dtype=torch.float32) | |
arg_6 = rand_strided((960,), (1,), device='cuda:0', dtype=torch.float32) | |
arg_7 = rand_strided((960,), (1,), device='cuda:0', dtype=torch.float32) | |
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7, | |
def call(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
stream0 = get_cuda_stream(0) | |
triton_red_fused__native_batch_norm_legit_functional_191.run(*args, 960, 6272, grid=grid(960), stream=stream0) | |
def benchmark_all_configs(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
return triton_red_fused__native_batch_norm_legit_functional_191.benchmark_all_configs(*args, 960, 6272, grid=grid(960)) | |
if __name__ == '__main__': | |
from torch._inductor.utils import get_num_bytes | |
from triton.testing import do_bench | |
args = get_args() | |
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9 | |
gb_per_s = num_gb / (ms / 1e3) | |
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
''') | |
# kernel path: /tmp/torchinductor_shunting/6y/c6ybh5xdfuwq6wdolxlmurgpxtgtn6q5shvsfwt5gf4rdpz7tas7.py | |
# Original ATen: aten._native_batch_norm_legit_functional, aten.mean, aten.silu | |
# aten._native_batch_norm_legit_functional => add_242, add_245, convert_element_type_315, convert_element_type_316, mul_371, mul_377, rsqrt_46, sub_46, var_mean_46 | |
# aten.mean => mean_12 | |
# aten.silu => convert_element_type_317, convert_element_type_318, mul_378, sigmoid_49 | |
# Persistent (single-pass) reduction kernel kept as Triton source text and handed
# to async_compile.triton. Each of xnumel = 122880 rows x3 holds rnumel = 49 values
# at in_ptr0[r2 + 49*x3]; RBLOCK is fixed at 64 and rmask disables the 15 padding lanes.
# Per element, with x0 = xindex % 960 selecting the per-row parameters:
#   y = (value - in_ptr1[x0]) * rsqrt(in_ptr2[x0] / 6272.0 + 1e-05) * in_ptr3[x0] + in_ptr4[x0]
# y is stored to out_ptr0 at the same index as the input; y * sigmoid(y) is summed
# over the 49 lanes, divided by 49.0 and stored to out_ptr2[x3].
# NOTE(review): in_ptr1 is presumably the per-channel mean and in_ptr2 the raw sum of
# squared deviations (it is divided by 6272.0 here); in_ptr3/in_ptr4 presumably the
# affine weight and bias - confirm at the call site.
# NOTE(review): out_ptr0 receives the pre-activation y (tmp15), not the SiLU output.
# NOTE(review): xmask is computed but unused; loads/stores are masked only by rmask or None.
# The embedded source also defines get_args/call/benchmark_all_configs and a
# __main__ benchmark that prints elapsed ms, GB and GB/s.
triton_per_fused__native_batch_norm_legit_functional_mean_silu_192 = async_compile.triton(''' | |
import triton | |
import triton.language as tl | |
from torch._inductor.ir import ReductionHint | |
from torch._inductor.ir import TileHint | |
from torch._inductor.triton_heuristics import persistent_reduction | |
from torch._inductor.utils import instance_descriptor | |
from torch._dynamo.testing import rand_strided | |
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
import torch | |
from torch._inductor.triton_heuristics import grid | |
@persistent_reduction( | |
size_hints=[131072, 64], | |
reduction_hint=ReductionHint.INNER, | |
filename=__file__, | |
meta={'signature': {0: '*fp16', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp16', 6: '*fp16', 7: 'i32', 8: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7), equal_to_1=())]} | |
) | |
@triton.jit | |
def triton_per_fused__native_batch_norm_legit_functional_mean_silu_192(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, out_ptr0, out_ptr2, xnumel, rnumel, XBLOCK : tl.constexpr): | |
xnumel = 122880 | |
rnumel = 49 | |
RBLOCK: tl.constexpr = 64 | |
xoffset = tl.program_id(0) * XBLOCK | |
xindex = xoffset + tl.arange(0, XBLOCK)[:, None] | |
xmask = xindex < xnumel | |
rindex = tl.arange(0, RBLOCK)[None, :] | |
rmask = rindex < rnumel | |
r2 = rindex | |
x3 = xindex | |
x0 = xindex % 960 | |
tmp0 = tl.load(in_ptr0 + (r2 + (49*x3)), rmask, other=0).to(tl.float32) | |
tmp2 = tl.load(in_ptr1 + (x0), None) | |
tmp4 = tl.load(in_ptr2 + (x0), None) | |
tmp11 = tl.load(in_ptr3 + (x0), None) | |
tmp13 = tl.load(in_ptr4 + (x0), None) | |
tmp1 = tmp0.to(tl.float32) | |
tmp3 = tmp1 - tmp2 | |
tmp5 = 6272.0 | |
tmp6 = tmp4 / tmp5 | |
tmp7 = 1e-05 | |
tmp8 = tmp6 + tmp7 | |
tmp9 = tl.math.rsqrt(tmp8) | |
tmp10 = tmp3 * tmp9 | |
tmp12 = tmp10 * tmp11 | |
tmp14 = tmp12 + tmp13 | |
tmp15 = tmp14.to(tl.float32) | |
tmp16 = tmp15.to(tl.float32) | |
tmp17 = tl.sigmoid(tmp16) | |
tmp18 = tmp16 * tmp17 | |
tmp19 = tmp18.to(tl.float32) | |
tmp20 = tmp19.to(tl.float32) | |
tmp22 = tl.where(rmask, tmp20, 0) | |
tmp23 = tl.sum(tmp22, 1)[:, None] | |
tmp24 = 49.0 | |
tmp25 = tmp23 / tmp24 | |
tmp26 = tmp25.to(tl.float32) | |
tl.store(out_ptr0 + (r2 + (49*x3) + tl.zeros([XBLOCK, RBLOCK], tl.int32)), tmp15, rmask) | |
tl.store(out_ptr2 + (x3 + tl.zeros([XBLOCK, 1], tl.int32)), tmp26, None) | |
def get_args(): | |
arg_0 = rand_strided((128, 960, 7, 7), (47040, 49, 7, 1), device='cuda:0', dtype=torch.float16) | |
arg_1 = rand_strided((1, 960, 1, 1), (960, 1, 960, 960), device='cuda:0', dtype=torch.float32) | |
arg_2 = rand_strided((1, 960, 1, 1), (960, 1, 960, 960), device='cuda:0', dtype=torch.float32) | |
arg_3 = rand_strided((960,), (1,), device='cuda:0', dtype=torch.float32) | |
arg_4 = rand_strided((960,), (1,), device='cuda:0', dtype=torch.float32) | |
arg_5 = rand_strided((128, 960, 7, 7), (47040, 49, 7, 1), device='cuda:0', dtype=torch.float16) | |
arg_6 = rand_strided((128, 960, 1, 1), (960, 1, 1, 1), device='cuda:0', dtype=torch.float16) | |
return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, | |
def call(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
stream0 = get_cuda_stream(0) | |
triton_per_fused__native_batch_norm_legit_functional_mean_silu_192.run(*args, 122880, 49, grid=grid(122880), stream=stream0) | |
def benchmark_all_configs(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
return triton_per_fused__native_batch_norm_legit_functional_mean_silu_192.benchmark_all_configs(*args, 122880, 49, grid=grid(122880)) | |
if __name__ == '__main__': | |
from torch._inductor.utils import get_num_bytes | |
from triton.testing import do_bench | |
args = get_args() | |
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9 | |
gb_per_s = num_gb / (ms / 1e3) | |
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
''') | |
# kernel path: /tmp/torchinductor_shunting/tl/ctlao5qdd2i3xcmliczx6rwjunhhywrgkmmolniswyndldbl73bm.py | |
# Original ATen: aten._to_copy | |
# aten._to_copy => convert_element_type_320 | |
# Pointwise copy kernel kept as Triton source text and handed to async_compile.triton.
# It loads in_ptr0[x0] and stores it to out_ptr0[x0] for x0 < 76800, masking
# out-of-range lanes with xmask. The signature declares a '*fp32' input and a
# '*fp16' output; get_args() builds matching (80, 960, 1, 1) buffers.
# NOTE(review): the only explicit cast is .to(tl.float32); the narrowing to fp16
# presumably happens inside tl.store to the fp16 pointer - confirm.
# The embedded source also defines get_args/call/benchmark_all_configs and a
# __main__ benchmark that prints elapsed ms, GB and GB/s.
triton_poi_fused__to_copy_193 = async_compile.triton(''' | |
import triton | |
import triton.language as tl | |
from torch._inductor.ir import ReductionHint | |
from torch._inductor.ir import TileHint | |
from torch._inductor.triton_heuristics import pointwise | |
from torch._inductor.utils import instance_descriptor | |
from torch._dynamo.testing import rand_strided | |
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
import torch | |
from torch._inductor.triton_heuristics import grid | |
@pointwise(size_hints=[131072], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]}) | |
@triton.jit | |
def triton_poi_fused__to_copy_193(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr): | |
xnumel = 76800 | |
xoffset = tl.program_id(0) * XBLOCK | |
xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
xmask = xindex < xnumel | |
x0 = xindex | |
tmp0 = tl.load(in_ptr0 + (x0), xmask) | |
tmp1 = tmp0.to(tl.float32) | |
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask) | |
def get_args(): | |
arg_0 = rand_strided((80, 960, 1, 1), (960, 1, 1, 1), device='cuda:0', dtype=torch.float32) | |
arg_1 = rand_strided((80, 960, 1, 1), (960, 1, 1, 1), device='cuda:0', dtype=torch.float16) | |
return arg_0, arg_1, | |
def call(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
stream0 = get_cuda_stream(0) | |
triton_poi_fused__to_copy_193.run(*args, 76800, grid=grid(76800), stream=stream0) | |
def benchmark_all_configs(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
return triton_poi_fused__to_copy_193.benchmark_all_configs(*args, 76800, grid=grid(76800)) | |
if __name__ == '__main__': | |
from torch._inductor.utils import get_num_bytes | |
from triton.testing import do_bench | |
args = get_args() | |
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9 | |
gb_per_s = num_gb / (ms / 1e3) | |
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
''') | |
# kernel path: /tmp/torchinductor_shunting/b4/cb4kobddnmpxrqkwjrmqktjtkp5r7xvtzsia2m5pxlvyngvl3gez.py | |
# Original ATen: aten._to_copy | |
# aten._to_copy => convert_element_type_324 | |
# Pointwise copy kernel kept as Triton source text and handed to async_compile.triton.
# It loads in_ptr0[x0] and stores it to out_ptr0[x0] for x0 < 76800, masking
# out-of-range lanes with xmask. The signature declares a '*fp32' input and a
# '*fp16' output; get_args() builds matching (960, 80, 1, 1) buffers.
# NOTE(review): the only explicit cast is .to(tl.float32); the narrowing to fp16
# presumably happens inside tl.store to the fp16 pointer - confirm.
# The embedded source also defines get_args/call/benchmark_all_configs and a
# __main__ benchmark that prints elapsed ms, GB and GB/s.
triton_poi_fused__to_copy_194 = async_compile.triton(''' | |
import triton | |
import triton.language as tl | |
from torch._inductor.ir import ReductionHint | |
from torch._inductor.ir import TileHint | |
from torch._inductor.triton_heuristics import pointwise | |
from torch._inductor.utils import instance_descriptor | |
from torch._dynamo.testing import rand_strided | |
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream | |
import torch | |
from torch._inductor.triton_heuristics import grid | |
@pointwise(size_hints=[131072], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]}) | |
@triton.jit | |
def triton_poi_fused__to_copy_194(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr): | |
xnumel = 76800 | |
xoffset = tl.program_id(0) * XBLOCK | |
xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
xmask = xindex < xnumel | |
x0 = xindex | |
tmp0 = tl.load(in_ptr0 + (x0), xmask) | |
tmp1 = tmp0.to(tl.float32) | |
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask) | |
def get_args(): | |
arg_0 = rand_strided((960, 80, 1, 1), (80, 1, 1, 1), device='cuda:0', dtype=torch.float32) | |
arg_1 = rand_strided((960, 80, 1, 1), (80, 1, 1, 1), device='cuda:0', dtype=torch.float16) | |
return arg_0, arg_1, | |
def call(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
stream0 = get_cuda_stream(0) | |
triton_poi_fused__to_copy_194.run(*args, 76800, grid=grid(76800), stream=stream0) | |
def benchmark_all_configs(args): | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) | |
return triton_poi_fused__to_copy_194.benchmark_all_configs(*args, 76800, grid=grid(76800)) | |
if __name__ == '__main__': | |
from torch._inductor.utils import get_num_bytes | |
from triton.testing import do_bench | |
args = get_args() | |
ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0] | |
num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9 | |
gb_per_s = num_gb / (ms / 1e3) | |
print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s") | |
''') | |
# kernel path: /tmp/torchinductor_shunting/5n/c5nkvlxqc6zdiwvvnuciiwj3rs2idxp5remuu4oe4qksftnrsipg.py
# Original ATen: aten._to_copy, aten.convolution
# aten._to_copy => convert_element_type_323
# aten.convolution => convolution_125
#
# Pointwise kernel over a hard-coded 960 elements: reads in_ptr0 (declared
# '*fp32') and writes the same index of out_ptr0 (declared '*fp16'). The
# benchmark harness feeds it 1-D tensors of shape (960,).
# NOTE(review): indentation and blank lines inside the string were
# reconstructed from a whitespace-stripped copy; compare with the file at the
# kernel path above if byte-for-byte equality of the source matters.
triton_poi_fused__to_copy_convolution_195 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor

from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[1024], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_convolution_195(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 960
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)


def get_args():
    arg_0 = rand_strided((960,), (1,), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((960,), (1,), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_convolution_195.run(*args, 960, grid=grid(960), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_convolution_195.benchmark_all_configs(*args, 960, grid=grid(960))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/5g/c5gxudeqor4giltbrwmymcii3cqccfm32ibjrioytb2cth76irpn.py
# Original ATen: aten._to_copy, aten.convolution
# aten._to_copy => convert_element_type_323
# aten.convolution => convolution_125
#
# In-place pointwise add over a hard-coded 122880 elements:
#     in_out_ptr0[i] = in_out_ptr0[i] + in_ptr0[i % 960]
# Both pointers are declared '*fp16'; operands are widened to fp32 for the add.
# xmask is computed but every load/store passes None as its mask --
# presumably because 122880 is a multiple of every candidate XBLOCK; confirm.
# NOTE(review): indentation and blank lines inside the string were
# reconstructed from a whitespace-stripped copy; compare with the file at the
# kernel path above if byte-for-byte equality of the source matters.
triton_poi_fused__to_copy_convolution_196 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor

from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[131072], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_convolution_196(in_out_ptr0, in_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 122880
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x2 = xindex
    x0 = xindex % 960
    tmp0 = tl.load(in_out_ptr0 + (x2), None).to(tl.float32)
    tmp1 = tl.load(in_ptr0 + (x0), None).to(tl.float32)
    tmp2 = tmp0 + tmp1
    tl.store(in_out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp2, None)


def get_args():
    arg_0 = rand_strided((128, 960, 1, 1), (960, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((960,), (1,), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_convolution_196.run(*args, 122880, grid=grid(122880), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_convolution_196.benchmark_all_configs(*args, 122880, grid=grid(122880))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/u5/cu5blwuhanuznefsomv5iwcn6emdzdpn2ln3p27yycjvkiw6g2du.py
# Original ATen: aten.mul, aten.sigmoid, aten.silu
# aten.mul => mul_380
# aten.sigmoid => sigmoid_51
# aten.silu => convert_element_type_317, convert_element_type_318, mul_378, sigmoid_49
#
# Pointwise kernel over a hard-coded 6021120 elements:
#     out_ptr0[i] = (a * sigmoid(a)) * sigmoid(in_ptr1[i // 49]),  a = in_ptr0[i]
# in_ptr1 is indexed by i // 49, so one in_ptr1 value is shared by each run of
# 49 consecutive elements (the harness uses 7x7 trailing dims). All pointers
# are declared '*fp16'; the math is done on fp32-widened values.
# xmask is computed but every load/store passes None as its mask --
# presumably because xnumel is a multiple of every candidate XBLOCK; confirm.
# NOTE(review): indentation and blank lines inside the string were
# reconstructed from a whitespace-stripped copy; compare with the file at the
# kernel path above if byte-for-byte equality of the source matters.
triton_poi_fused_mul_sigmoid_silu_197 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor

from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[8388608], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3), equal_to_1=())]})
@triton.jit
def triton_poi_fused_mul_sigmoid_silu_197(in_ptr0, in_ptr1, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 6021120
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x2 = xindex
    x1 = (xindex // 49)
    tmp0 = tl.load(in_ptr0 + (x2), None).to(tl.float32)
    tmp5 = tl.load(in_ptr1 + (x1), None).to(tl.float32)
    tmp1 = tmp0.to(tl.float32)
    tmp2 = tl.sigmoid(tmp1)
    tmp3 = tmp1 * tmp2
    tmp4 = tmp3.to(tl.float32)
    tmp6 = tl.sigmoid(tmp5)
    tmp7 = tmp4 * tmp6
    tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp7, None)


def get_args():
    arg_0 = rand_strided((128, 960, 7, 7), (47040, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 960, 1, 1), (960, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    arg_2 = rand_strided((128, 960, 7, 7), (47040, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1, arg_2,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused_mul_sigmoid_silu_197.run(*args, 6021120, grid=grid(6021120), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused_mul_sigmoid_silu_197.benchmark_all_configs(*args, 6021120, grid=grid(6021120))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/ow/cow7m27226bnhxi6lshzaavlwewllidrdyuy4wj4ablq5nvozskp.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_325
#
# Pointwise kernel over a hard-coded 253440 elements: reads in_ptr0 (declared
# '*fp32') and writes the same index of out_ptr0 (declared '*fp16'). The
# benchmark harness feeds it tensors of shape (264, 960, 1, 1).
# NOTE(review): indentation and blank lines inside the string were
# reconstructed from a whitespace-stripped copy; compare with the file at the
# kernel path above if byte-for-byte equality of the source matters.
triton_poi_fused__to_copy_198 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor

from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[262144], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_198(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 253440
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)


def get_args():
    arg_0 = rand_strided((264, 960, 1, 1), (960, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((264, 960, 1, 1), (960, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_198.run(*args, 253440, grid=grid(253440), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_198.benchmark_all_configs(*args, 253440, grid=grid(253440))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/zi/czig3biynqg6sd4t6h2tmyqez5ybs7vrp4v72f35wteotcxxgft7.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => add_247, add_248, add_249, convert_element_type_326, mul_382, mul_383, mul_384, mul_385, mul_386, rsqrt_47, squeeze_142, var_mean_47
#
# Two-pass reduction kernel: 264 rows (x) by 6272 reduced elements (r), both
# hard-coded. Element (x, r) is read from in_ptr0 at
# (r % 49) + 49*x + 12936*(r // 49).
#   pass 1: mean = sum / 6272            -> in_out_ptr0[x]
#           0.1*mean + 0.9*in_ptr1[x]    -> out_ptr0[x]
#   pass 2: ssd = sum((v - mean)^2)      -> out_ptr1[x]  (raw sum, not divided)
#           rsqrt(ssd/6272 + 1e-05)      -> out_ptr2[x]
#           0.1*(ssd/6272)*1.0001594642002871 + 0.9*in_ptr2[x] -> out_ptr3[x]
# NOTE(review): 1.0001594642002871 appears to be 6272/6271 (an unbiased-
# variance correction) and 0.1/0.9 a running-stat momentum split -- confirm.
# NOTE(review): indentation and blank lines inside the string, including which
# statements sit inside each `for roffset` loop, were reconstructed from a
# whitespace-stripped copy using data flow (the tl.sum that consumes each
# accumulator must follow its loop); compare with the file at the kernel path
# above if byte-for-byte equality of the source matters.
triton_red_fused__native_batch_norm_legit_functional_199 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor

from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@reduction(
    size_hints=[512, 8192],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*fp32', 1: '*fp16', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: 'i32', 9: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 9), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_199(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, out_ptr0, out_ptr1, out_ptr2, out_ptr3, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
    xnumel = 264
    rnumel = 6272
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rbase = tl.arange(0, RBLOCK)[None, :]
    x0 = xindex
    _tmp2 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
    for roffset in range(0, rnumel, RBLOCK):
        rindex = roffset + rbase
        rmask = rindex < rnumel
        r1 = rindex % 49
        r2 = (rindex // 49)
        tmp0 = tl.load(in_ptr0 + (r1 + (49*x0) + (12936*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp1 = tmp0.to(tl.float32)
        _tmp2 = tl.where(rmask & xmask, _tmp2 + tmp1, _tmp2)
    tmp2 = tl.sum(_tmp2, 1)[:, None]
    tmp7 = tl.load(in_ptr1 + (x0), xmask)
    tmp3 = 6272.0
    tmp4 = tmp2 / tmp3
    tmp5 = 0.1
    tmp6 = tmp4 * tmp5
    tmp8 = 0.9
    tmp9 = tmp7 * tmp8
    tmp10 = tmp6 + tmp9
    tl.store(in_out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp4, xmask)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp10, xmask)
    _tmp15 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
    for roffset in range(0, rnumel, RBLOCK):
        rindex = roffset + rbase
        rmask = rindex < rnumel
        r1 = rindex % 49
        r2 = (rindex // 49)
        tmp11 = tl.load(in_ptr0 + (r1 + (49*x0) + (12936*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp12 = tmp11.to(tl.float32)
        tmp13 = tmp12 - tmp4
        tmp14 = tmp13 * tmp13
        _tmp15 = tl.where(rmask & xmask, _tmp15 + tmp14, _tmp15)
    tmp15 = tl.sum(_tmp15, 1)[:, None]
    tl.store(out_ptr1 + x0, tmp15, xmask)
    tmp25 = tl.load(in_ptr2 + (x0), xmask)
    tmp16 = 6272.0
    tmp17 = tmp15 / tmp16
    tmp18 = 1e-05
    tmp19 = tmp17 + tmp18
    tmp20 = tl.math.rsqrt(tmp19)
    tmp21 = 1.0001594642002871
    tmp22 = tmp17 * tmp21
    tmp23 = 0.1
    tmp24 = tmp22 * tmp23
    tmp26 = 0.9
    tmp27 = tmp25 * tmp26
    tmp28 = tmp24 + tmp27
    tl.store(out_ptr2 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp20, xmask)
    tl.store(out_ptr3 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp28, xmask)


def get_args():
    arg_0 = rand_strided((1, 264, 1, 1), (264, 1, 264, 264), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((128, 264, 7, 7), (12936, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    arg_2 = rand_strided((264,), (1,), device='cuda:0', dtype=torch.float32)
    arg_3 = rand_strided((264,), (1,), device='cuda:0', dtype=torch.float32)
    arg_4 = rand_strided((264,), (1,), device='cuda:0', dtype=torch.float32)
    arg_5 = rand_strided((1, 264, 1, 1), (264, 1, 264, 264), device='cuda:0', dtype=torch.float32)
    arg_6 = rand_strided((264,), (1,), device='cuda:0', dtype=torch.float32)
    arg_7 = rand_strided((264,), (1,), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_red_fused__native_batch_norm_legit_functional_199.run(*args, 264, 6272, grid=grid(264), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_red_fused__native_batch_norm_legit_functional_199.benchmark_all_configs(*args, 264, 6272, grid=grid(264))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/rz/crzhphlanp4xljhlyhd5oa64x25m2selaazalafwkl2z37flecta.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => add_247, add_250, convert_element_type_326, convert_element_type_327, mul_381, mul_387, rsqrt_47, sub_47, var_mean_47
#
# Pointwise kernel over a hard-coded 1655808 elements. With
# c = (i // 49) % 264 selecting the per-row parameters:
#     out_ptr0[i] = (in_ptr0[i] - in_ptr1[c]) * rsqrt(in_ptr2[c] / 6272 + 1e-05)
#                   * in_ptr3[c] + in_ptr4[c]
# in_ptr0 / out_ptr0 are declared '*fp16', the four parameter pointers '*fp32'.
# in_ptr2 is divided by 6272 here, so it is expected to hold an undivided sum
# -- TODO confirm against the producing kernel.
# NOTE(review): indentation and blank lines inside the string were
# reconstructed from a whitespace-stripped copy; compare with the file at the
# kernel path above if byte-for-byte equality of the source matters.
triton_poi_fused__native_batch_norm_legit_functional_200 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor

from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[2097152], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp16', 6: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6), equal_to_1=())]})
@triton.jit
def triton_poi_fused__native_batch_norm_legit_functional_200(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 1655808
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x3 = xindex
    x1 = (xindex // 49) % 264
    tmp0 = tl.load(in_ptr0 + (x3), xmask).to(tl.float32)
    tmp2 = tl.load(in_ptr1 + (x1), xmask)
    tmp4 = tl.load(in_ptr2 + (x1), xmask)
    tmp11 = tl.load(in_ptr3 + (x1), xmask)
    tmp13 = tl.load(in_ptr4 + (x1), xmask)
    tmp1 = tmp0.to(tl.float32)
    tmp3 = tmp1 - tmp2
    tmp5 = 6272.0
    tmp6 = tmp4 / tmp5
    tmp7 = 1e-05
    tmp8 = tmp6 + tmp7
    tmp9 = tl.math.rsqrt(tmp8)
    tmp10 = tmp3 * tmp9
    tmp12 = tmp10 * tmp11
    tmp14 = tmp12 + tmp13
    tmp15 = tmp14.to(tl.float32)
    tl.store(out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp15, xmask)


def get_args():
    arg_0 = rand_strided((128, 264, 7, 7), (12936, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((1, 264, 1, 1), (264, 1, 264, 264), device='cuda:0', dtype=torch.float32)
    arg_2 = rand_strided((1, 264, 1, 1), (264, 1, 264, 264), device='cuda:0', dtype=torch.float32)
    arg_3 = rand_strided((264,), (1,), device='cuda:0', dtype=torch.float32)
    arg_4 = rand_strided((264,), (1,), device='cuda:0', dtype=torch.float32)
    arg_5 = rand_strided((128, 264, 7, 7), (12936, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__native_batch_norm_legit_functional_200.run(*args, 1655808, grid=grid(1655808), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__native_batch_norm_legit_functional_200.benchmark_all_configs(*args, 1655808, grid=grid(1655808))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/7o/c7ocvzt6vmjqamqpyhoskmyrik2ig24uhlssffiqbirrszrdop2r.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_328
#
# Pointwise kernel over a hard-coded 418176 elements: reads in_ptr0 (declared
# '*fp32') and writes the same index of out_ptr0 (declared '*fp16'). The
# benchmark harness feeds it tensors of shape (1584, 264, 1, 1).
# NOTE(review): indentation and blank lines inside the string were
# reconstructed from a whitespace-stripped copy; compare with the file at the
# kernel path above if byte-for-byte equality of the source matters.
triton_poi_fused__to_copy_201 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor

from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[524288], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_201(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 418176
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)


def get_args():
    arg_0 = rand_strided((1584, 264, 1, 1), (264, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((1584, 264, 1, 1), (264, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_201.run(*args, 418176, grid=grid(418176), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_201.benchmark_all_configs(*args, 418176, grid=grid(418176))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/uo/cuocaty2j6iusx5tak7blhjvcj7z7suv537pcyhwgk3qfuh7te7r.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => add_252, add_253, add_254, convert_element_type_329, mul_389, mul_390, mul_391, mul_392, mul_393, rsqrt_48, squeeze_145, var_mean_48
#
# Two-pass reduction kernel: 1584 rows (x) by 6272 reduced elements (r), both
# hard-coded. Element (x, r) is read from in_ptr0 at
# (r % 49) + 49*x + 77616*(r // 49).
#   pass 1: mean = sum / 6272            -> in_out_ptr0[x]
#           0.1*mean + 0.9*in_ptr1[x]    -> out_ptr0[x]
#   pass 2: ssd = sum((v - mean)^2)      -> out_ptr1[x]  (raw sum, not divided)
#           rsqrt(ssd/6272 + 1e-05)      -> out_ptr2[x]
#           0.1*(ssd/6272)*1.0001594642002871 + 0.9*in_ptr2[x] -> out_ptr3[x]
# NOTE(review): 1.0001594642002871 appears to be 6272/6271 (an unbiased-
# variance correction) and 0.1/0.9 a running-stat momentum split -- confirm.
# NOTE(review): indentation and blank lines inside the string, including which
# statements sit inside each `for roffset` loop, were reconstructed from a
# whitespace-stripped copy using data flow (the tl.sum that consumes each
# accumulator must follow its loop); compare with the file at the kernel path
# above if byte-for-byte equality of the source matters.
triton_red_fused__native_batch_norm_legit_functional_202 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor

from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@reduction(
    size_hints=[2048, 8192],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*fp32', 1: '*fp16', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: 'i32', 9: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_202(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, out_ptr0, out_ptr1, out_ptr2, out_ptr3, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
    xnumel = 1584
    rnumel = 6272
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rbase = tl.arange(0, RBLOCK)[None, :]
    x0 = xindex
    _tmp2 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
    for roffset in range(0, rnumel, RBLOCK):
        rindex = roffset + rbase
        rmask = rindex < rnumel
        r1 = rindex % 49
        r2 = (rindex // 49)
        tmp0 = tl.load(in_ptr0 + (r1 + (49*x0) + (77616*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp1 = tmp0.to(tl.float32)
        _tmp2 = tl.where(rmask & xmask, _tmp2 + tmp1, _tmp2)
    tmp2 = tl.sum(_tmp2, 1)[:, None]
    tmp7 = tl.load(in_ptr1 + (x0), xmask)
    tmp3 = 6272.0
    tmp4 = tmp2 / tmp3
    tmp5 = 0.1
    tmp6 = tmp4 * tmp5
    tmp8 = 0.9
    tmp9 = tmp7 * tmp8
    tmp10 = tmp6 + tmp9
    tl.store(in_out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp4, xmask)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp10, xmask)
    _tmp15 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
    for roffset in range(0, rnumel, RBLOCK):
        rindex = roffset + rbase
        rmask = rindex < rnumel
        r1 = rindex % 49
        r2 = (rindex // 49)
        tmp11 = tl.load(in_ptr0 + (r1 + (49*x0) + (77616*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp12 = tmp11.to(tl.float32)
        tmp13 = tmp12 - tmp4
        tmp14 = tmp13 * tmp13
        _tmp15 = tl.where(rmask & xmask, _tmp15 + tmp14, _tmp15)
    tmp15 = tl.sum(_tmp15, 1)[:, None]
    tl.store(out_ptr1 + x0, tmp15, xmask)
    tmp25 = tl.load(in_ptr2 + (x0), xmask)
    tmp16 = 6272.0
    tmp17 = tmp15 / tmp16
    tmp18 = 1e-05
    tmp19 = tmp17 + tmp18
    tmp20 = tl.math.rsqrt(tmp19)
    tmp21 = 1.0001594642002871
    tmp22 = tmp17 * tmp21
    tmp23 = 0.1
    tmp24 = tmp22 * tmp23
    tmp26 = 0.9
    tmp27 = tmp25 * tmp26
    tmp28 = tmp24 + tmp27
    tl.store(out_ptr2 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp20, xmask)
    tl.store(out_ptr3 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp28, xmask)


def get_args():
    arg_0 = rand_strided((1, 1584, 1, 1), (1584, 1, 1584, 1584), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((128, 1584, 7, 7), (77616, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    arg_2 = rand_strided((1584,), (1,), device='cuda:0', dtype=torch.float32)
    arg_3 = rand_strided((1584,), (1,), device='cuda:0', dtype=torch.float32)
    arg_4 = rand_strided((1584,), (1,), device='cuda:0', dtype=torch.float32)
    arg_5 = rand_strided((1, 1584, 1, 1), (1584, 1, 1584, 1584), device='cuda:0', dtype=torch.float32)
    arg_6 = rand_strided((1584,), (1,), device='cuda:0', dtype=torch.float32)
    arg_7 = rand_strided((1584,), (1,), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_red_fused__native_batch_norm_legit_functional_202.run(*args, 1584, 6272, grid=grid(1584), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_red_fused__native_batch_norm_legit_functional_202.benchmark_all_configs(*args, 1584, 6272, grid=grid(1584))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/d2/cd2goj7lmhtwh5scpm6a2hg5fjalovx2pxu4k6raurql7xnwmae5.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.add, aten.clone, aten.fill, aten.mul, aten.sigmoid, aten.sub
# aten._native_batch_norm_legit_functional => add_252, add_255, convert_element_type_329, convert_element_type_330, mul_388, mul_394, rsqrt_48, sub_48, var_mean_48
# aten.add => add_317
# aten.clone => clone_39
# aten.fill => full_like_8
# aten.mul => mul_587, mul_588
# aten.sigmoid => sigmoid_72
# aten.sub => sub_105
#
# Pointwise kernel over a hard-coded 9934848 elements with two outputs. With
# c = (i // 49) % 1584 selecting the per-row parameters:
#     y = (in_ptr0[i] - in_ptr1[c]) * rsqrt(in_ptr2[c] / 6272 + 1e-05)
#         * in_ptr3[c] + in_ptr4[c]
#     s = sigmoid(y)
#     out_ptr0[i] = y
#     out_ptr1[i] = s * (y * (1 - s) + 1)
# NOTE(review): out_ptr1 has the form of d/dy [y * sigmoid(y)], presumably
# precomputed for a backward pass -- confirm against the caller.
# xmask is computed but every load/store passes None as its mask --
# presumably because xnumel is a multiple of every candidate XBLOCK; confirm.
# NOTE(review): indentation and blank lines inside the string were
# reconstructed from a whitespace-stripped copy; compare with the file at the
# kernel path above if byte-for-byte equality of the source matters.
triton_poi_fused__native_batch_norm_legit_functional_add_clone_fill_mul_sigmoid_sub_203 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor

from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[16777216], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp16', 6: '*fp16', 7: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7), equal_to_1=())]})
@triton.jit
def triton_poi_fused__native_batch_norm_legit_functional_add_clone_fill_mul_sigmoid_sub_203(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, out_ptr0, out_ptr1, xnumel, XBLOCK : tl.constexpr):
    xnumel = 9934848
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x3 = xindex
    x1 = (xindex // 49) % 1584
    tmp0 = tl.load(in_ptr0 + (x3), None).to(tl.float32)
    tmp2 = tl.load(in_ptr1 + (x1), None)
    tmp4 = tl.load(in_ptr2 + (x1), None)
    tmp11 = tl.load(in_ptr3 + (x1), None)
    tmp13 = tl.load(in_ptr4 + (x1), None)
    tmp1 = tmp0.to(tl.float32)
    tmp3 = tmp1 - tmp2
    tmp5 = 6272.0
    tmp6 = tmp4 / tmp5
    tmp7 = 1e-05
    tmp8 = tmp6 + tmp7
    tmp9 = tl.math.rsqrt(tmp8)
    tmp10 = tmp3 * tmp9
    tmp12 = tmp10 * tmp11
    tmp14 = tmp12 + tmp13
    tmp15 = tmp14.to(tl.float32)
    tmp16 = tl.sigmoid(tmp15)
    tmp17 = 1.0
    tmp18 = tmp17 - tmp16
    tmp19 = tmp15 * tmp18
    tmp20 = tmp19 + tmp17
    tmp21 = tmp16 * tmp20
    tl.store(out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp15, None)
    tl.store(out_ptr1 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp21, None)


def get_args():
    arg_0 = rand_strided((128, 1584, 7, 7), (77616, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((1, 1584, 1, 1), (1584, 1, 1584, 1584), device='cuda:0', dtype=torch.float32)
    arg_2 = rand_strided((1, 1584, 1, 1), (1584, 1, 1584, 1584), device='cuda:0', dtype=torch.float32)
    arg_3 = rand_strided((1584,), (1,), device='cuda:0', dtype=torch.float32)
    arg_4 = rand_strided((1584,), (1,), device='cuda:0', dtype=torch.float32)
    arg_5 = rand_strided((128, 1584, 7, 7), (77616, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    arg_6 = rand_strided((128, 1584, 7, 7), (77616, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__native_batch_norm_legit_functional_add_clone_fill_mul_sigmoid_sub_203.run(*args, 9934848, grid=grid(9934848), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__native_batch_norm_legit_functional_add_clone_fill_mul_sigmoid_sub_203.benchmark_all_configs(*args, 9934848, grid=grid(9934848))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/4x/c4xvll4q3rptmkfezospulueatrj4cvvts5fa6rf327imnsbrund.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_333
#
# Pointwise kernel over a hard-coded 3564 elements: reads in_ptr0 (declared
# '*fp32') and writes the same index of out_ptr0 (declared '*fp16'). The
# benchmark harness feeds it tensors of shape (396, 1, 3, 3).
# Unlike the sibling copy kernels, divisible_by_16 lists only the two pointer
# arguments (0, 1): 3564 is not a multiple of 16, so xnumel (argument 2) is
# left out.
# NOTE(review): indentation and blank lines inside the string were
# reconstructed from a whitespace-stripped copy; compare with the file at the
# kernel path above if byte-for-byte equality of the source matters.
triton_poi_fused__to_copy_204 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor

from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[4096], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_204(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 3564
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)


def get_args():
    arg_0 = rand_strided((396, 1, 3, 3), (9, 9, 3, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((396, 1, 3, 3), (9, 9, 3, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_204.run(*args, 3564, grid=grid(3564), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_204.benchmark_all_configs(*args, 3564, grid=grid(3564))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/oo/coohdf37vqteghs4tdccuqlheosk6dnovjd7xhliv6jnuxrt5q6k.py
# Original ATen: aten.split_with_sizes
# aten.split_with_sizes => getitem_356
#
# Pointwise kernel over 2483712 elements (128 * 19404). The flat index is split
# into x0 = index % 19404 and x1 = index // 19404; the input is read at
# x0 + 77616*x1 (a 19404-element slice starting at offset 0 of each
# 77616-element row) and the result is written densely at the flat index.
# Although the kernel is named after split_with_sizes, the body also computes
# v * sigmoid(v) on the loaded value before storing it.
# NOTE(review): per get_args the input is (128, 1584, 7, 7) and the output
# (128, 396, 7, 7), so this looks like the first 396-channel chunk - confirm.
# The embedded module also carries a benchmark harness: get_args / call /
# benchmark_all_configs, and a __main__ section that times call() with
# do_bench and prints milliseconds, GB and GB/s.
triton_poi_fused_split_with_sizes_205 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_split_with_sizes_205(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 2483712
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex % 19404
    x1 = (xindex // 19404)
    x2 = xindex
    tmp0 = tl.load(in_ptr0 + (x0 + (77616*x1)), xmask).to(tl.float32)
    tmp1 = tmp0.to(tl.float32)
    tmp2 = tl.sigmoid(tmp1)
    tmp3 = tmp1 * tmp2
    tmp4 = tmp3.to(tl.float32)
    tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp4, xmask)


def get_args():
    arg_0 = rand_strided((128, 1584, 7, 7), (77616, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 396, 7, 7), (19404, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused_split_with_sizes_205.run(*args, 2483712, grid=grid(2483712), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused_split_with_sizes_205.benchmark_all_configs(*args, 2483712, grid=grid(2483712))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/q2/cq2qbiau5hauewftu26n75rmwbfaqfklvxrkizh35mvxm2pcsrku.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_334
#
# Pointwise kernel over 9900 elements (396*1*5*5, the tensors get_args builds).
# Each element is loaded from in_ptr0 (declared '*fp32' in the signature) and
# stored at the same flat offset of out_ptr0 (declared '*fp16').
# NOTE(review): the `.to(tl.float32)` is a no-op on an fp32 load; the narrowing
# to fp16 presumably happens implicitly inside tl.store via the pointer's
# element type - confirm against the Triton tl.store documentation.
# The embedded module also carries a benchmark harness: get_args / call /
# benchmark_all_configs, and a __main__ section that times call() with
# do_bench and prints milliseconds, GB and GB/s.
triton_poi_fused__to_copy_206 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[16384], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_206(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 9900
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)


def get_args():
    arg_0 = rand_strided((396, 1, 5, 5), (25, 25, 5, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((396, 1, 5, 5), (25, 25, 5, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_206.run(*args, 9900, grid=grid(9900), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_206.benchmark_all_configs(*args, 9900, grid=grid(9900))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/zt/cztezqwtd34ik4tsx6erhha7cnfeljbbbwidrvnml2yi26gx6sbr.py
# Original ATen: aten.split_with_sizes
# aten.split_with_sizes => getitem_361
#
# Pointwise kernel over 2483712 elements (128 * 19404). The flat index is split
# into x0 = index % 19404 and x1 = index // 19404; the input is read at
# 19404 + x0 + 77616*x1 (a 19404-element slice starting at offset 19404 of each
# 77616-element row) and the result is written densely at the flat index.
# Although the kernel is named after split_with_sizes, the body also computes
# v * sigmoid(v) on the loaded value before storing it.
# NOTE(review): per get_args the input is (128, 1584, 7, 7) and the output
# (128, 396, 7, 7), so this looks like the second 396-channel chunk - confirm.
# The embedded module also carries a benchmark harness: get_args / call /
# benchmark_all_configs, and a __main__ section that times call() with
# do_bench and prints milliseconds, GB and GB/s.
triton_poi_fused_split_with_sizes_207 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_split_with_sizes_207(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 2483712
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex % 19404
    x1 = (xindex // 19404)
    x2 = xindex
    tmp0 = tl.load(in_ptr0 + (19404 + x0 + (77616*x1)), xmask).to(tl.float32)
    tmp1 = tmp0.to(tl.float32)
    tmp2 = tl.sigmoid(tmp1)
    tmp3 = tmp1 * tmp2
    tmp4 = tmp3.to(tl.float32)
    tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp4, xmask)


def get_args():
    arg_0 = rand_strided((128, 1584, 7, 7), (77616, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 396, 7, 7), (19404, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused_split_with_sizes_207.run(*args, 2483712, grid=grid(2483712), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused_split_with_sizes_207.benchmark_all_configs(*args, 2483712, grid=grid(2483712))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/a5/ca5stuyejdo4icolodntofflni5tgdpo5ewbossduq7phpjybihc.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_335
#
# Pointwise kernel over 19404 elements (396*1*7*7, the tensors get_args
# builds). Each element is loaded from in_ptr0 (declared '*fp32' in the
# signature) and stored at the same flat offset of out_ptr0 (declared '*fp16').
# NOTE(review): the `.to(tl.float32)` is a no-op on an fp32 load; the narrowing
# to fp16 presumably happens implicitly inside tl.store via the pointer's
# element type - confirm against the Triton tl.store documentation.
# The embedded module also carries a benchmark harness: get_args / call /
# benchmark_all_configs, and a __main__ section that times call() with
# do_bench and prints milliseconds, GB and GB/s.
triton_poi_fused__to_copy_208 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[32768], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_208(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 19404
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)


def get_args():
    arg_0 = rand_strided((396, 1, 7, 7), (49, 49, 7, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((396, 1, 7, 7), (49, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_208.run(*args, 19404, grid=grid(19404), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_208.benchmark_all_configs(*args, 19404, grid=grid(19404))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/rz/crzmv3373ssmmao7rhii34dz6qyf2kesyjmask7sgbcayzblrpb7.py
# Original ATen: aten.split_with_sizes
# aten.split_with_sizes => getitem_366
#
# Pointwise kernel over 2483712 elements (128 * 19404). The flat index is split
# into x0 = index % 19404 and x1 = index // 19404; the input is read at
# 38808 + x0 + 77616*x1 (a 19404-element slice starting at offset 38808 of each
# 77616-element row) and the result is written densely at the flat index.
# Although the kernel is named after split_with_sizes, the body also computes
# v * sigmoid(v) on the loaded value before storing it.
# NOTE(review): per get_args the input is (128, 1584, 7, 7) and the output
# (128, 396, 7, 7), so this looks like the third 396-channel chunk - confirm.
# The embedded module also carries a benchmark harness: get_args / call /
# benchmark_all_configs, and a __main__ section that times call() with
# do_bench and prints milliseconds, GB and GB/s.
triton_poi_fused_split_with_sizes_209 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_split_with_sizes_209(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 2483712
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex % 19404
    x1 = (xindex // 19404)
    x2 = xindex
    tmp0 = tl.load(in_ptr0 + (38808 + x0 + (77616*x1)), xmask).to(tl.float32)
    tmp1 = tmp0.to(tl.float32)
    tmp2 = tl.sigmoid(tmp1)
    tmp3 = tmp1 * tmp2
    tmp4 = tmp3.to(tl.float32)
    tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp4, xmask)


def get_args():
    arg_0 = rand_strided((128, 1584, 7, 7), (77616, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 396, 7, 7), (19404, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused_split_with_sizes_209.run(*args, 2483712, grid=grid(2483712), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused_split_with_sizes_209.benchmark_all_configs(*args, 2483712, grid=grid(2483712))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/ua/cuahjxhhqgvs2pfawkys4ycn4cv26634v6swah3d6itp3ay5rix6.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_336
#
# Pointwise kernel over 32076 elements (396*1*9*9, the tensors get_args
# builds). Each element is loaded from in_ptr0 (declared '*fp32' in the
# signature) and stored at the same flat offset of out_ptr0 (declared '*fp16').
# NOTE(review): the `.to(tl.float32)` is a no-op on an fp32 load; the narrowing
# to fp16 presumably happens implicitly inside tl.store via the pointer's
# element type - confirm against the Triton tl.store documentation.
# The embedded module also carries a benchmark harness: get_args / call /
# benchmark_all_configs, and a __main__ section that times call() with
# do_bench and prints milliseconds, GB and GB/s.
triton_poi_fused__to_copy_210 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[32768], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_210(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 32076
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)


def get_args():
    arg_0 = rand_strided((396, 1, 9, 9), (81, 81, 9, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((396, 1, 9, 9), (81, 81, 9, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_210.run(*args, 32076, grid=grid(32076), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_210.benchmark_all_configs(*args, 32076, grid=grid(32076))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/v7/cv7bjv2cw2xs5mbq2x7bddhtciqwe66f4efi6r3hekyvffle4el4.py
# Original ATen: aten.split_with_sizes
# aten.split_with_sizes => getitem_371
#
# Pointwise kernel over 2483712 elements (128 * 19404). The flat index is split
# into x0 = index % 19404 and x1 = index // 19404; the input is read at
# 58212 + x0 + 77616*x1 (a 19404-element slice starting at offset 58212 of each
# 77616-element row) and the result is written densely at the flat index.
# Although the kernel is named after split_with_sizes, the body also computes
# v * sigmoid(v) on the loaded value before storing it.
# NOTE(review): per get_args the input is (128, 1584, 7, 7) and the output
# (128, 396, 7, 7), so this looks like the fourth 396-channel chunk - confirm.
# The embedded module also carries a benchmark harness: get_args / call /
# benchmark_all_configs, and a __main__ section that times call() with
# do_bench and prints milliseconds, GB and GB/s.
triton_poi_fused_split_with_sizes_211 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_split_with_sizes_211(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 2483712
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex % 19404
    x1 = (xindex // 19404)
    x2 = xindex
    tmp0 = tl.load(in_ptr0 + (58212 + x0 + (77616*x1)), xmask).to(tl.float32)
    tmp1 = tmp0.to(tl.float32)
    tmp2 = tl.sigmoid(tmp1)
    tmp3 = tmp1 * tmp2
    tmp4 = tmp3.to(tl.float32)
    tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp4, xmask)


def get_args():
    arg_0 = rand_strided((128, 1584, 7, 7), (77616, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 396, 7, 7), (19404, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused_split_with_sizes_211.run(*args, 2483712, grid=grid(2483712), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused_split_with_sizes_211.benchmark_all_configs(*args, 2483712, grid=grid(2483712))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/n2/cn2whdfczqvp7tj52zmht7aewohn4yzq7pmnqwutp7hyu2nidelg.py
# Original ATen: aten.cat
# aten.cat => cat_35
#
# Pointwise kernel over 2483712 elements (128 * 19404). Each element is read
# densely from in_ptr0 at the flat index and written to out_ptr0 at
# x0 + 77616*x1, where x0 = index % 19404 and x1 = index // 19404, i.e. each
# 19404-element input row lands at the start of a 77616-element output row.
# NOTE(review): per get_args the output is a (128, 396, 7, 7) view with batch
# stride 77616, presumably one slot of the concatenated buffer - confirm
# which slot against the caller.
# The embedded module also carries a benchmark harness: get_args / call /
# benchmark_all_configs, and a __main__ section that times call() with
# do_bench and prints milliseconds, GB and GB/s.
triton_poi_fused_cat_212 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_cat_212(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 2483712
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x2 = xindex
    x0 = xindex % 19404
    x1 = (xindex // 19404)
    tmp0 = tl.load(in_ptr0 + (x2), xmask).to(tl.float32)
    tl.store(out_ptr0 + (x0 + (77616*x1) + tl.zeros([XBLOCK], tl.int32)), tmp0, xmask)


def get_args():
    arg_0 = rand_strided((128, 396, 7, 7), (19404, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 396, 7, 7), (77616, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused_cat_212.run(*args, 2483712, grid=grid(2483712), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused_cat_212.benchmark_all_configs(*args, 2483712, grid=grid(2483712))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/rs/crsyf734jiflnmzza27lk4h5tk3zflzcv6g64yl5p4sitpmqg5lb.py
# Original ATen: aten.cat
# aten.cat => cat_35
#
# Pointwise kernel over 2483712 elements (128 * 19404). Each element is read
# densely from in_ptr0 at the flat index and written to out_ptr0 at
# x0 + 77616*x1, where x0 = index % 19404 and x1 = index // 19404, i.e. each
# 19404-element input row lands at the start of a 77616-element output row.
# The instance descriptor lists only arguments 0 and 2 in divisible_by_16, so
# out_ptr0 (argument 1) is not declared divisible by 16 here.
# NOTE(review): presumably out_ptr0 points at an interior, not 16-byte-aligned
# slot of the concatenated buffer - confirm against the caller.
# The embedded module also carries a benchmark harness: get_args / call /
# benchmark_all_configs, and a __main__ section that times call() with
# do_bench and prints milliseconds, GB and GB/s.
triton_poi_fused_cat_213 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_cat_213(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 2483712
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x2 = xindex
    x0 = xindex % 19404
    x1 = (xindex // 19404)
    tmp0 = tl.load(in_ptr0 + (x2), xmask).to(tl.float32)
    tl.store(out_ptr0 + (x0 + (77616*x1) + tl.zeros([XBLOCK], tl.int32)), tmp0, xmask)


def get_args():
    arg_0 = rand_strided((128, 396, 7, 7), (19404, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 396, 7, 7), (77616, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused_cat_213.run(*args, 2483712, grid=grid(2483712), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused_cat_213.benchmark_all_configs(*args, 2483712, grid=grid(2483712))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/5w/c5w3sqhxbgvomykhq2gsbk2tldu7xewurtz7sp2eii33b3skhwth.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.mean, aten.silu
# aten._native_batch_norm_legit_functional => add_257, add_260, convert_element_type_337, convert_element_type_338, mul_396, mul_402, rsqrt_49, sub_49, var_mean_49
# aten.mean => mean_13
# aten.silu => convert_element_type_339, convert_element_type_340, mul_403, sigmoid_53
#
# Persistent-reduction kernel: 202752 rows (xnumel) of 49 elements (rnumel),
# processed in RBLOCK = 64 lanes with lanes >= 49 masked off by rmask.
# For each element v of in_ptr0, with c = row % 1584 indexing in_ptr1..in_ptr4:
#   y = (v - in_ptr1[c]) * rsqrt(in_ptr2[c] / 6272.0 + 1e-05) * in_ptr3[c] + in_ptr4[c]
# y is stored to out_ptr0 at the element's own offset. Then y * sigmoid(y) is
# summed over the 49 valid lanes of the row, divided by 49.0, and stored to
# out_ptr2 at the row index.
# NOTE(review): in_ptr1/in_ptr2 presumably hold the per-channel mean and the
# sum of squared deviations (6272 = 128 * 49), in_ptr3/in_ptr4 the affine
# weight and bias - confirm against the caller.
# NOTE(review): xmask is computed but never applied; the row-wise accesses use
# rmask or None. That is only safe if XBLOCK divides 202752 - confirm that the
# persistent_reduction heuristics guarantee this.
# The embedded module also carries a benchmark harness: get_args / call /
# benchmark_all_configs, and a __main__ section that times call() with
# do_bench and prints milliseconds, GB and GB/s.
triton_per_fused__native_batch_norm_legit_functional_mean_silu_214 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import persistent_reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@persistent_reduction(
    size_hints=[262144, 64],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*fp16', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp16', 6: '*fp16', 7: 'i32', 8: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7), equal_to_1=())]}
)
@triton.jit
def triton_per_fused__native_batch_norm_legit_functional_mean_silu_214(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, out_ptr0, out_ptr2, xnumel, rnumel, XBLOCK : tl.constexpr):
    xnumel = 202752
    rnumel = 49
    RBLOCK: tl.constexpr = 64
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rindex = tl.arange(0, RBLOCK)[None, :]
    rmask = rindex < rnumel
    r2 = rindex
    x3 = xindex
    x0 = xindex % 1584
    tmp0 = tl.load(in_ptr0 + (r2 + (49*x3)), rmask, other=0).to(tl.float32)
    tmp2 = tl.load(in_ptr1 + (x0), None)
    tmp4 = tl.load(in_ptr2 + (x0), None)
    tmp11 = tl.load(in_ptr3 + (x0), None)
    tmp13 = tl.load(in_ptr4 + (x0), None)
    tmp1 = tmp0.to(tl.float32)
    tmp3 = tmp1 - tmp2
    tmp5 = 6272.0
    tmp6 = tmp4 / tmp5
    tmp7 = 1e-05
    tmp8 = tmp6 + tmp7
    tmp9 = tl.math.rsqrt(tmp8)
    tmp10 = tmp3 * tmp9
    tmp12 = tmp10 * tmp11
    tmp14 = tmp12 + tmp13
    tmp15 = tmp14.to(tl.float32)
    tmp16 = tmp15.to(tl.float32)
    tmp17 = tl.sigmoid(tmp16)
    tmp18 = tmp16 * tmp17
    tmp19 = tmp18.to(tl.float32)
    tmp20 = tmp19.to(tl.float32)
    tmp22 = tl.where(rmask, tmp20, 0)
    tmp23 = tl.sum(tmp22, 1)[:, None]
    tmp24 = 49.0
    tmp25 = tmp23 / tmp24
    tmp26 = tmp25.to(tl.float32)
    tl.store(out_ptr0 + (r2 + (49*x3) + tl.zeros([XBLOCK, RBLOCK], tl.int32)), tmp15, rmask)
    tl.store(out_ptr2 + (x3 + tl.zeros([XBLOCK, 1], tl.int32)), tmp26, None)


def get_args():
    arg_0 = rand_strided((128, 1584, 7, 7), (77616, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((1, 1584, 1, 1), (1584, 1, 1584, 1584), device='cuda:0', dtype=torch.float32)
    arg_2 = rand_strided((1, 1584, 1, 1), (1584, 1, 1584, 1584), device='cuda:0', dtype=torch.float32)
    arg_3 = rand_strided((1584,), (1,), device='cuda:0', dtype=torch.float32)
    arg_4 = rand_strided((1584,), (1,), device='cuda:0', dtype=torch.float32)
    arg_5 = rand_strided((128, 1584, 7, 7), (77616, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    arg_6 = rand_strided((128, 1584, 1, 1), (1584, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_per_fused__native_batch_norm_legit_functional_mean_silu_214.run(*args, 202752, 49, grid=grid(202752), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_per_fused__native_batch_norm_legit_functional_mean_silu_214.benchmark_all_configs(*args, 202752, 49, grid=grid(202752))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/73/c73f3yeyp4owv2uj77f6pdjw5heox2q6yo4kfkrqenvhw4yqhomk.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_342
#
# Pointwise kernel over 209088 elements (132*1584*1*1, the tensors get_args
# builds). Each element is loaded from in_ptr0 (declared '*fp32' in the
# signature) and stored at the same flat offset of out_ptr0 (declared '*fp16').
# NOTE(review): the `.to(tl.float32)` is a no-op on an fp32 load; the narrowing
# to fp16 presumably happens implicitly inside tl.store via the pointer's
# element type - confirm against the Triton tl.store documentation.
# The embedded module also carries a benchmark harness: get_args / call /
# benchmark_all_configs, and a __main__ section that times call() with
# do_bench and prints milliseconds, GB and GB/s.
triton_poi_fused__to_copy_215 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[262144], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_215(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 209088
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)


def get_args():
    arg_0 = rand_strided((132, 1584, 1, 1), (1584, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((132, 1584, 1, 1), (1584, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_215.run(*args, 209088, grid=grid(209088), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_215.benchmark_all_configs(*args, 209088, grid=grid(209088))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/ca/ccagnt2eck6mlzrforr72k455qyt2o7bmlcxrdko6fbwlxsyixff.py
# Original ATen: aten._to_copy, aten.convolution
# aten._to_copy => convert_element_type_341
# aten.convolution => convolution_132
#
# Pointwise kernel over 132 elements (the 1-D tensors get_args builds). Each
# element is loaded from in_ptr0 (declared '*fp32' in the signature) and stored
# at the same flat offset of out_ptr0 (declared '*fp16'). No convolution is
# computed in this kernel body despite the name.
# NOTE(review): the `.to(tl.float32)` is a no-op on an fp32 load; the narrowing
# to fp16 presumably happens implicitly inside tl.store via the pointer's
# element type - confirm against the Triton tl.store documentation.
# The embedded module also carries a benchmark harness: get_args / call /
# benchmark_all_configs, and a __main__ section that times call() with
# do_bench and prints milliseconds, GB and GB/s.
triton_poi_fused__to_copy_convolution_216 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[256], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_convolution_216(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 132
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)


def get_args():
    arg_0 = rand_strided((132,), (1,), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((132,), (1,), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_convolution_216.run(*args, 132, grid=grid(132), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_convolution_216.benchmark_all_configs(*args, 132, grid=grid(132))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/es/ces42l67kcrpgvq2z54f4gf5my7z5wnxr2mun5sm744xvtl7qmr3.py
# Original ATen: aten._to_copy, aten.convolution, aten.silu
# aten._to_copy => convert_element_type_341
# aten.convolution => convolution_132
# aten.silu => convert_element_type_343, convert_element_type_344, mul_404, sigmoid_54
#
# Pointwise kernel over 16896 (= 128 * 132) elements that
#   1. adds in_ptr0[x % 132] to in_out_ptr0[x] and writes the sum back in place,
#   2. writes sum * sigmoid(sum) to out_ptr0[x].
#   in_out_ptr0 : fp16 buffer, read and then overwritten (listed in
#                 'mutated_arg_names').
#   in_ptr0     : fp16 vector indexed modulo 132 -- presumably the convolution
#                 bias, one value per channel; confirm against the caller.
#   out_ptr0    : fp16 output receiving the SiLU result.
#   xnumel      : overwritten with the constant 16896 inside the kernel.
# Loaded values are widened to fp32 for the arithmetic.
# The embedded module also carries a stand-alone benchmark harness
# (get_args / call / benchmark_all_configs / __main__) that targets CUDA device 0.
# The line formatting below was restored from a whitespace-mangled copy, so the
# text may not be byte-identical to the file named in "kernel path".
triton_poi_fused__to_copy_convolution_silu_217 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[32768], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_convolution_silu_217(in_out_ptr0, in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 16896
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x2 = xindex
    x0 = xindex % 132
    tmp0 = tl.load(in_out_ptr0 + (x2), xmask).to(tl.float32)
    tmp1 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
    tmp2 = tmp0 + tmp1
    tmp3 = tmp2.to(tl.float32)
    tmp4 = tl.sigmoid(tmp3)
    tmp5 = tmp3 * tmp4
    tmp6 = tmp5.to(tl.float32)
    tl.store(in_out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp2, xmask)
    tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp6, xmask)


def get_args():
    arg_0 = rand_strided((128, 132, 1, 1), (132, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((132,), (1,), device='cuda:0', dtype=torch.float16)
    arg_2 = rand_strided((128, 132, 1, 1), (132, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1, arg_2,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_convolution_silu_217.run(*args, 16896, grid=grid(16896), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_convolution_silu_217.benchmark_all_configs(*args, 16896, grid=grid(16896))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/xv/cxvmi6ffjgu4jnqxjbleamncx3jw3a5x6bteybr2qp7fyy4to7ad.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_346
#
# Pointwise element-for-element copy of 209088 (= 1584 * 132) values from
# in_ptr0 (declared '*fp32') to out_ptr0 (declared '*fp16').
#   in_ptr0  : source buffer, read at flat offsets [0, 209088).
#   out_ptr0 : destination buffer, written at the same flat offsets.
#   xnumel   : overwritten with the constant 209088 inside the kernel.
#   XBLOCK   : compile-time block width; out-of-range lanes are masked.
# NOTE(review): the narrowing to fp16 is presumably performed by tl.store's
# implicit cast to the pointee type -- confirm.
# The embedded module also carries a stand-alone benchmark harness
# (get_args / call / benchmark_all_configs / __main__) that targets CUDA device 0.
# The line formatting below was restored from a whitespace-mangled copy, so the
# text may not be byte-identical to the file named in "kernel path".
triton_poi_fused__to_copy_218 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[262144], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_218(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 209088
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)


def get_args():
    arg_0 = rand_strided((1584, 132, 1, 1), (132, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((1584, 132, 1, 1), (132, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_218.run(*args, 209088, grid=grid(209088), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_218.benchmark_all_configs(*args, 209088, grid=grid(209088))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/xn/cxnwubuxft4fv4egkcmiwmgz7afk4w2atiiqg6oyapkchhvtktlp.py
# Original ATen: aten._to_copy, aten.convolution
# aten._to_copy => convert_element_type_345
# aten.convolution => convolution_133
#
# Pointwise element-for-element copy of 1584 values from in_ptr0 (declared
# '*fp32') to out_ptr0 (declared '*fp16').
#   in_ptr0  : source buffer, read at flat offsets [0, 1584).
#   out_ptr0 : destination buffer, written at the same flat offsets.
#   xnumel   : overwritten with the constant 1584 inside the kernel.
#   XBLOCK   : compile-time block width; out-of-range lanes are masked.
# NOTE(review): the narrowing to fp16 is presumably performed by tl.store's
# implicit cast to the pointee type -- confirm.
# The embedded module also carries a stand-alone benchmark harness
# (get_args / call / benchmark_all_configs / __main__) that targets CUDA device 0.
# The line formatting below was restored from a whitespace-mangled copy, so the
# text may not be byte-identical to the file named in "kernel path".
triton_poi_fused__to_copy_convolution_219 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[2048], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_convolution_219(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 1584
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)


def get_args():
    arg_0 = rand_strided((1584,), (1,), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((1584,), (1,), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_convolution_219.run(*args, 1584, grid=grid(1584), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_convolution_219.benchmark_all_configs(*args, 1584, grid=grid(1584))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/zi/cziku3wkydhqpprjhx3spm364d6fncgicrpenyhycjc6uhdvng7k.py
# Original ATen: aten._to_copy, aten.convolution
# aten._to_copy => convert_element_type_345
# aten.convolution => convolution_133
#
# In-place pointwise add over 202752 (= 128 * 1584) elements:
#   in_out_ptr0[x] += in_ptr0[x % 1584]
#   in_out_ptr0 : fp16 buffer, read and overwritten (listed in
#                 'mutated_arg_names').
#   in_ptr0     : fp16 vector indexed modulo 1584 -- presumably the convolution
#                 bias, one value per channel; confirm against the caller.
#   xnumel      : overwritten with the constant 202752 inside the kernel.
# All loads and the store pass mask=None, so `xmask` is computed but unused.
# NOTE(review): presumably safe because 202752 is a multiple of every XBLOCK
# the autotuner may pick -- confirm.
# The embedded module also carries a stand-alone benchmark harness
# (get_args / call / benchmark_all_configs / __main__) that targets CUDA device 0.
# The line formatting below was restored from a whitespace-mangled copy, so the
# text may not be byte-identical to the file named in "kernel path".
triton_poi_fused__to_copy_convolution_220 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[262144], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_convolution_220(in_out_ptr0, in_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 202752
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x2 = xindex
    x0 = xindex % 1584
    tmp0 = tl.load(in_out_ptr0 + (x2), None).to(tl.float32)
    tmp1 = tl.load(in_ptr0 + (x0), None).to(tl.float32)
    tmp2 = tmp0 + tmp1
    tl.store(in_out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp2, None)


def get_args():
    arg_0 = rand_strided((128, 1584, 1, 1), (1584, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((1584,), (1,), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_convolution_220.run(*args, 202752, grid=grid(202752), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_convolution_220.benchmark_all_configs(*args, 202752, grid=grid(202752))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/qo/cqo4la4ss4kx6zb5vv53wk2l4cvgi273m7sogbmwaw3v62ktkznm.py
# Original ATen: aten.mul, aten.sigmoid, aten.silu
# aten.mul => mul_405
# aten.sigmoid => sigmoid_55
# aten.silu => convert_element_type_339, convert_element_type_340, mul_403, sigmoid_53
#
# Pointwise kernel over 9934848 (= 128 * 1584 * 49) elements:
#   out_ptr0[x] = (in_ptr0[x] * sigmoid(in_ptr0[x])) * sigmoid(in_ptr1[x // 49])
#   in_ptr0  : fp16 input read at every flat offset.
#   in_ptr1  : fp16 input read once per group of 49 consecutive elements
#              (the harness shapes it (128, 1584, 1, 1) against a
#              (128, 1584, 7, 7) in_ptr0).
#   out_ptr0 : fp16 output, same flat layout as in_ptr0.
#   xnumel   : overwritten with the constant 9934848 inside the kernel.
# All loads and the store pass mask=None, so `xmask` is computed but unused.
# NOTE(review): presumably safe because 9934848 is a multiple of every XBLOCK
# the autotuner may pick -- confirm.
# The embedded module also carries a stand-alone benchmark harness
# (get_args / call / benchmark_all_configs / __main__) that targets CUDA device 0.
# The line formatting below was restored from a whitespace-mangled copy, so the
# text may not be byte-identical to the file named in "kernel path".
triton_poi_fused_mul_sigmoid_silu_221 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[16777216], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: '*fp16', 3: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3), equal_to_1=())]})
@triton.jit
def triton_poi_fused_mul_sigmoid_silu_221(in_ptr0, in_ptr1, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 9934848
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x2 = xindex
    x1 = (xindex // 49)
    tmp0 = tl.load(in_ptr0 + (x2), None).to(tl.float32)
    tmp5 = tl.load(in_ptr1 + (x1), None).to(tl.float32)
    tmp1 = tmp0.to(tl.float32)
    tmp2 = tl.sigmoid(tmp1)
    tmp3 = tmp1 * tmp2
    tmp4 = tmp3.to(tl.float32)
    tmp6 = tl.sigmoid(tmp5)
    tmp7 = tmp4 * tmp6
    tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp7, None)


def get_args():
    arg_0 = rand_strided((128, 1584, 7, 7), (77616, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 1584, 1, 1), (1584, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    arg_2 = rand_strided((128, 1584, 7, 7), (77616, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1, arg_2,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused_mul_sigmoid_silu_221.run(*args, 9934848, grid=grid(9934848), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused_mul_sigmoid_silu_221.benchmark_all_configs(*args, 9934848, grid=grid(9934848))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/g3/cg37tw75k3qhu23v54dt4szux7ntig2ituyeprsnltkm3nfiarqp.py
# Original ATen: aten.split_with_sizes
# aten.split_with_sizes => getitem_374
#
# Strided gather over 4967424 (= 128 * 38808) elements:
#   out_ptr0[x] = in_ptr0[(x % 38808) + 77616 * (x // 38808)]
# i.e. the first 38808 elements of each 77616-element row of in_ptr0 are packed
# contiguously into out_ptr0. With the harness shapes this is channels
# [0, 792) of a (128, 1584, 7, 7) tensor.
#   in_ptr0  : fp16 source, row stride 77616.
#   out_ptr0 : fp16 destination, row stride 38808.
#   xnumel   : overwritten with the constant 4967424 inside the kernel.
#   XBLOCK   : compile-time block width; out-of-range lanes are masked.
# The embedded module also carries a stand-alone benchmark harness
# (get_args / call / benchmark_all_configs / __main__) that targets CUDA device 0.
# The line formatting below was restored from a whitespace-mangled copy, so the
# text may not be byte-identical to the file named in "kernel path".
triton_poi_fused_split_with_sizes_222 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[8388608], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_split_with_sizes_222(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 4967424
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex % 38808
    x1 = (xindex // 38808)
    x2 = xindex
    tmp0 = tl.load(in_ptr0 + (x0 + (77616*x1)), xmask).to(tl.float32)
    tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp0, xmask)


def get_args():
    arg_0 = rand_strided((128, 1584, 7, 7), (77616, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 792, 7, 7), (38808, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused_split_with_sizes_222.run(*args, 4967424, grid=grid(4967424), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused_split_with_sizes_222.benchmark_all_configs(*args, 4967424, grid=grid(4967424))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/qk/cqk2scd5kzvelb4wc76gqtuwya5pxfoqhhpq26ga2dggkjyf6t72.py
# Original ATen: aten.split_with_sizes
# aten.split_with_sizes => getitem_375
#
# Strided gather over 4967424 (= 128 * 38808) elements:
#   out_ptr0[x] = in_ptr0[38808 + (x % 38808) + 77616 * (x // 38808)]
# i.e. the last 38808 elements of each 77616-element row of in_ptr0 are packed
# contiguously into out_ptr0. With the harness shapes this is channels
# [792, 1584) of a (128, 1584, 7, 7) tensor.
#   in_ptr0  : fp16 source, row stride 77616, read starting 38808 elements in.
#   out_ptr0 : fp16 destination, row stride 38808.
#   xnumel   : overwritten with the constant 4967424 inside the kernel.
#   XBLOCK   : compile-time block width; out-of-range lanes are masked.
# The embedded module also carries a stand-alone benchmark harness
# (get_args / call / benchmark_all_configs / __main__) that targets CUDA device 0.
# The line formatting below was restored from a whitespace-mangled copy, so the
# text may not be byte-identical to the file named in "kernel path".
triton_poi_fused_split_with_sizes_223 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[8388608], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_split_with_sizes_223(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 4967424
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex % 38808
    x1 = (xindex // 38808)
    x2 = xindex
    tmp0 = tl.load(in_ptr0 + (38808 + x0 + (77616*x1)), xmask).to(tl.float32)
    tl.store(out_ptr0 + (x2 + tl.zeros([XBLOCK], tl.int32)), tmp0, xmask)


def get_args():
    arg_0 = rand_strided((128, 1584, 7, 7), (77616, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 792, 7, 7), (38808, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused_split_with_sizes_223.run(*args, 4967424, grid=grid(4967424), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused_split_with_sizes_223.benchmark_all_configs(*args, 4967424, grid=grid(4967424))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/5z/c5zsdbhynt6xdafkaydaa3qemqk6v4xjjheiwafzrfldvlj3ihqh.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_347
#
# Pointwise element-for-element copy of 104544 (= 132 * 792) values from
# in_ptr0 (declared '*fp32') to out_ptr0 (declared '*fp16').
#   in_ptr0  : source buffer, read at flat offsets [0, 104544).
#   out_ptr0 : destination buffer, written at the same flat offsets.
#   xnumel   : overwritten with the constant 104544 inside the kernel.
#   XBLOCK   : compile-time block width; out-of-range lanes are masked.
# NOTE(review): the narrowing to fp16 is presumably performed by tl.store's
# implicit cast to the pointee type -- confirm.
# The embedded module also carries a stand-alone benchmark harness
# (get_args / call / benchmark_all_configs / __main__) that targets CUDA device 0.
# The line formatting below was restored from a whitespace-mangled copy, so the
# text may not be byte-identical to the file named in "kernel path".
triton_poi_fused__to_copy_224 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[131072], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_224(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 104544
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)


def get_args():
    arg_0 = rand_strided((132, 792, 1, 1), (792, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((132, 792, 1, 1), (792, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_224.run(*args, 104544, grid=grid(104544), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_224.benchmark_all_configs(*args, 104544, grid=grid(104544))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/ca/ccakgu22ynocwwbf5iwb4um7n6tcg3nlbhmehr4k36wdy4nrskss.py
# Original ATen: aten.cat
# aten.cat => cat_36
#
# Strided scatter over 827904 (= 128 * 6468) elements:
#   out_ptr0[(x % 6468) + 12936 * (x // 6468)] = in_ptr0[x]
# i.e. each contiguous 6468-element row of in_ptr0 is written into a row of
# out_ptr0 whose stride is 12936 (twice as wide).
#   in_ptr0  : fp16 source, contiguous rows of 6468 elements.
#   out_ptr0 : fp16 destination with row stride 12936 -- presumably a slice of
#              the concatenated output buffer; confirm against the caller.
#   xnumel   : overwritten with the constant 827904 inside the kernel.
#   XBLOCK   : compile-time block width; out-of-range lanes are masked.
# The embedded module also carries a stand-alone benchmark harness
# (get_args / call / benchmark_all_configs / __main__) that targets CUDA device 0.
# The line formatting below was restored from a whitespace-mangled copy, so the
# text may not be byte-identical to the file named in "kernel path".
triton_poi_fused_cat_225 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[1048576], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_cat_225(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 827904
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x2 = xindex
    x0 = xindex % 6468
    x1 = (xindex // 6468)
    tmp0 = tl.load(in_ptr0 + (x2), xmask).to(tl.float32)
    tl.store(out_ptr0 + (x0 + (12936*x1) + tl.zeros([XBLOCK], tl.int32)), tmp0, xmask)


def get_args():
    arg_0 = rand_strided((128, 132, 7, 7), (6468, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 132, 7, 7), (12936, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused_cat_225.run(*args, 827904, grid=grid(827904), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused_cat_225.benchmark_all_configs(*args, 827904, grid=grid(827904))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/pp/cpprz7zopeigvjazcl7gems4pi2vh33qfi655qdlyuddxr7rvwr3.py
# Original ATen: aten.cat
# aten.cat => cat_36
#
# Strided scatter over 827904 (= 128 * 6468) elements:
#   out_ptr0[(x % 6468) + 12936 * (x // 6468)] = in_ptr0[x]
# i.e. each contiguous 6468-element row of in_ptr0 is written into a row of
# out_ptr0 whose stride is 12936 (twice as wide).
#   in_ptr0  : fp16 source, contiguous rows of 6468 elements.
#   out_ptr0 : fp16 destination with row stride 12936. Argument index 1 is
#              absent from divisible_by_16 here, so no 16-alignment is declared
#              for this pointer.
#   xnumel   : overwritten with the constant 827904 inside the kernel.
#   XBLOCK   : compile-time block width; out-of-range lanes are masked.
# NOTE(review): out_ptr0 is presumably the second slice of the concatenated
# buffer, starting at an offset that is not 16-aligned -- confirm against the
# caller. The harness below allocates a fresh tensor and does not model that.
# The embedded module also carries a stand-alone benchmark harness
# (get_args / call / benchmark_all_configs / __main__) that targets CUDA device 0.
# The line formatting below was restored from a whitespace-mangled copy, so the
# text may not be byte-identical to the file named in "kernel path".
triton_poi_fused_cat_226 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[1048576], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused_cat_226(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 827904
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x2 = xindex
    x0 = xindex % 6468
    x1 = (xindex // 6468)
    tmp0 = tl.load(in_ptr0 + (x2), xmask).to(tl.float32)
    tl.store(out_ptr0 + (x0 + (12936*x1) + tl.zeros([XBLOCK], tl.int32)), tmp0, xmask)


def get_args():
    arg_0 = rand_strided((128, 132, 7, 7), (6468, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((128, 132, 7, 7), (12936, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused_cat_226.run(*args, 827904, grid=grid(827904), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused_cat_226.benchmark_all_configs(*args, 827904, grid=grid(827904))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/ws/cwsi7gb6vfnxgobitrcoooobibx3jugdjfpqbmhity4caskyzyyn.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.add
# aten._native_batch_norm_legit_functional => add_262, add_265, convert_element_type_349, convert_element_type_350, mul_406, mul_412, rsqrt_50, sub_50, var_mean_50
# aten.add => add_266
#
# Pointwise kernel over 1655808 (= 128 * 264 * 49) elements. With
# c = (x // 49) % 264 it computes
#   out_ptr0[x] = (in_ptr0[x] - in_ptr1[c]) * rsqrt(in_ptr2[c] / 6272.0 + 1e-05)
#                 * in_ptr3[c] + in_ptr4[c] + in_ptr5[x]
#   in_ptr0  : fp16 input, read at every flat offset.
#   in_ptr1  : fp32, per-c value subtracted from the input -- presumably the
#              batch mean; confirm against the producing reduction.
#   in_ptr2  : fp32, per-c value divided by 6272 (= 128 * 49) before adding the
#              1e-05 epsilon -- presumably a sum of squared deviations; confirm.
#   in_ptr3  : fp32, per-c multiplier (presumably the affine weight).
#   in_ptr4  : fp32, per-c addend (presumably the affine bias).
#   in_ptr5  : fp16 tensor added element-wise after normalisation (the
#              aten.add in the header).
#   out_ptr0 : fp16 output, same flat layout as in_ptr0.
#   xnumel   : overwritten with the constant 1655808 inside the kernel.
# The embedded module also carries a stand-alone benchmark harness
# (get_args / call / benchmark_all_configs / __main__) that targets CUDA device 0.
# The line formatting below was restored from a whitespace-mangled copy, so the
# text may not be byte-identical to the file named in "kernel path".
triton_poi_fused__native_batch_norm_legit_functional_add_227 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[2097152], filename=__file__, meta={'signature': {0: '*fp16', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp16', 6: '*fp16', 7: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7), equal_to_1=())]})
@triton.jit
def triton_poi_fused__native_batch_norm_legit_functional_add_227(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 1655808
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x3 = xindex
    x1 = (xindex // 49) % 264
    tmp0 = tl.load(in_ptr0 + (x3), xmask).to(tl.float32)
    tmp2 = tl.load(in_ptr1 + (x1), xmask)
    tmp4 = tl.load(in_ptr2 + (x1), xmask)
    tmp11 = tl.load(in_ptr3 + (x1), xmask)
    tmp13 = tl.load(in_ptr4 + (x1), xmask)
    tmp16 = tl.load(in_ptr5 + (x3), xmask).to(tl.float32)
    tmp1 = tmp0.to(tl.float32)
    tmp3 = tmp1 - tmp2
    tmp5 = 6272.0
    tmp6 = tmp4 / tmp5
    tmp7 = 1e-05
    tmp8 = tmp6 + tmp7
    tmp9 = tl.math.rsqrt(tmp8)
    tmp10 = tmp3 * tmp9
    tmp12 = tmp10 * tmp11
    tmp14 = tmp12 + tmp13
    tmp15 = tmp14.to(tl.float32)
    tmp17 = tmp15 + tmp16
    tl.store(out_ptr0 + (x3 + tl.zeros([XBLOCK], tl.int32)), tmp17, xmask)


def get_args():
    arg_0 = rand_strided((128, 264, 7, 7), (12936, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((1, 264, 1, 1), (264, 1, 264, 264), device='cuda:0', dtype=torch.float32)
    arg_2 = rand_strided((1, 264, 1, 1), (264, 1, 264, 264), device='cuda:0', dtype=torch.float32)
    arg_3 = rand_strided((264,), (1,), device='cuda:0', dtype=torch.float32)
    arg_4 = rand_strided((264,), (1,), device='cuda:0', dtype=torch.float32)
    arg_5 = rand_strided((128, 264, 7, 7), (12936, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    arg_6 = rand_strided((128, 264, 7, 7), (12936, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__native_batch_norm_legit_functional_add_227.run(*args, 1655808, grid=grid(1655808), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__native_batch_norm_legit_functional_add_227.benchmark_all_configs(*args, 1655808, grid=grid(1655808))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/7p/c7pjks5p4ph6nsw372btix32wdljqepkkdc72dgxj6ycvs5rdkcd.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_397
#
# Pointwise element-for-element copy of 405504 (= 1536 * 264) values from
# in_ptr0 (declared '*fp32') to out_ptr0 (declared '*fp16').
#   in_ptr0  : source buffer, read at flat offsets [0, 405504).
#   out_ptr0 : destination buffer, written at the same flat offsets.
#   xnumel   : overwritten with the constant 405504 inside the kernel.
# The load and the store pass mask=None, so `xmask` is computed but unused.
# NOTE(review): presumably safe because 405504 is a multiple of every XBLOCK
# the autotuner may pick -- confirm. The narrowing to fp16 is presumably done
# by tl.store's implicit cast to the pointee type -- confirm.
# The embedded module also carries a stand-alone benchmark harness
# (get_args / call / benchmark_all_configs / __main__) that targets CUDA device 0.
# The line formatting below was restored from a whitespace-mangled copy, so the
# text may not be byte-identical to the file named in "kernel path".
triton_poi_fused__to_copy_228 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid

@pointwise(size_hints=[524288], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_228(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 405504
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), None)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, None)


def get_args():
    arg_0 = rand_strided((1536, 264, 1, 1), (264, 1, 1, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((1536, 264, 1, 1), (264, 1, 1, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,


def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_228.run(*args, 405504, grid=grid(405504), stream=stream0)


def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_228.benchmark_all_configs(*args, 405504, grid=grid(405504))


if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench

    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/om/com4odhqbzh6wdohnvxyqvr7ulmhf2d2ytjjg22jyjf67zj3ocbo.py
# Original ATen: aten._native_batch_norm_legit_functional
# aten._native_batch_norm_legit_functional => add_300, add_301, add_302, convert_element_type_398, mul_464, mul_465, mul_466, mul_467, mul_468, rsqrt_57, squeeze_172, var_mean_57
#
# Looped reduction kernel handed to async_compile.triton() as module source
# text. For each of the 1536 x-indices it makes two passes over the 6272
# reduction elements of in_ptr0:
#   pass 1: sum -> mean (sum / 6272), stored to in_out_ptr0;
#           out_ptr0 = 0.1 * mean + 0.9 * in_ptr1[x]
#   pass 2: sum of squared deviations from that mean, stored to out_ptr1;
#           out_ptr2 = rsqrt(sum / 6272 + 1e-05)
#           out_ptr3 = 0.1 * (sum / 6272 * 1.0001594642002871) + 0.9 * in_ptr2[x]
# NOTE(review): 1.0001594642002871 is numerically 6272 / 6271, presumably the
# unbiased-variance correction -- confirm against the originating graph.
# The embedded module also carries a stand-alone benchmark harness
# (get_args / call / benchmark_all_configs / __main__).
triton_red_fused__native_batch_norm_legit_functional_229 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@reduction(
    size_hints=[2048, 8192],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*fp32', 1: '*fp16', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: 'i32', 9: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9), equal_to_1=())]}
)
@triton.jit
def triton_red_fused__native_batch_norm_legit_functional_229(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, out_ptr0, out_ptr1, out_ptr2, out_ptr3, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
    xnumel = 1536
    rnumel = 6272
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rbase = tl.arange(0, RBLOCK)[None, :]
    x0 = xindex
    _tmp2 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
    for roffset in range(0, rnumel, RBLOCK):
        rindex = roffset + rbase
        rmask = rindex < rnumel
        r1 = rindex % 49
        r2 = (rindex // 49)
        tmp0 = tl.load(in_ptr0 + (r1 + (49*x0) + (75264*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp1 = tmp0.to(tl.float32)
        _tmp2 = tl.where(rmask & xmask, _tmp2 + tmp1, _tmp2)
    tmp2 = tl.sum(_tmp2, 1)[:, None]
    tmp7 = tl.load(in_ptr1 + (x0), xmask)
    tmp3 = 6272.0
    tmp4 = tmp2 / tmp3
    tmp5 = 0.1
    tmp6 = tmp4 * tmp5
    tmp8 = 0.9
    tmp9 = tmp7 * tmp8
    tmp10 = tmp6 + tmp9
    tl.store(in_out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp4, xmask)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp10, xmask)
    _tmp15 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0
    for roffset in range(0, rnumel, RBLOCK):
        rindex = roffset + rbase
        rmask = rindex < rnumel
        r1 = rindex % 49
        r2 = (rindex // 49)
        tmp11 = tl.load(in_ptr0 + (r1 + (49*x0) + (75264*r2)), rmask & xmask, eviction_policy='evict_last', other=0).to(tl.float32)
        tmp12 = tmp11.to(tl.float32)
        tmp13 = tmp12 - tmp4
        tmp14 = tmp13 * tmp13
        _tmp15 = tl.where(rmask & xmask, _tmp15 + tmp14, _tmp15)
    tmp15 = tl.sum(_tmp15, 1)[:, None]
    tl.store(out_ptr1 + x0, tmp15, xmask)
    tmp25 = tl.load(in_ptr2 + (x0), xmask)
    tmp16 = 6272.0
    tmp17 = tmp15 / tmp16
    tmp18 = 1e-05
    tmp19 = tmp17 + tmp18
    tmp20 = tl.math.rsqrt(tmp19)
    tmp21 = 1.0001594642002871
    tmp22 = tmp17 * tmp21
    tmp23 = 0.1
    tmp24 = tmp22 * tmp23
    tmp26 = 0.9
    tmp27 = tmp25 * tmp26
    tmp28 = tmp24 + tmp27
    tl.store(out_ptr2 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp20, xmask)
    tl.store(out_ptr3 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp28, xmask)
def get_args():
    arg_0 = rand_strided((1, 1536, 1, 1), (1536, 1, 1536, 1536), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((128, 1536, 7, 7), (75264, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    arg_2 = rand_strided((1536,), (1,), device='cuda:0', dtype=torch.float32)
    arg_3 = rand_strided((1536,), (1,), device='cuda:0', dtype=torch.float32)
    arg_4 = rand_strided((1536,), (1,), device='cuda:0', dtype=torch.float32)
    arg_5 = rand_strided((1, 1536, 1, 1), (1536, 1, 1536, 1536), device='cuda:0', dtype=torch.float32)
    arg_6 = rand_strided((1536,), (1,), device='cuda:0', dtype=torch.float32)
    arg_7 = rand_strided((1536,), (1,), device='cuda:0', dtype=torch.float32)
    return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_red_fused__native_batch_norm_legit_functional_229.run(*args, 1536, 6272, grid=grid(1536), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_red_fused__native_batch_norm_legit_functional_229.benchmark_all_configs(*args, 1536, 6272, grid=grid(1536))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=1) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/wa/cwaeb6f5uogsexjap772fuwlvl2vebz5e54hbg5l3px6d7azdjsm.py
# Original ATen: aten._native_batch_norm_legit_functional, aten.mean, aten.relu, aten.threshold_backward, aten.view
# aten._native_batch_norm_legit_functional => add_300, add_303, convert_element_type_398, convert_element_type_399, mul_463, mul_469, rsqrt_57, sub_57, var_mean_57
# aten.mean => mean_16
# aten.relu => relu_6
# aten.threshold_backward => le
# aten.view => view
#
# Persistent-reduction kernel handed to async_compile.triton() as module
# source text. xnumel = 196608 rows, rnumel = 49 reduction elements covered by
# a single RBLOCK = 64 tile (rmask hides the 15 padding lanes). Per element:
#   y = (in_ptr0 - in_ptr1[x0]) * rsqrt(in_ptr2[x0] / 6272 + 1e-05)
#         * in_ptr3[x0] + in_ptr4[x0],       with x0 = xindex % 1536
#   r = y with negative values replaced by 0 (the nested tl.where)
# out_ptr1 receives the element-wise flag (r <= 0); out_ptr3 receives the
# per-row mean of r (sum over the 49 valid lanes / 49).
# The per-x loads and the out_ptr3 store pass None as the mask; xmask is
# computed but never used.
# The embedded module also carries a stand-alone benchmark harness
# (get_args / call / benchmark_all_configs / __main__).
triton_per_fused__native_batch_norm_legit_functional_mean_relu_threshold_backward_view_230 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import persistent_reduction
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@persistent_reduction(
    size_hints=[262144, 64],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*fp16', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*i1', 6: '*fp16', 7: 'i32', 8: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7), equal_to_1=())]}
)
@triton.jit
def triton_per_fused__native_batch_norm_legit_functional_mean_relu_threshold_backward_view_230(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, out_ptr1, out_ptr3, xnumel, rnumel, XBLOCK : tl.constexpr):
    xnumel = 196608
    rnumel = 49
    RBLOCK: tl.constexpr = 64
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rindex = tl.arange(0, RBLOCK)[None, :]
    rmask = rindex < rnumel
    r2 = rindex
    x3 = xindex
    x0 = xindex % 1536
    tmp0 = tl.load(in_ptr0 + (r2 + (49*x3)), rmask, other=0).to(tl.float32)
    tmp2 = tl.load(in_ptr1 + (x0), None)
    tmp4 = tl.load(in_ptr2 + (x0), None)
    tmp11 = tl.load(in_ptr3 + (x0), None)
    tmp13 = tl.load(in_ptr4 + (x0), None)
    tmp1 = tmp0.to(tl.float32)
    tmp3 = tmp1 - tmp2
    tmp5 = 6272.0
    tmp6 = tmp4 / tmp5
    tmp7 = 1e-05
    tmp8 = tmp6 + tmp7
    tmp9 = tl.math.rsqrt(tmp8)
    tmp10 = tmp3 * tmp9
    tmp12 = tmp10 * tmp11
    tmp14 = tmp12 + tmp13
    tmp15 = tmp14.to(tl.float32)
    tmp16 = tl.where(0 != 0, 0, tl.where(0 > tmp15, 0, tmp15))
    tmp17 = 0.0
    tmp18 = tmp16 <= tmp17
    tmp19 = tmp16.to(tl.float32)
    tmp21 = tl.where(rmask, tmp19, 0)
    tmp22 = tl.sum(tmp21, 1)[:, None]
    tmp23 = 49.0
    tmp24 = tmp22 / tmp23
    tmp25 = tmp24.to(tl.float32)
    tl.store(out_ptr1 + (r2 + (49*x3) + tl.zeros([XBLOCK, RBLOCK], tl.int32)), tmp18, rmask)
    tl.store(out_ptr3 + (x3 + tl.zeros([XBLOCK, 1], tl.int32)), tmp25, None)
def get_args():
    arg_0 = rand_strided((128, 1536, 7, 7), (75264, 49, 7, 1), device='cuda:0', dtype=torch.float16)
    arg_1 = rand_strided((1, 1536, 1, 1), (1536, 1, 1536, 1536), device='cuda:0', dtype=torch.float32)
    arg_2 = rand_strided((1, 1536, 1, 1), (1536, 1, 1536, 1536), device='cuda:0', dtype=torch.float32)
    arg_3 = rand_strided((1536,), (1,), device='cuda:0', dtype=torch.float32)
    arg_4 = rand_strided((1536,), (1,), device='cuda:0', dtype=torch.float32)
    arg_5 = rand_strided((128, 1536, 7, 7), (75264, 49, 7, 1), device='cuda:0', dtype=torch.bool)
    arg_6 = rand_strided((128, 1536), (1536, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_per_fused__native_batch_norm_legit_functional_mean_relu_threshold_backward_view_230.run(*args, 196608, 49, grid=grid(196608), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_per_fused__native_batch_norm_legit_functional_mean_relu_threshold_backward_view_230.benchmark_all_configs(*args, 196608, 49, grid=grid(196608))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/hq/chqge2y77s52vahqshrpnxrmtqywhsxmxfiwmel7jsbqfdldligh.py
# Original ATen: aten._to_copy, aten.t
# aten._to_copy => convert_element_type_401
# aten.t => permute_1
#
# Pointwise kernel handed to async_compile.triton() as module source text.
# For each of the 1536000 elements it loads from in_ptr0 (declared '*fp32')
# and stores the same value through both out_ptr0 and out_ptr1 (both declared
# '*fp16'), using identical linear offsets for all three pointers.
# The load and both stores pass None as the mask; xmask is computed but never
# used.
# The embedded module also carries a stand-alone benchmark harness
# (get_args / call / benchmark_all_configs / __main__).
triton_poi_fused__to_copy_t_231 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[2097152], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: '*fp16', 3: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_t_231(in_ptr0, out_ptr0, out_ptr1, xnumel, XBLOCK : tl.constexpr):
    xnumel = 1536000
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), None)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, None)
    tl.store(out_ptr1 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, None)
def get_args():
    arg_0 = rand_strided((1000, 1536), (1536, 1), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((1000, 1536), (1536, 1), device='cuda:0', dtype=torch.float16)
    arg_2 = rand_strided((1000, 1536), (1536, 1), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1, arg_2,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_t_231.run(*args, 1536000, grid=grid(1536000), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_t_231.benchmark_all_configs(*args, 1536000, grid=grid(1536000))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/kk/ckkhmi7qwvpeybxqkogp5ah6kylzhjoxai5hqgfzedjta2pj3hyg.py
# Original ATen: aten._to_copy
# aten._to_copy => convert_element_type_400
#
# Pointwise kernel handed to async_compile.triton() as module source text.
# For each of the 1000 elements it loads from in_ptr0 (declared '*fp32') and
# stores the value through out_ptr0 (declared '*fp16'). Unlike the larger
# copy kernels in this file, both the load and the store are guarded by
# xmask (xindex < 1000).
# The embedded module also carries a stand-alone benchmark harness
# (get_args / call / benchmark_all_configs / __main__).
triton_poi_fused__to_copy_232 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[1024], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp16', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1), equal_to_1=())]})
@triton.jit
def triton_poi_fused__to_copy_232(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 1000
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp1, xmask)
def get_args():
    arg_0 = rand_strided((1000,), (1,), device='cuda:0', dtype=torch.float32)
    arg_1 = rand_strided((1000,), (1,), device='cuda:0', dtype=torch.float16)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused__to_copy_232.run(*args, 1000, grid=grid(1000), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused__to_copy_232.benchmark_all_configs(*args, 1000, grid=grid(1000))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# kernel path: /tmp/torchinductor_shunting/gb/cgbszhl6qhqzdxycxrv6h7svhilojvsjjhnimvf6vwfj77spvald.py
# Original ATen: aten.add
# aten.add => add
#
# Single-element kernel (xnumel = 1) handed to async_compile.triton() as
# module source text. It loads the one value at in_ptr0 + 0 (declared '*i64'),
# broadcasts it to [XBLOCK], adds 1, and stores the result to out_ptr0 + 0.
# The store passes None as the mask, so every lane writes the same value to
# the same address; xmask is computed but never used.
# The embedded module also carries a stand-alone benchmark harness
# (get_args / call / benchmark_all_configs / __main__).
triton_poi_fused_add_233 = async_compile.triton('''
import triton
import triton.language as tl
from torch._inductor.ir import ReductionHint
from torch._inductor.ir import TileHint
from torch._inductor.triton_heuristics import pointwise
from torch._inductor.utils import instance_descriptor
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
import torch
from torch._inductor.triton_heuristics import grid
@pointwise(size_hints=[1], filename=__file__, meta={'signature': {0: '*i64', 1: '*i64', 2: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': [], 'configs': [instance_descriptor(divisible_by_16=(0, 1), equal_to_1=())]})
@triton.jit
def triton_poi_fused_add_233(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 1
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    tmp0 = tl.load(in_ptr0 + (0))
    tmp1 = tl.broadcast_to(tmp0, [XBLOCK])
    tmp2 = 1
    tmp3 = tmp1 + tmp2
    tl.store(out_ptr0 + (0 + tl.zeros([XBLOCK], tl.int32)), tmp3, None)
def get_args():
    arg_0 = rand_strided((), (), device='cuda:0', dtype=torch.int64)
    arg_1 = rand_strided((), (), device='cuda:0', dtype=torch.int64)
    return arg_0, arg_1,
def call(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        stream0 = get_cuda_stream(0)
        triton_poi_fused_add_233.run(*args, 1, grid=grid(1), stream=stream0)
def benchmark_all_configs(args):
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        return triton_poi_fused_add_233.benchmark_all_configs(*args, 1, grid=grid(1))
if __name__ == '__main__':
    from torch._inductor.utils import get_num_bytes
    from triton.testing import do_bench
    args = get_args()
    ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]
    num_gb = get_num_bytes(*args, num_in_out_args=0) / 1e9
    gb_per_s = num_gb / (ms / 1e3)
    print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")
''')
# Hand the module namespace to AsyncCompile.wait() once every kernel above has
# been submitted.
# NOTE(review): presumably this blocks until compilation finishes and rebinds
# the module-level kernel names to the compiled results -- confirm against
# torch._inductor.codecache.AsyncCompile.wait.
async_compile.wait(globals())
# The compiler handle is not needed after this point; drop the module-level name.
del async_compile
def call(args): | |
primals_1, primals_2, primals_3, primals_4, primals_5, primals_6, primals_7, primals_8, primals_9, primals_10, primals_11, primals_12, primals_13, primals_14, primals_15, primals_16, primals_17, primals_18, primals_19, primals_20, primals_21, primals_22, primals_23, primals_24, primals_25, primals_26, primals_27, primals_28, primals_29, primals_30, primals_31, primals_32, primals_33, primals_34, primals_35, primals_36, primals_37, primals_38, primals_39, primals_40, primals_41, primals_42, primals_43, primals_44, primals_45, primals_46, primals_47, primals_48, primals_49, primals_50, primals_51, primals_52, primals_53, primals_54, primals_55, primals_56, primals_57, primals_58, primals_59, primals_60, primals_61, primals_62, primals_63, primals_64, primals_65, primals_66, primals_67, primals_68, primals_69, primals_70, primals_71, primals_72, primals_73, primals_74, primals_75, primals_76, primals_77, primals_78, primals_79, primals_80, primals_81, primals_82, primals_83, primals_84, primals_85, primals_86, primals_87, primals_88, primals_89, primals_90, primals_91, primals_92, primals_93, primals_94, primals_95, primals_96, primals_97, primals_98, primals_99, primals_100, primals_101, primals_102, primals_103, primals_104, primals_105, primals_106, primals_107, primals_108, primals_109, primals_110, primals_111, primals_112, primals_113, primals_114, primals_115, primals_116, primals_117, primals_118, primals_119, primals_120, primals_121, primals_122, primals_123, primals_124, primals_125, primals_126, primals_127, primals_128, primals_129, primals_130, primals_131, primals_132, primals_133, primals_134, primals_135, primals_136, primals_137, primals_138, primals_139, primals_140, primals_141, primals_142, primals_143, primals_144, primals_145, primals_146, primals_147, primals_148, primals_149, primals_150, primals_151, primals_152, primals_153, primals_154, primals_155, primals_156, primals_157, primals_158, primals_159, primals_160, primals_161, primals_162, 
primals_163, primals_164, primals_165, primals_166, primals_167, primals_168, primals_169, primals_170, primals_171, primals_172, primals_173, primals_174, primals_175, primals_176, primals_177, primals_178, primals_179, primals_180, primals_181, primals_182, primals_183, primals_184, primals_185, primals_186, primals_187, primals_188, primals_189, primals_190, primals_191, primals_192, primals_193, primals_194, primals_195, primals_196, primals_197, primals_198, primals_199, primals_200, primals_201, primals_202, primals_203, primals_204, primals_205, primals_206, primals_207, primals_208, primals_209, primals_210, primals_211, primals_212, primals_213, primals_214, primals_215, primals_216, primals_217, primals_218, primals_219, primals_220, primals_221, primals_222, primals_223, primals_224, primals_225, primals_226, primals_227, primals_228, primals_229, primals_230, primals_231, primals_232, primals_233, primals_234, primals_235, primals_236, primals_237, primals_238, primals_239, primals_240, primals_241, primals_242, primals_243, primals_244, primals_245, primals_246, primals_247, primals_248, primals_249, primals_250, primals_251, primals_252, primals_253, primals_254, primals_255, primals_256, primals_257, primals_258, primals_259, primals_260, primals_261, primals_262, primals_263, primals_264, primals_265, primals_266, primals_267, primals_268, primals_269, primals_270, primals_271, primals_272, primals_273, primals_274, primals_275, primals_276, primals_277, primals_278, primals_279, primals_280, primals_281, primals_282, primals_283, primals_284, primals_285, primals_286, primals_287, primals_288, primals_289, primals_290, primals_291, primals_292, primals_293, primals_294, primals_295, primals_296, primals_297, primals_298, primals_299, primals_300, primals_301, primals_302, primals_303, primals_304, primals_305, primals_306, primals_307, primals_308, primals_309, primals_310, primals_311, primals_312, primals_313, primals_314, primals_315, 
primals_316, primals_317, primals_318, primals_319, primals_320, primals_321, primals_322, primals_323, primals_324, primals_325, primals_326, primals_327, primals_328, primals_329, primals_330, primals_331, primals_332, primals_333, primals_334, primals_335, primals_336, primals_337, primals_338, primals_339, primals_340, primals_341, primals_342, primals_343, primals_344, primals_345, primals_346, primals_347, primals_348, primals_349, primals_350, primals_351, primals_352, primals_353, primals_354, primals_355, primals_356, primals_357, primals_358, primals_359, primals_360, primals_361, primals_362, primals_363, primals_364, primals_365, primals_366, primals_367, primals_368, primals_369, primals_370, primals_371, primals_372, primals_373, primals_374, primals_375, primals_376, primals_377, primals_378, primals_379, primals_380, primals_381, primals_382, primals_383, primals_384, primals_385, primals_386, primals_387, primals_388, primals_389, primals_390, primals_391, primals_392, primals_393, primals_394, primals_395, primals_396, primals_397, primals_398, primals_399, primals_400, primals_401, primals_402, primals_403, primals_404, primals_405, primals_406, primals_407, primals_408, primals_409, primals_410, primals_411, primals_412, primals_413, primals_414, primals_415, primals_416, primals_417, primals_418, primals_419, primals_420, primals_421, primals_422, primals_423, primals_424, primals_425, primals_426, primals_427, primals_428, primals_429, primals_430, primals_431, primals_432, primals_433, primals_434, primals_435, primals_436, primals_437, primals_438, primals_439, primals_440, primals_441, primals_442, primals_443, primals_444, primals_445, primals_446, primals_447, primals_448, primals_449, primals_450, primals_451, primals_452, primals_453, primals_454, primals_455, primals_456, primals_457, primals_458, primals_459, primals_460, primals_461, primals_462, primals_463, primals_464, primals_465, primals_466, primals_467, primals_468, 
primals_469, primals_470, primals_471, primals_472, primals_473, primals_474, primals_475, primals_476, primals_477, primals_478, primals_479, primals_480 = args | |
args.clear() | |
with torch.cuda._DeviceGuard(0): | |
torch.cuda.set_device(0) # no-op to ensure context | |
buf0 = empty_strided((32, 3, 3, 3), (27, 9, 3, 1), device='cuda', dtype=torch.float16) | |
stream0 = get_cuda_stream(0) | |
triton_poi_fused__to_copy_0.run(primals_117, buf0, 864, grid=grid(864), stream=stream0) | |
del primals_117 | |
buf1 = empty_strided((128, 3, 224, 224), (150528, 50176, 224, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__to_copy_1.run(primals_480, buf1, 19267584, grid=grid(19267584), stream=stream0) | |
del primals_480 | |
buf2 = extern_kernels.convolution(buf1, buf0, stride=(2, 2), padding=(1, 1), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None) | |
assert_size_stride(buf2, (128, 32, 112, 112), (401408, 12544, 112, 1)) | |
buf3 = empty_strided((1, 32, 1, 1, 14), (448, 1, 448, 448, 32), device='cuda', dtype=torch.float32) | |
triton_red_fused__native_batch_norm_legit_functional_2.run(buf2, buf3, 448, 114688, grid=grid(448), stream=stream0) | |
buf4 = empty_strided((1, 32, 1, 1), (32, 1, 32, 32), device='cuda', dtype=torch.float32) | |
buf5 = buf4; del buf4 # reuse | |
buf9 = empty_strided((32, ), (1, ), device='cuda', dtype=torch.float32) | |
triton_per_fused__native_batch_norm_legit_functional_3.run(buf5, buf3, primals_307, buf9, 32, 14, grid=grid(32), stream=stream0) | |
del primals_307 | |
buf6 = buf3; del buf3 # reuse | |
triton_red_fused__native_batch_norm_legit_functional_4.run(buf2, buf5, buf6, 448, 114688, grid=grid(448), stream=stream0) | |
buf7 = empty_strided((1, 32, 1, 1), (32, 1, 32, 32), device='cuda', dtype=torch.float32) | |
buf8 = empty_strided((32, ), (1, ), device='cuda', dtype=torch.float32) | |
buf10 = empty_strided((32, ), (1, ), device='cuda', dtype=torch.float32) | |
triton_per_fused__native_batch_norm_legit_functional_5.run(buf6, primals_308, buf7, buf8, buf10, 32, 14, grid=grid(32), stream=stream0) | |
del primals_308 | |
buf11 = empty_strided((128, 32, 112, 112), (401408, 12544, 112, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__native_batch_norm_legit_functional_relu_6.run(buf2, buf5, buf7, primals_1, primals_2, buf11, 51380224, grid=grid(51380224), stream=stream0) | |
del primals_2 | |
buf12 = empty_strided((32, 1, 3, 3), (9, 9, 3, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__to_copy_7.run(primals_118, buf12, 288, grid=grid(288), stream=stream0) | |
del primals_118 | |
buf13 = extern_kernels.convolution(buf11, buf12, stride=(1, 1), padding=(1, 1), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=32, bias=None) | |
assert_size_stride(buf13, (128, 32, 112, 112), (401408, 12544, 112, 1)) | |
buf14 = buf6; del buf6 # reuse | |
triton_red_fused__native_batch_norm_legit_functional_2.run(buf13, buf14, 448, 114688, grid=grid(448), stream=stream0) | |
buf15 = buf7; del buf7 # reuse | |
buf16 = buf15; del buf15 # reuse | |
buf20 = empty_strided((32, ), (1, ), device='cuda', dtype=torch.float32) | |
triton_per_fused__native_batch_norm_legit_functional_3.run(buf16, buf14, primals_310, buf20, 32, 14, grid=grid(32), stream=stream0) | |
del primals_310 | |
buf17 = buf14; del buf14 # reuse | |
triton_red_fused__native_batch_norm_legit_functional_4.run(buf13, buf16, buf17, 448, 114688, grid=grid(448), stream=stream0) | |
buf18 = empty_strided((1, 32, 1, 1), (32, 1, 32, 32), device='cuda', dtype=torch.float32) | |
buf19 = empty_strided((32, ), (1, ), device='cuda', dtype=torch.float32) | |
buf21 = empty_strided((32, ), (1, ), device='cuda', dtype=torch.float32) | |
triton_per_fused__native_batch_norm_legit_functional_5.run(buf17, primals_311, buf18, buf19, buf21, 32, 14, grid=grid(32), stream=stream0) | |
del primals_311 | |
buf22 = empty_strided((128, 32, 112, 112), (401408, 12544, 112, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__native_batch_norm_legit_functional_relu_6.run(buf13, buf16, buf18, primals_3, primals_4, buf22, 51380224, grid=grid(51380224), stream=stream0) | |
del primals_4 | |
buf23 = empty_strided((32, 32, 1, 1), (32, 1, 1, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__to_copy_8.run(primals_119, buf23, 1024, grid=grid(1024), stream=stream0) | |
del primals_119 | |
buf24 = extern_kernels.convolution(buf22, buf23, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None) | |
assert_size_stride(buf24, (128, 32, 112, 112), (401408, 12544, 112, 1)) | |
buf25 = buf17; del buf17 # reuse | |
triton_red_fused__native_batch_norm_legit_functional_2.run(buf24, buf25, 448, 114688, grid=grid(448), stream=stream0) | |
buf26 = buf18; del buf18 # reuse | |
buf27 = buf26; del buf26 # reuse | |
buf31 = empty_strided((32, ), (1, ), device='cuda', dtype=torch.float32) | |
triton_per_fused__native_batch_norm_legit_functional_3.run(buf27, buf25, primals_313, buf31, 32, 14, grid=grid(32), stream=stream0) | |
del primals_313 | |
buf28 = buf25; del buf25 # reuse | |
triton_red_fused__native_batch_norm_legit_functional_4.run(buf24, buf27, buf28, 448, 114688, grid=grid(448), stream=stream0) | |
buf29 = empty_strided((1, 32, 1, 1), (32, 1, 32, 32), device='cuda', dtype=torch.float32) | |
buf30 = empty_strided((32, ), (1, ), device='cuda', dtype=torch.float32) | |
buf32 = empty_strided((32, ), (1, ), device='cuda', dtype=torch.float32) | |
triton_per_fused__native_batch_norm_legit_functional_5.run(buf28, primals_314, buf29, buf30, buf32, 32, 14, grid=grid(32), stream=stream0) | |
del buf28 | |
del primals_314 | |
buf33 = empty_strided((128, 32, 112, 112), (401408, 12544, 112, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__native_batch_norm_legit_functional_add_9.run(buf24, buf27, buf29, primals_5, primals_6, buf11, buf33, 51380224, grid=grid(51380224), stream=stream0) | |
del buf29 | |
del primals_6 | |
buf34 = empty_strided((128, 16, 112, 112), (200704, 12544, 112, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused_split_with_sizes_10.run(buf33, buf34, 25690112, grid=grid(25690112), stream=stream0) | |
buf35 = empty_strided((128, 16, 112, 112), (200704, 12544, 112, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused_split_with_sizes_11.run(buf33, buf35, 25690112, grid=grid(25690112), stream=stream0) | |
del buf33 | |
buf36 = empty_strided((96, 16, 1, 1), (16, 1, 1, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__to_copy_12.run(primals_120, buf36, 1536, grid=grid(1536), stream=stream0) | |
del primals_120 | |
buf37 = extern_kernels.convolution(buf34, buf36, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None) | |
assert_size_stride(buf37, (128, 96, 112, 112), (1204224, 12544, 112, 1)) | |
buf38 = empty_strided((96, 16, 1, 1), (16, 1, 1, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__to_copy_12.run(primals_121, buf38, 1536, grid=grid(1536), stream=stream0) | |
del primals_121 | |
buf39 = extern_kernels.convolution(buf35, buf38, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None) | |
assert_size_stride(buf39, (128, 96, 112, 112), (1204224, 12544, 112, 1)) | |
buf42 = empty_strided((128, 192, 112, 112), (2408448, 12544, 112, 1), device='cuda', dtype=torch.float16) | |
buf40 = as_strided(buf42, (128, 96, 112, 112), (2408448, 12544, 112, 1)) # alias | |
triton_poi_fused_cat_13.run(buf37, buf40, 154140672, grid=grid(154140672), stream=stream0) | |
del buf37 | |
buf41 = as_strided(buf42, (128, 96, 112, 112), (2408448, 12544, 112, 1), 1204224) # alias | |
triton_poi_fused_cat_13.run(buf39, buf41, 154140672, grid=grid(154140672), stream=stream0) | |
del buf39 | |
buf43 = empty_strided((1, 192, 1, 1, 13), (2496, 1, 2496, 2496, 192), device='cuda', dtype=torch.float32) | |
triton_red_fused__native_batch_norm_legit_functional_14.run(buf42, buf43, 2496, 123511, grid=grid(2496), stream=stream0) | |
buf44 = empty_strided((1, 192, 1, 1), (192, 1, 192, 192), device='cuda', dtype=torch.float32) | |
buf45 = buf44; del buf44 # reuse | |
buf49 = empty_strided((192, ), (1, ), device='cuda', dtype=torch.float32) | |
triton_per_fused__native_batch_norm_legit_functional_15.run(buf45, buf43, primals_316, buf49, 192, 13, grid=grid(192), stream=stream0) | |
del primals_316 | |
buf46 = buf43; del buf43 # reuse | |
triton_red_fused__native_batch_norm_legit_functional_16.run(buf42, buf45, buf46, 2496, 123511, grid=grid(2496), stream=stream0) | |
buf47 = empty_strided((1, 192, 1, 1), (192, 1, 192, 192), device='cuda', dtype=torch.float32) | |
buf48 = empty_strided((192, ), (1, ), device='cuda', dtype=torch.float32) | |
buf50 = empty_strided((192, ), (1, ), device='cuda', dtype=torch.float32) | |
triton_per_fused__native_batch_norm_legit_functional_17.run(buf46, primals_317, buf47, buf48, buf50, 192, 13, grid=grid(192), stream=stream0) | |
del buf46 | |
del primals_317 | |
buf51 = empty_strided((128, 192, 112, 112), (2408448, 12544, 112, 1), device='cuda', dtype=torch.float16) | |
buf1155 = empty_strided((128, 192, 112, 112), (2408448, 12544, 112, 1), device='cuda', dtype=torch.bool) | |
triton_poi_fused__native_batch_norm_legit_functional_relu_threshold_backward_18.run(buf42, buf45, buf47, primals_7, primals_8, buf51, buf1155, 308281344, grid=grid(308281344), stream=stream0) | |
del primals_8 | |
buf52 = empty_strided((64, 1, 3, 3), (9, 9, 3, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__to_copy_19.run(primals_122, buf52, 576, grid=grid(576), stream=stream0) | |
del primals_122 | |
buf53 = extern_kernels.convolution(as_strided(buf51, (128, 64, 112, 112), (2408448, 12544, 112, 1)), buf52, stride=(2, 2), padding=(1, 1), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=64, bias=None) | |
assert_size_stride(buf53, (128, 64, 56, 56), (200704, 3136, 56, 1)) | |
buf54 = empty_strided((64, 1, 5, 5), (25, 25, 5, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__to_copy_20.run(primals_123, buf54, 1600, grid=grid(1600), stream=stream0) | |
del primals_123 | |
buf55 = extern_kernels.convolution(as_strided(buf51, (128, 64, 112, 112), (2408448, 12544, 112, 1), 802816), buf54, stride=(2, 2), padding=(2, 2), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=64, bias=None) | |
assert_size_stride(buf55, (128, 64, 56, 56), (200704, 3136, 56, 1)) | |
buf56 = empty_strided((64, 1, 7, 7), (49, 49, 7, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__to_copy_21.run(primals_124, buf56, 3136, grid=grid(3136), stream=stream0) | |
del primals_124 | |
buf57 = extern_kernels.convolution(as_strided(buf51, (128, 64, 112, 112), (2408448, 12544, 112, 1), 1605632), buf56, stride=(2, 2), padding=(3, 3), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=64, bias=None) | |
assert_size_stride(buf57, (128, 64, 56, 56), (200704, 3136, 56, 1)) | |
buf61 = empty_strided((128, 192, 56, 56), (602112, 3136, 56, 1), device='cuda', dtype=torch.float16) | |
buf58 = as_strided(buf61, (128, 64, 56, 56), (602112, 3136, 56, 1)) # alias | |
triton_poi_fused_cat_22.run(buf53, buf58, 25690112, grid=grid(25690112), stream=stream0) | |
del buf53 | |
buf59 = as_strided(buf61, (128, 64, 56, 56), (602112, 3136, 56, 1), 200704) # alias | |
triton_poi_fused_cat_22.run(buf55, buf59, 25690112, grid=grid(25690112), stream=stream0) | |
del buf55 | |
buf60 = as_strided(buf61, (128, 64, 56, 56), (602112, 3136, 56, 1), 401408) # alias | |
triton_poi_fused_cat_22.run(buf57, buf60, 25690112, grid=grid(25690112), stream=stream0) | |
del buf57 | |
buf62 = empty_strided((1, 192, 1, 1, 4), (768, 1, 768, 768, 192), device='cuda', dtype=torch.float32) | |
triton_red_fused__native_batch_norm_legit_functional_23.run(buf61, buf62, 768, 100352, grid=grid(768), stream=stream0) | |
buf63 = buf47; del buf47 # reuse | |
buf64 = buf63; del buf63 # reuse | |
buf68 = empty_strided((192, ), (1, ), device='cuda', dtype=torch.float32) | |
triton_per_fused__native_batch_norm_legit_functional_24.run(buf64, buf62, primals_319, buf68, 192, 4, grid=grid(192), stream=stream0) | |
del primals_319 | |
buf65 = buf62; del buf62 # reuse | |
triton_red_fused__native_batch_norm_legit_functional_25.run(buf61, buf64, buf65, 768, 100352, grid=grid(768), stream=stream0) | |
buf66 = empty_strided((1, 192, 1, 1), (192, 1, 192, 192), device='cuda', dtype=torch.float32) | |
buf67 = empty_strided((192, ), (1, ), device='cuda', dtype=torch.float32) | |
buf69 = empty_strided((192, ), (1, ), device='cuda', dtype=torch.float32) | |
triton_per_fused__native_batch_norm_legit_functional_26.run(buf65, primals_320, buf66, buf67, buf69, 192, 4, grid=grid(192), stream=stream0) | |
del buf65 | |
del primals_320 | |
buf70 = empty_strided((128, 192, 56, 56), (602112, 3136, 56, 1), device='cuda', dtype=torch.float16) | |
buf1154 = empty_strided((128, 192, 56, 56), (602112, 3136, 56, 1), device='cuda', dtype=torch.bool) | |
triton_poi_fused__native_batch_norm_legit_functional_relu_threshold_backward_27.run(buf61, buf64, buf66, primals_9, primals_10, buf70, buf1154, 77070336, grid=grid(77070336), stream=stream0) | |
del buf66 | |
del primals_10 | |
buf71 = empty_strided((20, 96, 1, 1), (96, 1, 1, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__to_copy_28.run(primals_125, buf71, 1920, grid=grid(1920), stream=stream0) | |
del primals_125 | |
buf72 = extern_kernels.convolution(as_strided(buf70, (128, 96, 56, 56), (602112, 3136, 56, 1)), buf71, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None) | |
assert_size_stride(buf72, (128, 20, 56, 56), (62720, 3136, 56, 1)) | |
buf73 = empty_strided((20, 96, 1, 1), (96, 1, 1, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__to_copy_28.run(primals_126, buf73, 1920, grid=grid(1920), stream=stream0) | |
del primals_126 | |
buf74 = extern_kernels.convolution(as_strided(buf70, (128, 96, 56, 56), (602112, 3136, 56, 1), 301056), buf73, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None) | |
assert_size_stride(buf74, (128, 20, 56, 56), (62720, 3136, 56, 1)) | |
buf77 = empty_strided((128, 40, 56, 56), (125440, 3136, 56, 1), device='cuda', dtype=torch.float16) | |
buf75 = as_strided(buf77, (128, 20, 56, 56), (125440, 3136, 56, 1)) # alias | |
triton_poi_fused_cat_29.run(buf72, buf75, 8028160, grid=grid(8028160), stream=stream0) | |
del buf72 | |
buf76 = as_strided(buf77, (128, 20, 56, 56), (125440, 3136, 56, 1), 62720) # alias | |
triton_poi_fused_cat_29.run(buf74, buf76, 8028160, grid=grid(8028160), stream=stream0) | |
del buf74 | |
buf78 = empty_strided((1, 40, 1, 1, 13), (520, 1, 520, 520, 40), device='cuda', dtype=torch.float32) | |
triton_red_fused__native_batch_norm_legit_functional_30.run(buf77, buf78, 520, 30878, grid=grid(520), stream=stream0) | |
buf79 = empty_strided((1, 40, 1, 1), (40, 1, 40, 40), device='cuda', dtype=torch.float32) | |
buf80 = buf79; del buf79 # reuse | |
buf84 = empty_strided((40, ), (1, ), device='cuda', dtype=torch.float32) | |
triton_per_fused__native_batch_norm_legit_functional_31.run(buf80, buf78, primals_322, buf84, 40, 13, grid=grid(40), stream=stream0) | |
del primals_322 | |
buf81 = buf78; del buf78 # reuse | |
triton_red_fused__native_batch_norm_legit_functional_32.run(buf77, buf80, buf81, 520, 30878, grid=grid(520), stream=stream0) | |
buf82 = empty_strided((1, 40, 1, 1), (40, 1, 40, 40), device='cuda', dtype=torch.float32) | |
buf83 = empty_strided((40, ), (1, ), device='cuda', dtype=torch.float32) | |
buf85 = empty_strided((40, ), (1, ), device='cuda', dtype=torch.float32) | |
triton_per_fused__native_batch_norm_legit_functional_33.run(buf81, primals_323, buf82, buf83, buf85, 40, 13, grid=grid(40), stream=stream0) | |
del primals_323 | |
buf86 = empty_strided((128, 40, 56, 56), (125440, 3136, 56, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__native_batch_norm_legit_functional_34.run(buf77, buf80, buf82, primals_11, primals_12, buf86, 16056320, grid=grid(16056320), stream=stream0) | |
del primals_12 | |
buf87 = empty_strided((60, 20, 1, 1), (20, 1, 1, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__to_copy_35.run(primals_127, buf87, 1200, grid=grid(1200), stream=stream0) | |
del primals_127 | |
buf88 = extern_kernels.convolution(as_strided(buf86, (128, 20, 56, 56), (125440, 3136, 56, 1)), buf87, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None) | |
assert_size_stride(buf88, (128, 60, 56, 56), (188160, 3136, 56, 1)) | |
buf89 = empty_strided((60, 20, 1, 1), (20, 1, 1, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__to_copy_35.run(primals_128, buf89, 1200, grid=grid(1200), stream=stream0) | |
del primals_128 | |
buf90 = extern_kernels.convolution(as_strided(buf86, (128, 20, 56, 56), (125440, 3136, 56, 1), 62720), buf89, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None) | |
assert_size_stride(buf90, (128, 60, 56, 56), (188160, 3136, 56, 1)) | |
buf93 = empty_strided((128, 120, 56, 56), (376320, 3136, 56, 1), device='cuda', dtype=torch.float16) | |
buf91 = as_strided(buf93, (128, 60, 56, 56), (376320, 3136, 56, 1)) # alias | |
triton_poi_fused_cat_36.run(buf88, buf91, 24084480, grid=grid(24084480), stream=stream0) | |
buf92 = as_strided(buf93, (128, 60, 56, 56), (376320, 3136, 56, 1), 188160) # alias | |
triton_poi_fused_cat_36.run(buf90, buf92, 24084480, grid=grid(24084480), stream=stream0) | |
buf94 = empty_strided((1, 120, 1, 1, 4), (480, 1, 480, 480, 120), device='cuda', dtype=torch.float32) | |
triton_red_fused__native_batch_norm_legit_functional_37.run(buf93, buf94, 480, 100352, grid=grid(480), stream=stream0) | |
buf95 = empty_strided((1, 120, 1, 1), (120, 1, 120, 120), device='cuda', dtype=torch.float32) | |
buf96 = buf95; del buf95 # reuse | |
buf100 = empty_strided((120, ), (1, ), device='cuda', dtype=torch.float32) | |
triton_per_fused__native_batch_norm_legit_functional_38.run(buf96, buf94, primals_325, buf100, 120, 4, grid=grid(120), stream=stream0) | |
del primals_325 | |
buf97 = buf94; del buf94 # reuse | |
triton_red_fused__native_batch_norm_legit_functional_39.run(buf93, buf96, buf97, 480, 100352, grid=grid(480), stream=stream0) | |
buf98 = empty_strided((1, 120, 1, 1), (120, 1, 120, 120), device='cuda', dtype=torch.float32) | |
buf99 = empty_strided((120, ), (1, ), device='cuda', dtype=torch.float32) | |
buf101 = empty_strided((120, ), (1, ), device='cuda', dtype=torch.float32) | |
triton_per_fused__native_batch_norm_legit_functional_40.run(buf97, primals_326, buf98, buf99, buf101, 120, 4, grid=grid(120), stream=stream0) | |
del primals_326 | |
buf102 = empty_strided((128, 120, 56, 56), (376320, 3136, 56, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__native_batch_norm_legit_functional_relu_41.run(buf93, buf96, buf98, primals_13, primals_14, buf102, 48168960, grid=grid(48168960), stream=stream0) | |
del primals_14 | |
buf103 = empty_strided((120, 1, 3, 3), (9, 9, 3, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__to_copy_42.run(primals_129, buf103, 1080, grid=grid(1080), stream=stream0) | |
del primals_129 | |
buf104 = extern_kernels.convolution(buf102, buf103, stride=(1, 1), padding=(1, 1), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=120, bias=None) | |
assert_size_stride(buf104, (128, 120, 56, 56), (376320, 3136, 56, 1)) | |
buf105 = buf97; del buf97 # reuse | |
triton_red_fused__native_batch_norm_legit_functional_37.run(buf104, buf105, 480, 100352, grid=grid(480), stream=stream0) | |
buf106 = buf98; del buf98 # reuse | |
buf107 = buf106; del buf106 # reuse | |
buf111 = empty_strided((120, ), (1, ), device='cuda', dtype=torch.float32) | |
triton_per_fused__native_batch_norm_legit_functional_38.run(buf107, buf105, primals_328, buf111, 120, 4, grid=grid(120), stream=stream0) | |
del primals_328 | |
buf108 = buf105; del buf105 # reuse | |
triton_red_fused__native_batch_norm_legit_functional_39.run(buf104, buf107, buf108, 480, 100352, grid=grid(480), stream=stream0) | |
buf109 = empty_strided((1, 120, 1, 1), (120, 1, 120, 120), device='cuda', dtype=torch.float32) | |
buf110 = empty_strided((120, ), (1, ), device='cuda', dtype=torch.float32) | |
buf112 = empty_strided((120, ), (1, ), device='cuda', dtype=torch.float32) | |
triton_per_fused__native_batch_norm_legit_functional_40.run(buf108, primals_329, buf109, buf110, buf112, 120, 4, grid=grid(120), stream=stream0) | |
del primals_329 | |
buf113 = empty_strided((128, 120, 56, 56), (376320, 3136, 56, 1), device='cuda', dtype=torch.float16) | |
buf1153 = empty_strided((128, 120, 56, 56), (376320, 3136, 56, 1), device='cuda', dtype=torch.bool) | |
triton_poi_fused__native_batch_norm_legit_functional_relu_threshold_backward_43.run(buf104, buf107, buf109, primals_15, primals_16, buf113, buf1153, 48168960, grid=grid(48168960), stream=stream0) | |
del buf109 | |
del primals_16 | |
buf114 = empty_strided((20, 60, 1, 1), (60, 1, 1, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__to_copy_44.run(primals_130, buf114, 1200, grid=grid(1200), stream=stream0) | |
del primals_130 | |
buf115 = extern_kernels.convolution(as_strided(buf113, (128, 60, 56, 56), (376320, 3136, 56, 1)), buf114, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None) | |
assert_size_stride(buf115, (128, 20, 56, 56), (62720, 3136, 56, 1)) | |
buf116 = empty_strided((20, 60, 1, 1), (60, 1, 1, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__to_copy_44.run(primals_131, buf116, 1200, grid=grid(1200), stream=stream0) | |
del primals_131 | |
buf117 = extern_kernels.convolution(as_strided(buf113, (128, 60, 56, 56), (376320, 3136, 56, 1), 188160), buf116, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None) | |
assert_size_stride(buf117, (128, 20, 56, 56), (62720, 3136, 56, 1)) | |
buf120 = empty_strided((128, 40, 56, 56), (125440, 3136, 56, 1), device='cuda', dtype=torch.float16) | |
buf118 = as_strided(buf120, (128, 20, 56, 56), (125440, 3136, 56, 1)) # alias | |
triton_poi_fused_cat_29.run(buf115, buf118, 8028160, grid=grid(8028160), stream=stream0) | |
del buf115 | |
buf119 = as_strided(buf120, (128, 20, 56, 56), (125440, 3136, 56, 1), 62720) # alias | |
triton_poi_fused_cat_29.run(buf117, buf119, 8028160, grid=grid(8028160), stream=stream0) | |
del buf117 | |
buf121 = buf81; del buf81 # reuse | |
triton_red_fused__native_batch_norm_legit_functional_30.run(buf120, buf121, 520, 30878, grid=grid(520), stream=stream0) | |
buf122 = buf82; del buf82 # reuse | |
buf123 = buf122; del buf122 # reuse | |
buf127 = empty_strided((40, ), (1, ), device='cuda', dtype=torch.float32) | |
triton_per_fused__native_batch_norm_legit_functional_31.run(buf123, buf121, primals_331, buf127, 40, 13, grid=grid(40), stream=stream0) | |
del primals_331 | |
buf124 = buf121; del buf121 # reuse | |
triton_red_fused__native_batch_norm_legit_functional_32.run(buf120, buf123, buf124, 520, 30878, grid=grid(520), stream=stream0) | |
buf125 = empty_strided((1, 40, 1, 1), (40, 1, 40, 40), device='cuda', dtype=torch.float32) | |
buf126 = empty_strided((40, ), (1, ), device='cuda', dtype=torch.float32) | |
buf128 = empty_strided((40, ), (1, ), device='cuda', dtype=torch.float32) | |
triton_per_fused__native_batch_norm_legit_functional_33.run(buf124, primals_332, buf125, buf126, buf128, 40, 13, grid=grid(40), stream=stream0) | |
del buf124 | |
del primals_332 | |
buf129 = empty_strided((128, 40, 56, 56), (125440, 3136, 56, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__native_batch_norm_legit_functional_add_45.run(buf120, buf123, buf125, primals_17, primals_18, buf86, buf129, 16056320, grid=grid(16056320), stream=stream0) | |
del buf125 | |
del primals_18 | |
buf130 = empty_strided((240, 40, 1, 1), (40, 1, 1, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__to_copy_46.run(primals_132, buf130, 9600, grid=grid(9600), stream=stream0) | |
del primals_132 | |
buf131 = extern_kernels.convolution(buf129, buf130, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None) | |
assert_size_stride(buf131, (128, 240, 56, 56), (752640, 3136, 56, 1)) | |
buf132 = empty_strided((1, 240, 1, 1), (240, 1, 240, 240), device='cuda', dtype=torch.float32) | |
buf133 = buf132; del buf132 # reuse | |
buf136 = empty_strided((240, ), (1, ), device='cuda', dtype=torch.float32) | |
buf134 = empty_strided((1, 240, 1, 1), (240, 1, 240, 240), device='cuda', dtype=torch.float32) | |
buf135 = empty_strided((240, ), (1, ), device='cuda', dtype=torch.float32) | |
buf137 = empty_strided((240, ), (1, ), device='cuda', dtype=torch.float32) | |
triton_red_fused__native_batch_norm_legit_functional_47.run(buf133, buf131, primals_334, primals_335, buf136, buf134, buf135, buf137, 240, 401408, grid=grid(240), stream=stream0) | |
del primals_334 | |
del primals_335 | |
buf138 = empty_strided((128, 240, 56, 56), (752640, 3136, 56, 1), device='cuda', dtype=torch.float16) | |
buf1152 = empty_strided((128, 240, 56, 56), (752640, 3136, 56, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__native_batch_norm_legit_functional_add_clone_fill_mul_sigmoid_sub_48.run(buf131, buf133, buf134, primals_19, primals_20, buf138, buf1152, 96337920, grid=grid(96337920), stream=stream0) | |
del primals_20 | |
buf139 = empty_strided((60, 1, 3, 3), (9, 9, 3, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__to_copy_49.run(primals_133, buf139, 540, grid=grid(540), stream=stream0) | |
del primals_133 | |
buf140 = buf90; del buf90 # reuse | |
triton_poi_fused_split_with_sizes_50.run(buf138, buf140, 24084480, grid=grid(24084480), stream=stream0) | |
buf141 = extern_kernels.convolution(buf140, buf139, stride=(2, 2), padding=(1, 1), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=60, bias=None) | |
assert_size_stride(buf141, (128, 60, 28, 28), (47040, 784, 28, 1)) | |
buf142 = empty_strided((60, 1, 5, 5), (25, 25, 5, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__to_copy_51.run(primals_134, buf142, 1500, grid=grid(1500), stream=stream0) | |
del primals_134 | |
buf143 = buf88; del buf88 # reuse | |
triton_poi_fused_split_with_sizes_52.run(buf138, buf143, 24084480, grid=grid(24084480), stream=stream0) | |
buf144 = extern_kernels.convolution(buf143, buf142, stride=(2, 2), padding=(2, 2), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=60, bias=None) | |
assert_size_stride(buf144, (128, 60, 28, 28), (47040, 784, 28, 1)) | |
buf145 = empty_strided((60, 1, 7, 7), (49, 49, 7, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__to_copy_53.run(primals_135, buf145, 2940, grid=grid(2940), stream=stream0) | |
del primals_135 | |
buf146 = empty_strided((128, 60, 56, 56), (188160, 3136, 56, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused_split_with_sizes_54.run(buf138, buf146, 24084480, grid=grid(24084480), stream=stream0) | |
buf147 = extern_kernels.convolution(buf146, buf145, stride=(2, 2), padding=(3, 3), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=60, bias=None) | |
assert_size_stride(buf147, (128, 60, 28, 28), (47040, 784, 28, 1)) | |
buf148 = empty_strided((60, 1, 9, 9), (81, 81, 9, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__to_copy_55.run(primals_136, buf148, 4860, grid=grid(4860), stream=stream0) | |
del primals_136 | |
buf149 = empty_strided((128, 60, 56, 56), (188160, 3136, 56, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused_split_with_sizes_56.run(buf138, buf149, 24084480, grid=grid(24084480), stream=stream0) | |
del buf138 | |
buf150 = extern_kernels.convolution(buf149, buf148, stride=(2, 2), padding=(4, 4), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=60, bias=None) | |
assert_size_stride(buf150, (128, 60, 28, 28), (47040, 784, 28, 1)) | |
buf155 = empty_strided((128, 240, 28, 28), (188160, 784, 28, 1), device='cuda', dtype=torch.float16) | |
buf151 = as_strided(buf155, (128, 60, 28, 28), (188160, 784, 28, 1)) # alias | |
triton_poi_fused_cat_57.run(buf141, buf151, 6021120, grid=grid(6021120), stream=stream0) | |
buf152 = as_strided(buf155, (128, 60, 28, 28), (188160, 784, 28, 1), 47040) # alias | |
triton_poi_fused_cat_57.run(buf144, buf152, 6021120, grid=grid(6021120), stream=stream0) | |
buf153 = as_strided(buf155, (128, 60, 28, 28), (188160, 784, 28, 1), 94080) # alias | |
triton_poi_fused_cat_57.run(buf147, buf153, 6021120, grid=grid(6021120), stream=stream0) | |
buf154 = as_strided(buf155, (128, 60, 28, 28), (188160, 784, 28, 1), 141120) # alias | |
triton_poi_fused_cat_57.run(buf150, buf154, 6021120, grid=grid(6021120), stream=stream0) | |
buf156 = buf134; del buf134 # reuse | |
buf157 = buf156; del buf156 # reuse | |
buf160 = empty_strided((240, ), (1, ), device='cuda', dtype=torch.float32) | |
buf158 = empty_strided((1, 240, 1, 1), (240, 1, 240, 240), device='cuda', dtype=torch.float32) | |
buf159 = empty_strided((240, ), (1, ), device='cuda', dtype=torch.float32) | |
buf161 = empty_strided((240, ), (1, ), device='cuda', dtype=torch.float32) | |
triton_red_fused__native_batch_norm_legit_functional_58.run(buf157, buf155, primals_337, primals_338, buf160, buf158, buf159, buf161, 240, 100352, grid=grid(240), stream=stream0) | |
del primals_337 | |
del primals_338 | |
buf162 = empty_strided((128, 240, 28, 28), (188160, 784, 28, 1), device='cuda', dtype=torch.float16) | |
buf164 = empty_strided((128, 240, 1, 1), (240, 1, 1, 1), device='cuda', dtype=torch.float16) | |
triton_per_fused__native_batch_norm_legit_functional_mean_silu_59.run(buf155, buf157, buf158, primals_21, primals_22, buf162, buf164, 30720, 784, grid=grid(30720), stream=stream0) | |
del buf158 | |
del primals_22 | |
buf165 = empty_strided((20, 240, 1, 1), (240, 1, 1, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__to_copy_60.run(primals_137, buf165, 4800, grid=grid(4800), stream=stream0) | |
del primals_137 | |
buf166 = empty_strided((20, ), (1, ), device='cuda', dtype=torch.float16) | |
triton_poi_fused__to_copy_convolution_61.run(primals_138, buf166, 20, grid=grid(20), stream=stream0) | |
del primals_138 | |
buf167 = extern_kernels.convolution(buf164, buf165, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None) | |
assert_size_stride(buf167, (128, 20, 1, 1), (20, 1, 1, 1)) | |
buf168 = buf167; del buf167 # reuse | |
buf169 = empty_strided((128, 20, 1, 1), (20, 1, 1, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__to_copy_convolution_silu_62.run(buf168, buf166, buf169, 2560, grid=grid(2560), stream=stream0) | |
del buf166 | |
buf170 = empty_strided((240, 20, 1, 1), (20, 1, 1, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__to_copy_63.run(primals_139, buf170, 4800, grid=grid(4800), stream=stream0) | |
del primals_139 | |
buf171 = empty_strided((240, ), (1, ), device='cuda', dtype=torch.float16) | |
triton_poi_fused__to_copy_convolution_64.run(primals_140, buf171, 240, grid=grid(240), stream=stream0) | |
del primals_140 | |
buf172 = extern_kernels.convolution(buf169, buf170, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None) | |
assert_size_stride(buf172, (128, 240, 1, 1), (240, 1, 1, 1)) | |
buf173 = buf172; del buf172 # reuse | |
triton_poi_fused__to_copy_convolution_65.run(buf173, buf171, 30720, grid=grid(30720), stream=stream0) | |
del buf171 | |
buf174 = empty_strided((128, 240, 28, 28), (188160, 784, 28, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused_mul_sigmoid_silu_66.run(buf162, buf173, buf174, 24084480, grid=grid(24084480), stream=stream0) | |
buf175 = empty_strided((56, 240, 1, 1), (240, 1, 1, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__to_copy_67.run(primals_141, buf175, 13440, grid=grid(13440), stream=stream0) | |
del primals_141 | |
buf176 = extern_kernels.convolution(buf174, buf175, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None) | |
assert_size_stride(buf176, (128, 56, 28, 28), (43904, 784, 28, 1)) | |
buf177 = empty_strided((1, 56, 1, 1, 13), (728, 1, 728, 728, 56), device='cuda', dtype=torch.float32) | |
triton_red_fused__native_batch_norm_legit_functional_68.run(buf176, buf177, 728, 7720, grid=grid(728), stream=stream0) | |
buf178 = empty_strided((1, 56, 1, 1), (56, 1, 56, 56), device='cuda', dtype=torch.float32) | |
buf179 = buf178; del buf178 # reuse | |
buf183 = empty_strided((56, ), (1, ), device='cuda', dtype=torch.float32) | |
triton_per_fused__native_batch_norm_legit_functional_69.run(buf179, buf177, primals_340, buf183, 56, 13, grid=grid(56), stream=stream0) | |
del primals_340 | |
buf180 = buf177; del buf177 # reuse | |
triton_red_fused__native_batch_norm_legit_functional_70.run(buf176, buf179, buf180, 728, 7720, grid=grid(728), stream=stream0) | |
buf181 = empty_strided((1, 56, 1, 1), (56, 1, 56, 56), device='cuda', dtype=torch.float32) | |
buf182 = empty_strided((56, ), (1, ), device='cuda', dtype=torch.float32) | |
buf184 = empty_strided((56, ), (1, ), device='cuda', dtype=torch.float32) | |
triton_per_fused__native_batch_norm_legit_functional_71.run(buf180, primals_341, buf181, buf182, buf184, 56, 13, grid=grid(56), stream=stream0) | |
del primals_341 | |
buf185 = empty_strided((128, 56, 28, 28), (43904, 784, 28, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__native_batch_norm_legit_functional_72.run(buf176, buf179, buf181, primals_23, primals_24, buf185, 5619712, grid=grid(5619712), stream=stream0) | |
del primals_24 | |
buf186 = empty_strided((168, 28, 1, 1), (28, 1, 1, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__to_copy_73.run(primals_142, buf186, 4704, grid=grid(4704), stream=stream0) | |
del primals_142 | |
buf187 = extern_kernels.convolution(as_strided(buf185, (128, 28, 28, 28), (43904, 784, 28, 1)), buf186, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None) | |
assert_size_stride(buf187, (128, 168, 28, 28), (131712, 784, 28, 1)) | |
buf188 = empty_strided((168, 28, 1, 1), (28, 1, 1, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__to_copy_73.run(primals_143, buf188, 4704, grid=grid(4704), stream=stream0) | |
del primals_143 | |
buf189 = extern_kernels.convolution(as_strided(buf185, (128, 28, 28, 28), (43904, 784, 28, 1), 21952), buf188, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None) | |
assert_size_stride(buf189, (128, 168, 28, 28), (131712, 784, 28, 1)) | |
buf192 = empty_strided((128, 336, 28, 28), (263424, 784, 28, 1), device='cuda', dtype=torch.float16) | |
buf190 = as_strided(buf192, (128, 168, 28, 28), (263424, 784, 28, 1)) # alias | |
triton_poi_fused_cat_74.run(buf187, buf190, 16859136, grid=grid(16859136), stream=stream0) | |
buf191 = as_strided(buf192, (128, 168, 28, 28), (263424, 784, 28, 1), 131712) # alias | |
triton_poi_fused_cat_74.run(buf189, buf191, 16859136, grid=grid(16859136), stream=stream0) | |
buf193 = empty_strided((1, 336, 1, 1), (336, 1, 336, 336), device='cuda', dtype=torch.float32) | |
buf194 = buf193; del buf193 # reuse | |
buf197 = empty_strided((336, ), (1, ), device='cuda', dtype=torch.float32) | |
buf195 = empty_strided((1, 336, 1, 1), (336, 1, 336, 336), device='cuda', dtype=torch.float32) | |
buf196 = empty_strided((336, ), (1, ), device='cuda', dtype=torch.float32) | |
buf198 = empty_strided((336, ), (1, ), device='cuda', dtype=torch.float32) | |
triton_red_fused__native_batch_norm_legit_functional_75.run(buf194, buf192, primals_343, primals_344, buf197, buf195, buf196, buf198, 336, 100352, grid=grid(336), stream=stream0) | |
del primals_343 | |
del primals_344 | |
buf199 = empty_strided((128, 336, 28, 28), (263424, 784, 28, 1), device='cuda', dtype=torch.float16) | |
buf1150 = empty_strided((128, 336, 28, 28), (263424, 784, 28, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__native_batch_norm_legit_functional_add_clone_fill_mul_sigmoid_sub_76.run(buf192, buf194, buf195, primals_25, primals_26, buf199, buf1150, 33718272, grid=grid(33718272), stream=stream0) | |
del primals_26 | |
buf200 = empty_strided((168, 1, 3, 3), (9, 9, 3, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__to_copy_77.run(primals_144, buf200, 1512, grid=grid(1512), stream=stream0) | |
del primals_144 | |
buf201 = buf189; del buf189 # reuse | |
triton_poi_fused_split_with_sizes_78.run(buf199, buf201, 16859136, grid=grid(16859136), stream=stream0) | |
buf202 = extern_kernels.convolution(buf201, buf200, stride=(1, 1), padding=(1, 1), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=168, bias=None) | |
assert_size_stride(buf202, (128, 168, 28, 28), (131712, 784, 28, 1)) | |
buf203 = empty_strided((168, 1, 5, 5), (25, 25, 5, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__to_copy_79.run(primals_145, buf203, 4200, grid=grid(4200), stream=stream0) | |
del primals_145 | |
buf204 = buf187; del buf187 # reuse | |
triton_poi_fused_split_with_sizes_80.run(buf199, buf204, 16859136, grid=grid(16859136), stream=stream0) | |
buf205 = extern_kernels.convolution(buf204, buf203, stride=(1, 1), padding=(2, 2), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=168, bias=None) | |
assert_size_stride(buf205, (128, 168, 28, 28), (131712, 784, 28, 1)) | |
buf208 = buf199; del buf199 # reuse | |
buf206 = as_strided(buf208, (128, 168, 28, 28), (263424, 784, 28, 1)) # alias | |
triton_poi_fused_cat_74.run(buf202, buf206, 16859136, grid=grid(16859136), stream=stream0) | |
buf207 = as_strided(buf208, (128, 168, 28, 28), (263424, 784, 28, 1), 131712) # alias | |
triton_poi_fused_cat_74.run(buf205, buf207, 16859136, grid=grid(16859136), stream=stream0) | |
buf209 = buf195; del buf195 # reuse | |
buf210 = buf209; del buf209 # reuse | |
buf213 = empty_strided((336, ), (1, ), device='cuda', dtype=torch.float32) | |
buf211 = empty_strided((1, 336, 1, 1), (336, 1, 336, 336), device='cuda', dtype=torch.float32) | |
buf212 = empty_strided((336, ), (1, ), device='cuda', dtype=torch.float32) | |
buf214 = empty_strided((336, ), (1, ), device='cuda', dtype=torch.float32) | |
triton_red_fused__native_batch_norm_legit_functional_75.run(buf210, buf208, primals_346, primals_347, buf213, buf211, buf212, buf214, 336, 100352, grid=grid(336), stream=stream0) | |
del primals_346 | |
del primals_347 | |
buf215 = empty_strided((128, 336, 28, 28), (263424, 784, 28, 1), device='cuda', dtype=torch.float16) | |
buf217 = empty_strided((128, 336, 1, 1), (336, 1, 1, 1), device='cuda', dtype=torch.float16) | |
triton_per_fused__native_batch_norm_legit_functional_mean_silu_81.run(buf208, buf210, buf211, primals_27, primals_28, buf215, buf217, 43008, 784, grid=grid(43008), stream=stream0) | |
del primals_28 | |
buf218 = empty_strided((28, 336, 1, 1), (336, 1, 1, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__to_copy_82.run(primals_146, buf218, 9408, grid=grid(9408), stream=stream0) | |
del primals_146 | |
buf219 = empty_strided((28, ), (1, ), device='cuda', dtype=torch.float16) | |
triton_poi_fused__to_copy_convolution_83.run(primals_147, buf219, 28, grid=grid(28), stream=stream0) | |
del primals_147 | |
buf220 = extern_kernels.convolution(buf217, buf218, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None) | |
assert_size_stride(buf220, (128, 28, 1, 1), (28, 1, 1, 1)) | |
buf221 = buf220; del buf220 # reuse | |
buf222 = empty_strided((128, 28, 1, 1), (28, 1, 1, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__to_copy_convolution_silu_84.run(buf221, buf219, buf222, 3584, grid=grid(3584), stream=stream0) | |
buf223 = empty_strided((336, 28, 1, 1), (28, 1, 1, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__to_copy_85.run(primals_148, buf223, 9408, grid=grid(9408), stream=stream0) | |
del primals_148 | |
buf224 = empty_strided((336, ), (1, ), device='cuda', dtype=torch.float16) | |
triton_poi_fused__to_copy_convolution_86.run(primals_149, buf224, 336, grid=grid(336), stream=stream0) | |
del primals_149 | |
buf225 = extern_kernels.convolution(buf222, buf223, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None) | |
assert_size_stride(buf225, (128, 336, 1, 1), (336, 1, 1, 1)) | |
buf226 = buf225; del buf225 # reuse | |
triton_poi_fused__to_copy_convolution_87.run(buf226, buf224, 43008, grid=grid(43008), stream=stream0) | |
buf227 = empty_strided((128, 336, 28, 28), (263424, 784, 28, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused_mul_sigmoid_silu_88.run(buf215, buf226, buf227, 33718272, grid=grid(33718272), stream=stream0) | |
buf228 = buf205; del buf205 # reuse | |
triton_poi_fused_split_with_sizes_89.run(buf227, buf228, 16859136, grid=grid(16859136), stream=stream0) | |
buf229 = buf202; del buf202 # reuse | |
triton_poi_fused_split_with_sizes_90.run(buf227, buf229, 16859136, grid=grid(16859136), stream=stream0) | |
buf230 = empty_strided((28, 168, 1, 1), (168, 1, 1, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__to_copy_91.run(primals_150, buf230, 4704, grid=grid(4704), stream=stream0) | |
del primals_150 | |
buf231 = extern_kernels.convolution(buf228, buf230, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None) | |
assert_size_stride(buf231, (128, 28, 28, 28), (21952, 784, 28, 1)) | |
buf232 = empty_strided((28, 168, 1, 1), (168, 1, 1, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__to_copy_91.run(primals_151, buf232, 4704, grid=grid(4704), stream=stream0) | |
del primals_151 | |
buf233 = extern_kernels.convolution(buf229, buf232, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None) | |
assert_size_stride(buf233, (128, 28, 28, 28), (21952, 784, 28, 1)) | |
buf236 = empty_strided((128, 56, 28, 28), (43904, 784, 28, 1), device='cuda', dtype=torch.float16) | |
buf234 = as_strided(buf236, (128, 28, 28, 28), (43904, 784, 28, 1)) # alias | |
triton_poi_fused_cat_92.run(buf231, buf234, 2809856, grid=grid(2809856), stream=stream0) | |
del buf231 | |
buf235 = as_strided(buf236, (128, 28, 28, 28), (43904, 784, 28, 1), 21952) # alias | |
triton_poi_fused_cat_92.run(buf233, buf235, 2809856, grid=grid(2809856), stream=stream0) | |
del buf233 | |
buf237 = buf180; del buf180 # reuse | |
triton_red_fused__native_batch_norm_legit_functional_68.run(buf236, buf237, 728, 7720, grid=grid(728), stream=stream0) | |
buf238 = buf181; del buf181 # reuse | |
buf239 = buf238; del buf238 # reuse | |
buf243 = empty_strided((56, ), (1, ), device='cuda', dtype=torch.float32) | |
triton_per_fused__native_batch_norm_legit_functional_69.run(buf239, buf237, primals_349, buf243, 56, 13, grid=grid(56), stream=stream0) | |
del primals_349 | |
buf240 = buf237; del buf237 # reuse | |
triton_red_fused__native_batch_norm_legit_functional_70.run(buf236, buf239, buf240, 728, 7720, grid=grid(728), stream=stream0) | |
buf241 = empty_strided((1, 56, 1, 1), (56, 1, 56, 56), device='cuda', dtype=torch.float32) | |
buf242 = empty_strided((56, ), (1, ), device='cuda', dtype=torch.float32) | |
buf244 = empty_strided((56, ), (1, ), device='cuda', dtype=torch.float32) | |
triton_per_fused__native_batch_norm_legit_functional_71.run(buf240, primals_350, buf241, buf242, buf244, 56, 13, grid=grid(56), stream=stream0) | |
del primals_350 | |
buf245 = empty_strided((128, 56, 28, 28), (43904, 784, 28, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__native_batch_norm_legit_functional_add_93.run(buf236, buf239, buf241, primals_29, primals_30, buf185, buf245, 5619712, grid=grid(5619712), stream=stream0) | |
del primals_30 | |
buf246 = empty_strided((168, 28, 1, 1), (28, 1, 1, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__to_copy_73.run(primals_152, buf246, 4704, grid=grid(4704), stream=stream0) | |
del primals_152 | |
buf247 = extern_kernels.convolution(as_strided(buf245, (128, 28, 28, 28), (43904, 784, 28, 1)), buf246, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None) | |
assert_size_stride(buf247, (128, 168, 28, 28), (131712, 784, 28, 1)) | |
buf248 = empty_strided((168, 28, 1, 1), (28, 1, 1, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__to_copy_73.run(primals_153, buf248, 4704, grid=grid(4704), stream=stream0) | |
del primals_153 | |
buf249 = extern_kernels.convolution(as_strided(buf245, (128, 28, 28, 28), (43904, 784, 28, 1), 21952), buf248, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None) | |
assert_size_stride(buf249, (128, 168, 28, 28), (131712, 784, 28, 1)) | |
buf252 = buf227; del buf227 # reuse | |
buf250 = as_strided(buf252, (128, 168, 28, 28), (263424, 784, 28, 1)) # alias | |
triton_poi_fused_cat_74.run(buf247, buf250, 16859136, grid=grid(16859136), stream=stream0) | |
buf251 = as_strided(buf252, (128, 168, 28, 28), (263424, 784, 28, 1), 131712) # alias | |
triton_poi_fused_cat_74.run(buf249, buf251, 16859136, grid=grid(16859136), stream=stream0) | |
buf253 = buf211; del buf211 # reuse | |
buf254 = buf253; del buf253 # reuse | |
buf257 = empty_strided((336, ), (1, ), device='cuda', dtype=torch.float32) | |
buf255 = empty_strided((1, 336, 1, 1), (336, 1, 336, 336), device='cuda', dtype=torch.float32) | |
buf256 = empty_strided((336, ), (1, ), device='cuda', dtype=torch.float32) | |
buf258 = empty_strided((336, ), (1, ), device='cuda', dtype=torch.float32) | |
triton_red_fused__native_batch_norm_legit_functional_75.run(buf254, buf252, primals_352, primals_353, buf257, buf255, buf256, buf258, 336, 100352, grid=grid(336), stream=stream0) | |
del primals_352 | |
del primals_353 | |
buf259 = empty_strided((128, 336, 28, 28), (263424, 784, 28, 1), device='cuda', dtype=torch.float16) | |
buf1148 = empty_strided((128, 336, 28, 28), (263424, 784, 28, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__native_batch_norm_legit_functional_add_clone_fill_mul_sigmoid_sub_76.run(buf252, buf254, buf255, primals_31, primals_32, buf259, buf1148, 33718272, grid=grid(33718272), stream=stream0) | |
del primals_32 | |
buf260 = empty_strided((168, 1, 3, 3), (9, 9, 3, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__to_copy_77.run(primals_154, buf260, 1512, grid=grid(1512), stream=stream0) | |
del primals_154 | |
buf261 = buf249; del buf249 # reuse | |
triton_poi_fused_split_with_sizes_78.run(buf259, buf261, 16859136, grid=grid(16859136), stream=stream0) | |
buf262 = extern_kernels.convolution(buf261, buf260, stride=(1, 1), padding=(1, 1), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=168, bias=None) | |
assert_size_stride(buf262, (128, 168, 28, 28), (131712, 784, 28, 1)) | |
buf263 = empty_strided((168, 1, 5, 5), (25, 25, 5, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__to_copy_79.run(primals_155, buf263, 4200, grid=grid(4200), stream=stream0) | |
del primals_155 | |
buf264 = buf247; del buf247 # reuse | |
triton_poi_fused_split_with_sizes_80.run(buf259, buf264, 16859136, grid=grid(16859136), stream=stream0) | |
buf265 = extern_kernels.convolution(buf264, buf263, stride=(1, 1), padding=(2, 2), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=168, bias=None) | |
assert_size_stride(buf265, (128, 168, 28, 28), (131712, 784, 28, 1)) | |
buf268 = buf259; del buf259 # reuse | |
buf266 = as_strided(buf268, (128, 168, 28, 28), (263424, 784, 28, 1)) # alias | |
triton_poi_fused_cat_74.run(buf262, buf266, 16859136, grid=grid(16859136), stream=stream0) | |
buf267 = as_strided(buf268, (128, 168, 28, 28), (263424, 784, 28, 1), 131712) # alias | |
triton_poi_fused_cat_74.run(buf265, buf267, 16859136, grid=grid(16859136), stream=stream0) | |
buf269 = buf255; del buf255 # reuse | |
buf270 = buf269; del buf269 # reuse | |
buf273 = empty_strided((336, ), (1, ), device='cuda', dtype=torch.float32) | |
buf271 = empty_strided((1, 336, 1, 1), (336, 1, 336, 336), device='cuda', dtype=torch.float32) | |
buf272 = empty_strided((336, ), (1, ), device='cuda', dtype=torch.float32) | |
buf274 = empty_strided((336, ), (1, ), device='cuda', dtype=torch.float32) | |
triton_red_fused__native_batch_norm_legit_functional_75.run(buf270, buf268, primals_355, primals_356, buf273, buf271, buf272, buf274, 336, 100352, grid=grid(336), stream=stream0) | |
del primals_355 | |
del primals_356 | |
buf275 = empty_strided((128, 336, 28, 28), (263424, 784, 28, 1), device='cuda', dtype=torch.float16) | |
buf277 = empty_strided((128, 336, 1, 1), (336, 1, 1, 1), device='cuda', dtype=torch.float16) | |
triton_per_fused__native_batch_norm_legit_functional_mean_silu_81.run(buf268, buf270, buf271, primals_33, primals_34, buf275, buf277, 43008, 784, grid=grid(43008), stream=stream0) | |
del primals_34 | |
buf278 = empty_strided((28, 336, 1, 1), (336, 1, 1, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__to_copy_82.run(primals_156, buf278, 9408, grid=grid(9408), stream=stream0) | |
del primals_156 | |
buf279 = buf219; del buf219 # reuse | |
triton_poi_fused__to_copy_convolution_83.run(primals_157, buf279, 28, grid=grid(28), stream=stream0) | |
del primals_157 | |
buf280 = extern_kernels.convolution(buf277, buf278, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None) | |
assert_size_stride(buf280, (128, 28, 1, 1), (28, 1, 1, 1)) | |
buf281 = buf280; del buf280 # reuse | |
buf282 = empty_strided((128, 28, 1, 1), (28, 1, 1, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__to_copy_convolution_silu_84.run(buf281, buf279, buf282, 3584, grid=grid(3584), stream=stream0) | |
buf283 = empty_strided((336, 28, 1, 1), (28, 1, 1, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__to_copy_85.run(primals_158, buf283, 9408, grid=grid(9408), stream=stream0) | |
del primals_158 | |
buf284 = buf224; del buf224 # reuse | |
triton_poi_fused__to_copy_convolution_86.run(primals_159, buf284, 336, grid=grid(336), stream=stream0) | |
del primals_159 | |
buf285 = extern_kernels.convolution(buf282, buf283, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None) | |
assert_size_stride(buf285, (128, 336, 1, 1), (336, 1, 1, 1)) | |
buf286 = buf285; del buf285 # reuse | |
triton_poi_fused__to_copy_convolution_87.run(buf286, buf284, 43008, grid=grid(43008), stream=stream0) | |
buf287 = empty_strided((128, 336, 28, 28), (263424, 784, 28, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused_mul_sigmoid_silu_88.run(buf275, buf286, buf287, 33718272, grid=grid(33718272), stream=stream0) | |
buf288 = buf265; del buf265 # reuse | |
triton_poi_fused_split_with_sizes_89.run(buf287, buf288, 16859136, grid=grid(16859136), stream=stream0) | |
buf289 = buf262; del buf262 # reuse | |
triton_poi_fused_split_with_sizes_90.run(buf287, buf289, 16859136, grid=grid(16859136), stream=stream0) | |
buf290 = empty_strided((28, 168, 1, 1), (168, 1, 1, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__to_copy_91.run(primals_160, buf290, 4704, grid=grid(4704), stream=stream0) | |
del primals_160 | |
buf291 = extern_kernels.convolution(buf288, buf290, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None) | |
assert_size_stride(buf291, (128, 28, 28, 28), (21952, 784, 28, 1)) | |
buf292 = empty_strided((28, 168, 1, 1), (168, 1, 1, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__to_copy_91.run(primals_161, buf292, 4704, grid=grid(4704), stream=stream0) | |
del primals_161 | |
buf293 = extern_kernels.convolution(buf289, buf292, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None) | |
assert_size_stride(buf293, (128, 28, 28, 28), (21952, 784, 28, 1)) | |
buf296 = empty_strided((128, 56, 28, 28), (43904, 784, 28, 1), device='cuda', dtype=torch.float16) | |
buf294 = as_strided(buf296, (128, 28, 28, 28), (43904, 784, 28, 1)) # alias | |
triton_poi_fused_cat_92.run(buf291, buf294, 2809856, grid=grid(2809856), stream=stream0) | |
del buf291 | |
buf295 = as_strided(buf296, (128, 28, 28, 28), (43904, 784, 28, 1), 21952) # alias | |
triton_poi_fused_cat_92.run(buf293, buf295, 2809856, grid=grid(2809856), stream=stream0) | |
del buf293 | |
buf297 = buf240; del buf240 # reuse | |
triton_red_fused__native_batch_norm_legit_functional_68.run(buf296, buf297, 728, 7720, grid=grid(728), stream=stream0) | |
buf298 = buf241; del buf241 # reuse | |
buf299 = buf298; del buf298 # reuse | |
buf303 = empty_strided((56, ), (1, ), device='cuda', dtype=torch.float32) | |
triton_per_fused__native_batch_norm_legit_functional_69.run(buf299, buf297, primals_358, buf303, 56, 13, grid=grid(56), stream=stream0) | |
del primals_358 | |
buf300 = buf297; del buf297 # reuse | |
triton_red_fused__native_batch_norm_legit_functional_70.run(buf296, buf299, buf300, 728, 7720, grid=grid(728), stream=stream0) | |
buf301 = empty_strided((1, 56, 1, 1), (56, 1, 56, 56), device='cuda', dtype=torch.float32) | |
buf302 = empty_strided((56, ), (1, ), device='cuda', dtype=torch.float32) | |
buf304 = empty_strided((56, ), (1, ), device='cuda', dtype=torch.float32) | |
triton_per_fused__native_batch_norm_legit_functional_71.run(buf300, primals_359, buf301, buf302, buf304, 56, 13, grid=grid(56), stream=stream0) | |
del primals_359 | |
buf305 = empty_strided((128, 56, 28, 28), (43904, 784, 28, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__native_batch_norm_legit_functional_add_93.run(buf296, buf299, buf301, primals_35, primals_36, buf245, buf305, 5619712, grid=grid(5619712), stream=stream0) | |
del primals_36 | |
buf306 = empty_strided((168, 28, 1, 1), (28, 1, 1, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__to_copy_73.run(primals_162, buf306, 4704, grid=grid(4704), stream=stream0) | |
del primals_162 | |
buf307 = extern_kernels.convolution(as_strided(buf305, (128, 28, 28, 28), (43904, 784, 28, 1)), buf306, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None) | |
assert_size_stride(buf307, (128, 168, 28, 28), (131712, 784, 28, 1)) | |
buf308 = empty_strided((168, 28, 1, 1), (28, 1, 1, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__to_copy_73.run(primals_163, buf308, 4704, grid=grid(4704), stream=stream0) | |
del primals_163 | |
buf309 = extern_kernels.convolution(as_strided(buf305, (128, 28, 28, 28), (43904, 784, 28, 1), 21952), buf308, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None) | |
assert_size_stride(buf309, (128, 168, 28, 28), (131712, 784, 28, 1)) | |
buf312 = buf287; del buf287 # reuse | |
buf310 = as_strided(buf312, (128, 168, 28, 28), (263424, 784, 28, 1)) # alias | |
triton_poi_fused_cat_74.run(buf307, buf310, 16859136, grid=grid(16859136), stream=stream0) | |
buf311 = as_strided(buf312, (128, 168, 28, 28), (263424, 784, 28, 1), 131712) # alias | |
triton_poi_fused_cat_74.run(buf309, buf311, 16859136, grid=grid(16859136), stream=stream0) | |
buf313 = buf271; del buf271 # reuse | |
buf314 = buf313; del buf313 # reuse | |
buf317 = empty_strided((336, ), (1, ), device='cuda', dtype=torch.float32) | |
buf315 = empty_strided((1, 336, 1, 1), (336, 1, 336, 336), device='cuda', dtype=torch.float32) | |
buf316 = empty_strided((336, ), (1, ), device='cuda', dtype=torch.float32) | |
buf318 = empty_strided((336, ), (1, ), device='cuda', dtype=torch.float32) | |
triton_red_fused__native_batch_norm_legit_functional_75.run(buf314, buf312, primals_361, primals_362, buf317, buf315, buf316, buf318, 336, 100352, grid=grid(336), stream=stream0) | |
del primals_361 | |
del primals_362 | |
buf319 = empty_strided((128, 336, 28, 28), (263424, 784, 28, 1), device='cuda', dtype=torch.float16) | |
buf1146 = empty_strided((128, 336, 28, 28), (263424, 784, 28, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__native_batch_norm_legit_functional_add_clone_fill_mul_sigmoid_sub_76.run(buf312, buf314, buf315, primals_37, primals_38, buf319, buf1146, 33718272, grid=grid(33718272), stream=stream0) | |
del primals_38 | |
buf320 = empty_strided((168, 1, 3, 3), (9, 9, 3, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__to_copy_77.run(primals_164, buf320, 1512, grid=grid(1512), stream=stream0) | |
del primals_164 | |
buf321 = buf309; del buf309 # reuse | |
triton_poi_fused_split_with_sizes_78.run(buf319, buf321, 16859136, grid=grid(16859136), stream=stream0) | |
buf322 = extern_kernels.convolution(buf321, buf320, stride=(1, 1), padding=(1, 1), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=168, bias=None) | |
assert_size_stride(buf322, (128, 168, 28, 28), (131712, 784, 28, 1)) | |
buf323 = empty_strided((168, 1, 5, 5), (25, 25, 5, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__to_copy_79.run(primals_165, buf323, 4200, grid=grid(4200), stream=stream0) | |
del primals_165 | |
buf324 = buf307; del buf307 # reuse | |
triton_poi_fused_split_with_sizes_80.run(buf319, buf324, 16859136, grid=grid(16859136), stream=stream0) | |
buf325 = extern_kernels.convolution(buf324, buf323, stride=(1, 1), padding=(2, 2), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=168, bias=None) | |
assert_size_stride(buf325, (128, 168, 28, 28), (131712, 784, 28, 1)) | |
buf328 = buf319; del buf319 # reuse | |
buf326 = as_strided(buf328, (128, 168, 28, 28), (263424, 784, 28, 1)) # alias | |
triton_poi_fused_cat_74.run(buf322, buf326, 16859136, grid=grid(16859136), stream=stream0) | |
buf327 = as_strided(buf328, (128, 168, 28, 28), (263424, 784, 28, 1), 131712) # alias | |
triton_poi_fused_cat_74.run(buf325, buf327, 16859136, grid=grid(16859136), stream=stream0) | |
buf329 = buf315; del buf315 # reuse | |
buf330 = buf329; del buf329 # reuse | |
buf333 = empty_strided((336, ), (1, ), device='cuda', dtype=torch.float32) | |
buf331 = empty_strided((1, 336, 1, 1), (336, 1, 336, 336), device='cuda', dtype=torch.float32) | |
buf332 = empty_strided((336, ), (1, ), device='cuda', dtype=torch.float32) | |
buf334 = empty_strided((336, ), (1, ), device='cuda', dtype=torch.float32) | |
triton_red_fused__native_batch_norm_legit_functional_75.run(buf330, buf328, primals_364, primals_365, buf333, buf331, buf332, buf334, 336, 100352, grid=grid(336), stream=stream0) | |
del primals_364 | |
del primals_365 | |
buf335 = empty_strided((128, 336, 28, 28), (263424, 784, 28, 1), device='cuda', dtype=torch.float16) | |
buf337 = empty_strided((128, 336, 1, 1), (336, 1, 1, 1), device='cuda', dtype=torch.float16) | |
triton_per_fused__native_batch_norm_legit_functional_mean_silu_81.run(buf328, buf330, buf331, primals_39, primals_40, buf335, buf337, 43008, 784, grid=grid(43008), stream=stream0) | |
del primals_40 | |
buf338 = empty_strided((28, 336, 1, 1), (336, 1, 1, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__to_copy_82.run(primals_166, buf338, 9408, grid=grid(9408), stream=stream0) | |
del primals_166 | |
buf339 = buf279; del buf279 # reuse | |
triton_poi_fused__to_copy_convolution_83.run(primals_167, buf339, 28, grid=grid(28), stream=stream0) | |
del primals_167 | |
buf340 = extern_kernels.convolution(buf337, buf338, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None) | |
assert_size_stride(buf340, (128, 28, 1, 1), (28, 1, 1, 1)) | |
buf341 = buf340; del buf340 # reuse | |
buf342 = empty_strided((128, 28, 1, 1), (28, 1, 1, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__to_copy_convolution_silu_84.run(buf341, buf339, buf342, 3584, grid=grid(3584), stream=stream0) | |
del buf339 | |
buf343 = empty_strided((336, 28, 1, 1), (28, 1, 1, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__to_copy_85.run(primals_168, buf343, 9408, grid=grid(9408), stream=stream0) | |
del primals_168 | |
buf344 = buf284; del buf284 # reuse | |
triton_poi_fused__to_copy_convolution_86.run(primals_169, buf344, 336, grid=grid(336), stream=stream0) | |
del primals_169 | |
buf345 = extern_kernels.convolution(buf342, buf343, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None) | |
assert_size_stride(buf345, (128, 336, 1, 1), (336, 1, 1, 1)) | |
buf346 = buf345; del buf345 # reuse | |
triton_poi_fused__to_copy_convolution_87.run(buf346, buf344, 43008, grid=grid(43008), stream=stream0) | |
buf347 = empty_strided((128, 336, 28, 28), (263424, 784, 28, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused_mul_sigmoid_silu_88.run(buf335, buf346, buf347, 33718272, grid=grid(33718272), stream=stream0) | |
buf348 = buf325; del buf325 # reuse | |
triton_poi_fused_split_with_sizes_89.run(buf347, buf348, 16859136, grid=grid(16859136), stream=stream0) | |
buf349 = buf322; del buf322 # reuse | |
triton_poi_fused_split_with_sizes_90.run(buf347, buf349, 16859136, grid=grid(16859136), stream=stream0) | |
buf350 = empty_strided((28, 168, 1, 1), (168, 1, 1, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__to_copy_91.run(primals_170, buf350, 4704, grid=grid(4704), stream=stream0) | |
del primals_170 | |
buf351 = extern_kernels.convolution(buf348, buf350, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None) | |
assert_size_stride(buf351, (128, 28, 28, 28), (21952, 784, 28, 1)) | |
buf352 = empty_strided((28, 168, 1, 1), (168, 1, 1, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__to_copy_91.run(primals_171, buf352, 4704, grid=grid(4704), stream=stream0) | |
del primals_171 | |
buf353 = extern_kernels.convolution(buf349, buf352, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None) | |
assert_size_stride(buf353, (128, 28, 28, 28), (21952, 784, 28, 1)) | |
buf356 = empty_strided((128, 56, 28, 28), (43904, 784, 28, 1), device='cuda', dtype=torch.float16) | |
buf354 = as_strided(buf356, (128, 28, 28, 28), (43904, 784, 28, 1)) # alias | |
triton_poi_fused_cat_92.run(buf351, buf354, 2809856, grid=grid(2809856), stream=stream0) | |
del buf351 | |
buf355 = as_strided(buf356, (128, 28, 28, 28), (43904, 784, 28, 1), 21952) # alias | |
triton_poi_fused_cat_92.run(buf353, buf355, 2809856, grid=grid(2809856), stream=stream0) | |
del buf353 | |
buf357 = buf300; del buf300 # reuse | |
triton_red_fused__native_batch_norm_legit_functional_68.run(buf356, buf357, 728, 7720, grid=grid(728), stream=stream0) | |
buf358 = buf301; del buf301 # reuse | |
buf359 = buf358; del buf358 # reuse | |
buf363 = empty_strided((56, ), (1, ), device='cuda', dtype=torch.float32) | |
triton_per_fused__native_batch_norm_legit_functional_69.run(buf359, buf357, primals_367, buf363, 56, 13, grid=grid(56), stream=stream0) | |
del primals_367 | |
buf360 = buf357; del buf357 # reuse | |
triton_red_fused__native_batch_norm_legit_functional_70.run(buf356, buf359, buf360, 728, 7720, grid=grid(728), stream=stream0) | |
buf361 = empty_strided((1, 56, 1, 1), (56, 1, 56, 56), device='cuda', dtype=torch.float32) | |
buf362 = empty_strided((56, ), (1, ), device='cuda', dtype=torch.float32) | |
buf364 = empty_strided((56, ), (1, ), device='cuda', dtype=torch.float32) | |
triton_per_fused__native_batch_norm_legit_functional_71.run(buf360, primals_368, buf361, buf362, buf364, 56, 13, grid=grid(56), stream=stream0) | |
del buf360 | |
del primals_368 | |
buf365 = empty_strided((128, 56, 28, 28), (43904, 784, 28, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__native_batch_norm_legit_functional_add_93.run(buf356, buf359, buf361, primals_41, primals_42, buf305, buf365, 5619712, grid=grid(5619712), stream=stream0) | |
del buf361 | |
del primals_42 | |
buf366 = empty_strided((336, 56, 1, 1), (56, 1, 1, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__to_copy_94.run(primals_172, buf366, 18816, grid=grid(18816), stream=stream0) | |
del primals_172 | |
buf367 = extern_kernels.convolution(buf365, buf366, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None) | |
assert_size_stride(buf367, (128, 336, 28, 28), (263424, 784, 28, 1)) | |
buf368 = buf331; del buf331 # reuse | |
buf369 = buf368; del buf368 # reuse | |
buf372 = empty_strided((336, ), (1, ), device='cuda', dtype=torch.float32) | |
buf370 = empty_strided((1, 336, 1, 1), (336, 1, 336, 336), device='cuda', dtype=torch.float32) | |
buf371 = empty_strided((336, ), (1, ), device='cuda', dtype=torch.float32) | |
buf373 = empty_strided((336, ), (1, ), device='cuda', dtype=torch.float32) | |
triton_red_fused__native_batch_norm_legit_functional_75.run(buf369, buf367, primals_370, primals_371, buf372, buf370, buf371, buf373, 336, 100352, grid=grid(336), stream=stream0) | |
del primals_370 | |
del primals_371 | |
buf374 = buf347; del buf347 # reuse | |
buf1144 = empty_strided((128, 336, 28, 28), (263424, 784, 28, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__native_batch_norm_legit_functional_add_clone_fill_mul_sigmoid_sub_76.run(buf367, buf369, buf370, primals_43, primals_44, buf374, buf1144, 33718272, grid=grid(33718272), stream=stream0) | |
del primals_44 | |
buf375 = empty_strided((112, 1, 3, 3), (9, 9, 3, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__to_copy_95.run(primals_173, buf375, 1008, grid=grid(1008), stream=stream0) | |
del primals_173 | |
buf376 = empty_strided((128, 112, 28, 28), (87808, 784, 28, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused_split_with_sizes_96.run(buf374, buf376, 11239424, grid=grid(11239424), stream=stream0) | |
buf377 = extern_kernels.convolution(buf376, buf375, stride=(2, 2), padding=(1, 1), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=112, bias=None) | |
assert_size_stride(buf377, (128, 112, 14, 14), (21952, 196, 14, 1)) | |
buf378 = empty_strided((112, 1, 5, 5), (25, 25, 5, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__to_copy_97.run(primals_174, buf378, 2800, grid=grid(2800), stream=stream0) | |
del primals_174 | |
buf379 = empty_strided((128, 112, 28, 28), (87808, 784, 28, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused_split_with_sizes_98.run(buf374, buf379, 11239424, grid=grid(11239424), stream=stream0) | |
buf380 = extern_kernels.convolution(buf379, buf378, stride=(2, 2), padding=(2, 2), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=112, bias=None) | |
assert_size_stride(buf380, (128, 112, 14, 14), (21952, 196, 14, 1)) | |
buf381 = empty_strided((112, 1, 7, 7), (49, 49, 7, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__to_copy_99.run(primals_175, buf381, 5488, grid=grid(5488), stream=stream0) | |
del primals_175 | |
buf382 = empty_strided((128, 112, 28, 28), (87808, 784, 28, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused_split_with_sizes_100.run(buf374, buf382, 11239424, grid=grid(11239424), stream=stream0) | |
del buf374 | |
buf383 = extern_kernels.convolution(buf382, buf381, stride=(2, 2), padding=(3, 3), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=112, bias=None) | |
assert_size_stride(buf383, (128, 112, 14, 14), (21952, 196, 14, 1)) | |
buf387 = empty_strided((128, 336, 14, 14), (65856, 196, 14, 1), device='cuda', dtype=torch.float16) | |
buf384 = as_strided(buf387, (128, 112, 14, 14), (65856, 196, 14, 1)) # alias | |
triton_poi_fused_cat_101.run(buf377, buf384, 2809856, grid=grid(2809856), stream=stream0) | |
del buf377 | |
buf385 = as_strided(buf387, (128, 112, 14, 14), (65856, 196, 14, 1), 21952) # alias | |
triton_poi_fused_cat_101.run(buf380, buf385, 2809856, grid=grid(2809856), stream=stream0) | |
del buf380 | |
buf386 = as_strided(buf387, (128, 112, 14, 14), (65856, 196, 14, 1), 43904) # alias | |
triton_poi_fused_cat_101.run(buf383, buf386, 2809856, grid=grid(2809856), stream=stream0) | |
del buf383 | |
buf388 = buf370; del buf370 # reuse | |
buf389 = buf388; del buf388 # reuse | |
buf392 = empty_strided((336, ), (1, ), device='cuda', dtype=torch.float32) | |
buf390 = empty_strided((1, 336, 1, 1), (336, 1, 336, 336), device='cuda', dtype=torch.float32) | |
buf391 = empty_strided((336, ), (1, ), device='cuda', dtype=torch.float32) | |
buf393 = empty_strided((336, ), (1, ), device='cuda', dtype=torch.float32) | |
triton_red_fused__native_batch_norm_legit_functional_102.run(buf389, buf387, primals_373, primals_374, buf392, buf390, buf391, buf393, 336, 25088, grid=grid(336), stream=stream0) | |
del primals_373 | |
del primals_374 | |
buf394 = empty_strided((128, 336, 14, 14), (65856, 196, 14, 1), device='cuda', dtype=torch.float16) | |
buf396 = empty_strided((128, 336, 1, 1), (336, 1, 1, 1), device='cuda', dtype=torch.float16) | |
triton_per_fused__native_batch_norm_legit_functional_mean_silu_103.run(buf387, buf389, buf390, primals_45, primals_46, buf394, buf396, 43008, 196, grid=grid(43008), stream=stream0) | |
del buf390 | |
del primals_46 | |
buf397 = empty_strided((14, 336, 1, 1), (336, 1, 1, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__to_copy_104.run(primals_176, buf397, 4704, grid=grid(4704), stream=stream0) | |
del primals_176 | |
buf398 = empty_strided((14, ), (1, ), device='cuda', dtype=torch.float16) | |
triton_poi_fused__to_copy_convolution_105.run(primals_177, buf398, 14, grid=grid(14), stream=stream0) | |
del primals_177 | |
buf399 = extern_kernels.convolution(buf396, buf397, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None) | |
assert_size_stride(buf399, (128, 14, 1, 1), (14, 1, 1, 1)) | |
buf400 = buf399; del buf399 # reuse | |
buf401 = empty_strided((128, 14, 1, 1), (14, 1, 1, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__to_copy_convolution_silu_106.run(buf400, buf398, buf401, 1792, grid=grid(1792), stream=stream0) | |
del buf398 | |
buf402 = empty_strided((336, 14, 1, 1), (14, 1, 1, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__to_copy_107.run(primals_178, buf402, 4704, grid=grid(4704), stream=stream0) | |
del primals_178 | |
buf403 = buf344; del buf344 # reuse | |
triton_poi_fused__to_copy_convolution_86.run(primals_179, buf403, 336, grid=grid(336), stream=stream0) | |
del primals_179 | |
buf404 = extern_kernels.convolution(buf401, buf402, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None) | |
assert_size_stride(buf404, (128, 336, 1, 1), (336, 1, 1, 1)) | |
buf405 = buf404; del buf404 # reuse | |
triton_poi_fused__to_copy_convolution_87.run(buf405, buf403, 43008, grid=grid(43008), stream=stream0) | |
del buf403 | |
buf406 = empty_strided((128, 336, 14, 14), (65856, 196, 14, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused_mul_sigmoid_silu_108.run(buf394, buf405, buf406, 8429568, grid=grid(8429568), stream=stream0) | |
buf407 = empty_strided((104, 336, 1, 1), (336, 1, 1, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__to_copy_109.run(primals_180, buf407, 34944, grid=grid(34944), stream=stream0) | |
del primals_180 | |
buf408 = extern_kernels.convolution(buf406, buf407, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None) | |
assert_size_stride(buf408, (128, 104, 14, 14), (20384, 196, 14, 1)) | |
buf409 = empty_strided((1, 104, 1, 1, 4), (416, 1, 416, 416, 104), device='cuda', dtype=torch.float32) | |
triton_red_fused__native_batch_norm_legit_functional_110.run(buf408, buf409, 416, 6272, grid=grid(416), stream=stream0) | |
buf410 = empty_strided((1, 104, 1, 1), (104, 1, 104, 104), device='cuda', dtype=torch.float32) | |
buf411 = buf410; del buf410 # reuse | |
buf415 = empty_strided((104, ), (1, ), device='cuda', dtype=torch.float32) | |
triton_per_fused__native_batch_norm_legit_functional_111.run(buf411, buf409, primals_376, buf415, 104, 4, grid=grid(104), stream=stream0) | |
del primals_376 | |
buf412 = buf409; del buf409 # reuse | |
triton_red_fused__native_batch_norm_legit_functional_112.run(buf408, buf411, buf412, 416, 6272, grid=grid(416), stream=stream0) | |
buf413 = empty_strided((1, 104, 1, 1), (104, 1, 104, 104), device='cuda', dtype=torch.float32) | |
buf414 = empty_strided((104, ), (1, ), device='cuda', dtype=torch.float32) | |
buf416 = empty_strided((104, ), (1, ), device='cuda', dtype=torch.float32) | |
triton_per_fused__native_batch_norm_legit_functional_113.run(buf412, primals_377, buf413, buf414, buf416, 104, 4, grid=grid(104), stream=stream0) | |
del primals_377 | |
buf417 = empty_strided((128, 104, 14, 14), (20384, 196, 14, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__native_batch_norm_legit_functional_114.run(buf408, buf411, buf413, primals_47, primals_48, buf417, 2609152, grid=grid(2609152), stream=stream0) | |
del primals_48 | |
buf418 = empty_strided((312, 52, 1, 1), (52, 1, 1, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__to_copy_115.run(primals_181, buf418, 16224, grid=grid(16224), stream=stream0) | |
del primals_181 | |
buf419 = extern_kernels.convolution(as_strided(buf417, (128, 52, 14, 14), (20384, 196, 14, 1)), buf418, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None) | |
assert_size_stride(buf419, (128, 312, 14, 14), (61152, 196, 14, 1)) | |
buf420 = empty_strided((312, 52, 1, 1), (52, 1, 1, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__to_copy_115.run(primals_182, buf420, 16224, grid=grid(16224), stream=stream0) | |
del primals_182 | |
buf421 = extern_kernels.convolution(as_strided(buf417, (128, 52, 14, 14), (20384, 196, 14, 1), 10192), buf420, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None) | |
assert_size_stride(buf421, (128, 312, 14, 14), (61152, 196, 14, 1)) | |
buf424 = empty_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda', dtype=torch.float16) | |
buf422 = as_strided(buf424, (128, 312, 14, 14), (122304, 196, 14, 1)) # alias | |
triton_poi_fused_cat_116.run(buf419, buf422, 7827456, grid=grid(7827456), stream=stream0) | |
buf423 = as_strided(buf424, (128, 312, 14, 14), (122304, 196, 14, 1), 61152) # alias | |
triton_poi_fused_cat_116.run(buf421, buf423, 7827456, grid=grid(7827456), stream=stream0) | |
buf425 = empty_strided((1, 624, 1, 1), (624, 1, 624, 624), device='cuda', dtype=torch.float32) | |
buf426 = buf425; del buf425 # reuse | |
buf429 = empty_strided((624, ), (1, ), device='cuda', dtype=torch.float32) | |
buf427 = empty_strided((1, 624, 1, 1), (624, 1, 624, 624), device='cuda', dtype=torch.float32) | |
buf428 = empty_strided((624, ), (1, ), device='cuda', dtype=torch.float32) | |
buf430 = empty_strided((624, ), (1, ), device='cuda', dtype=torch.float32) | |
triton_red_fused__native_batch_norm_legit_functional_117.run(buf426, buf424, primals_379, primals_380, buf429, buf427, buf428, buf430, 624, 25088, grid=grid(624), stream=stream0) | |
del primals_379 | |
del primals_380 | |
buf431 = empty_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda', dtype=torch.float16) | |
buf1142 = empty_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__native_batch_norm_legit_functional_add_clone_fill_mul_sigmoid_sub_118.run(buf424, buf426, buf427, primals_49, primals_50, buf431, buf1142, 15654912, grid=grid(15654912), stream=stream0) | |
del primals_50 | |
buf432 = empty_strided((156, 1, 3, 3), (9, 9, 3, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__to_copy_119.run(primals_183, buf432, 1404, grid=grid(1404), stream=stream0) | |
del primals_183 | |
buf433 = empty_strided((128, 156, 14, 14), (30576, 196, 14, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused_split_with_sizes_120.run(buf431, buf433, 3913728, grid=grid(3913728), stream=stream0) | |
buf434 = extern_kernels.convolution(buf433, buf432, stride=(1, 1), padding=(1, 1), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=156, bias=None) | |
assert_size_stride(buf434, (128, 156, 14, 14), (30576, 196, 14, 1)) | |
buf435 = empty_strided((156, 1, 5, 5), (25, 25, 5, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__to_copy_121.run(primals_184, buf435, 3900, grid=grid(3900), stream=stream0) | |
del primals_184 | |
buf436 = empty_strided((128, 156, 14, 14), (30576, 196, 14, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused_split_with_sizes_122.run(buf431, buf436, 3913728, grid=grid(3913728), stream=stream0) | |
buf437 = extern_kernels.convolution(buf436, buf435, stride=(1, 1), padding=(2, 2), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=156, bias=None) | |
assert_size_stride(buf437, (128, 156, 14, 14), (30576, 196, 14, 1)) | |
buf438 = empty_strided((156, 1, 7, 7), (49, 49, 7, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__to_copy_123.run(primals_185, buf438, 7644, grid=grid(7644), stream=stream0) | |
del primals_185 | |
buf439 = empty_strided((128, 156, 14, 14), (30576, 196, 14, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused_split_with_sizes_124.run(buf431, buf439, 3913728, grid=grid(3913728), stream=stream0) | |
buf440 = extern_kernels.convolution(buf439, buf438, stride=(1, 1), padding=(3, 3), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=156, bias=None) | |
assert_size_stride(buf440, (128, 156, 14, 14), (30576, 196, 14, 1)) | |
buf441 = empty_strided((156, 1, 9, 9), (81, 81, 9, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__to_copy_125.run(primals_186, buf441, 12636, grid=grid(12636), stream=stream0) | |
del primals_186 | |
buf442 = empty_strided((128, 156, 14, 14), (30576, 196, 14, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused_split_with_sizes_126.run(buf431, buf442, 3913728, grid=grid(3913728), stream=stream0) | |
buf443 = extern_kernels.convolution(buf442, buf441, stride=(1, 1), padding=(4, 4), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=156, bias=None) | |
assert_size_stride(buf443, (128, 156, 14, 14), (30576, 196, 14, 1)) | |
buf448 = buf431; del buf431 # reuse | |
buf444 = as_strided(buf448, (128, 156, 14, 14), (122304, 196, 14, 1)) # alias | |
triton_poi_fused_cat_127.run(buf434, buf444, 3913728, grid=grid(3913728), stream=stream0) | |
buf445 = as_strided(buf448, (128, 156, 14, 14), (122304, 196, 14, 1), 30576) # alias | |
triton_poi_fused_cat_127.run(buf437, buf445, 3913728, grid=grid(3913728), stream=stream0) | |
buf446 = as_strided(buf448, (128, 156, 14, 14), (122304, 196, 14, 1), 61152) # alias | |
triton_poi_fused_cat_127.run(buf440, buf446, 3913728, grid=grid(3913728), stream=stream0) | |
buf447 = as_strided(buf448, (128, 156, 14, 14), (122304, 196, 14, 1), 91728) # alias | |
triton_poi_fused_cat_127.run(buf443, buf447, 3913728, grid=grid(3913728), stream=stream0) | |
buf449 = buf427; del buf427 # reuse | |
buf450 = buf449; del buf449 # reuse | |
buf453 = empty_strided((624, ), (1, ), device='cuda', dtype=torch.float32) | |
buf451 = empty_strided((1, 624, 1, 1), (624, 1, 624, 624), device='cuda', dtype=torch.float32) | |
buf452 = empty_strided((624, ), (1, ), device='cuda', dtype=torch.float32) | |
buf454 = empty_strided((624, ), (1, ), device='cuda', dtype=torch.float32) | |
triton_red_fused__native_batch_norm_legit_functional_117.run(buf450, buf448, primals_382, primals_383, buf453, buf451, buf452, buf454, 624, 25088, grid=grid(624), stream=stream0) | |
del primals_382 | |
del primals_383 | |
buf455 = empty_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda', dtype=torch.float16) | |
buf457 = empty_strided((128, 624, 1, 1), (624, 1, 1, 1), device='cuda', dtype=torch.float16) | |
triton_per_fused__native_batch_norm_legit_functional_mean_silu_128.run(buf448, buf450, buf451, primals_51, primals_52, buf455, buf457, 79872, 196, grid=grid(79872), stream=stream0) | |
del primals_52 | |
buf458 = empty_strided((26, 624, 1, 1), (624, 1, 1, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__to_copy_129.run(primals_187, buf458, 16224, grid=grid(16224), stream=stream0) | |
del primals_187 | |
buf459 = empty_strided((26, ), (1, ), device='cuda', dtype=torch.float16) | |
triton_poi_fused__to_copy_convolution_130.run(primals_188, buf459, 26, grid=grid(26), stream=stream0) | |
del primals_188 | |
buf460 = extern_kernels.convolution(buf457, buf458, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None) | |
assert_size_stride(buf460, (128, 26, 1, 1), (26, 1, 1, 1)) | |
buf461 = buf460; del buf460 # reuse | |
buf462 = empty_strided((128, 26, 1, 1), (26, 1, 1, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__to_copy_convolution_silu_131.run(buf461, buf459, buf462, 3328, grid=grid(3328), stream=stream0) | |
buf463 = empty_strided((624, 26, 1, 1), (26, 1, 1, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__to_copy_132.run(primals_189, buf463, 16224, grid=grid(16224), stream=stream0) | |
del primals_189 | |
buf464 = empty_strided((624, ), (1, ), device='cuda', dtype=torch.float16) | |
triton_poi_fused__to_copy_convolution_133.run(primals_190, buf464, 624, grid=grid(624), stream=stream0) | |
del primals_190 | |
buf465 = extern_kernels.convolution(buf462, buf463, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None) | |
assert_size_stride(buf465, (128, 624, 1, 1), (624, 1, 1, 1)) | |
buf466 = buf465; del buf465 # reuse | |
triton_poi_fused__to_copy_convolution_134.run(buf466, buf464, 79872, grid=grid(79872), stream=stream0) | |
buf467 = empty_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused_mul_sigmoid_silu_135.run(buf455, buf466, buf467, 15654912, grid=grid(15654912), stream=stream0) | |
buf468 = buf421; del buf421 # reuse | |
triton_poi_fused_split_with_sizes_136.run(buf467, buf468, 7827456, grid=grid(7827456), stream=stream0) | |
buf469 = buf419; del buf419 # reuse | |
triton_poi_fused_split_with_sizes_137.run(buf467, buf469, 7827456, grid=grid(7827456), stream=stream0) | |
buf470 = empty_strided((52, 312, 1, 1), (312, 1, 1, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__to_copy_138.run(primals_191, buf470, 16224, grid=grid(16224), stream=stream0) | |
del primals_191 | |
buf471 = extern_kernels.convolution(buf468, buf470, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None) | |
assert_size_stride(buf471, (128, 52, 14, 14), (10192, 196, 14, 1)) | |
buf472 = empty_strided((52, 312, 1, 1), (312, 1, 1, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__to_copy_138.run(primals_192, buf472, 16224, grid=grid(16224), stream=stream0) | |
del primals_192 | |
buf473 = extern_kernels.convolution(buf469, buf472, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None) | |
assert_size_stride(buf473, (128, 52, 14, 14), (10192, 196, 14, 1)) | |
buf476 = empty_strided((128, 104, 14, 14), (20384, 196, 14, 1), device='cuda', dtype=torch.float16) | |
buf474 = as_strided(buf476, (128, 52, 14, 14), (20384, 196, 14, 1)) # alias | |
triton_poi_fused_cat_139.run(buf471, buf474, 1304576, grid=grid(1304576), stream=stream0) | |
del buf471 | |
buf475 = as_strided(buf476, (128, 52, 14, 14), (20384, 196, 14, 1), 10192) # alias | |
triton_poi_fused_cat_139.run(buf473, buf475, 1304576, grid=grid(1304576), stream=stream0) | |
del buf473 | |
buf477 = buf412; del buf412 # reuse | |
triton_red_fused__native_batch_norm_legit_functional_110.run(buf476, buf477, 416, 6272, grid=grid(416), stream=stream0) | |
buf478 = buf413; del buf413 # reuse | |
buf479 = buf478; del buf478 # reuse | |
buf483 = empty_strided((104, ), (1, ), device='cuda', dtype=torch.float32) | |
triton_per_fused__native_batch_norm_legit_functional_111.run(buf479, buf477, primals_385, buf483, 104, 4, grid=grid(104), stream=stream0) | |
del primals_385 | |
buf480 = buf477; del buf477 # reuse | |
triton_red_fused__native_batch_norm_legit_functional_112.run(buf476, buf479, buf480, 416, 6272, grid=grid(416), stream=stream0) | |
buf481 = empty_strided((1, 104, 1, 1), (104, 1, 104, 104), device='cuda', dtype=torch.float32) | |
buf482 = empty_strided((104, ), (1, ), device='cuda', dtype=torch.float32) | |
buf484 = empty_strided((104, ), (1, ), device='cuda', dtype=torch.float32) | |
triton_per_fused__native_batch_norm_legit_functional_113.run(buf480, primals_386, buf481, buf482, buf484, 104, 4, grid=grid(104), stream=stream0) | |
del primals_386 | |
buf485 = empty_strided((128, 104, 14, 14), (20384, 196, 14, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__native_batch_norm_legit_functional_add_140.run(buf476, buf479, buf481, primals_53, primals_54, buf417, buf485, 2609152, grid=grid(2609152), stream=stream0) | |
del primals_54 | |
buf486 = empty_strided((312, 52, 1, 1), (52, 1, 1, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__to_copy_115.run(primals_193, buf486, 16224, grid=grid(16224), stream=stream0) | |
del primals_193 | |
buf487 = extern_kernels.convolution(as_strided(buf485, (128, 52, 14, 14), (20384, 196, 14, 1)), buf486, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None) | |
assert_size_stride(buf487, (128, 312, 14, 14), (61152, 196, 14, 1)) | |
buf488 = empty_strided((312, 52, 1, 1), (52, 1, 1, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__to_copy_115.run(primals_194, buf488, 16224, grid=grid(16224), stream=stream0) | |
del primals_194 | |
buf489 = extern_kernels.convolution(as_strided(buf485, (128, 52, 14, 14), (20384, 196, 14, 1), 10192), buf488, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None) | |
assert_size_stride(buf489, (128, 312, 14, 14), (61152, 196, 14, 1)) | |
buf492 = buf467; del buf467 # reuse | |
buf490 = as_strided(buf492, (128, 312, 14, 14), (122304, 196, 14, 1)) # alias | |
triton_poi_fused_cat_116.run(buf487, buf490, 7827456, grid=grid(7827456), stream=stream0) | |
buf491 = as_strided(buf492, (128, 312, 14, 14), (122304, 196, 14, 1), 61152) # alias | |
triton_poi_fused_cat_116.run(buf489, buf491, 7827456, grid=grid(7827456), stream=stream0) | |
buf493 = buf451; del buf451 # reuse | |
buf494 = buf493; del buf493 # reuse | |
buf497 = empty_strided((624, ), (1, ), device='cuda', dtype=torch.float32) | |
buf495 = empty_strided((1, 624, 1, 1), (624, 1, 624, 624), device='cuda', dtype=torch.float32) | |
buf496 = empty_strided((624, ), (1, ), device='cuda', dtype=torch.float32) | |
buf498 = empty_strided((624, ), (1, ), device='cuda', dtype=torch.float32) | |
triton_red_fused__native_batch_norm_legit_functional_117.run(buf494, buf492, primals_388, primals_389, buf497, buf495, buf496, buf498, 624, 25088, grid=grid(624), stream=stream0) | |
del primals_388 | |
del primals_389 | |
buf499 = empty_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda', dtype=torch.float16) | |
buf1140 = empty_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__native_batch_norm_legit_functional_add_clone_fill_mul_sigmoid_sub_118.run(buf492, buf494, buf495, primals_55, primals_56, buf499, buf1140, 15654912, grid=grid(15654912), stream=stream0) | |
del primals_56 | |
buf500 = empty_strided((156, 1, 3, 3), (9, 9, 3, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__to_copy_119.run(primals_195, buf500, 1404, grid=grid(1404), stream=stream0) | |
del primals_195 | |
buf501 = buf443; del buf443 # reuse | |
triton_poi_fused_split_with_sizes_120.run(buf499, buf501, 3913728, grid=grid(3913728), stream=stream0) | |
buf502 = extern_kernels.convolution(buf501, buf500, stride=(1, 1), padding=(1, 1), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=156, bias=None) | |
assert_size_stride(buf502, (128, 156, 14, 14), (30576, 196, 14, 1)) | |
buf503 = empty_strided((156, 1, 5, 5), (25, 25, 5, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__to_copy_121.run(primals_196, buf503, 3900, grid=grid(3900), stream=stream0) | |
del primals_196 | |
buf504 = buf440; del buf440 # reuse | |
triton_poi_fused_split_with_sizes_122.run(buf499, buf504, 3913728, grid=grid(3913728), stream=stream0) | |
buf505 = extern_kernels.convolution(buf504, buf503, stride=(1, 1), padding=(2, 2), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=156, bias=None) | |
assert_size_stride(buf505, (128, 156, 14, 14), (30576, 196, 14, 1)) | |
buf506 = empty_strided((156, 1, 7, 7), (49, 49, 7, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__to_copy_123.run(primals_197, buf506, 7644, grid=grid(7644), stream=stream0) | |
del primals_197 | |
buf507 = buf437; del buf437 # reuse | |
triton_poi_fused_split_with_sizes_124.run(buf499, buf507, 3913728, grid=grid(3913728), stream=stream0) | |
buf508 = extern_kernels.convolution(buf507, buf506, stride=(1, 1), padding=(3, 3), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=156, bias=None) | |
assert_size_stride(buf508, (128, 156, 14, 14), (30576, 196, 14, 1)) | |
buf509 = empty_strided((156, 1, 9, 9), (81, 81, 9, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__to_copy_125.run(primals_198, buf509, 12636, grid=grid(12636), stream=stream0) | |
del primals_198 | |
buf510 = buf434; del buf434 # reuse | |
triton_poi_fused_split_with_sizes_126.run(buf499, buf510, 3913728, grid=grid(3913728), stream=stream0) | |
buf511 = extern_kernels.convolution(buf510, buf509, stride=(1, 1), padding=(4, 4), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=156, bias=None) | |
assert_size_stride(buf511, (128, 156, 14, 14), (30576, 196, 14, 1)) | |
buf516 = buf499; del buf499 # reuse | |
buf512 = as_strided(buf516, (128, 156, 14, 14), (122304, 196, 14, 1)) # alias | |
triton_poi_fused_cat_127.run(buf502, buf512, 3913728, grid=grid(3913728), stream=stream0) | |
buf513 = as_strided(buf516, (128, 156, 14, 14), (122304, 196, 14, 1), 30576) # alias | |
triton_poi_fused_cat_127.run(buf505, buf513, 3913728, grid=grid(3913728), stream=stream0) | |
buf514 = as_strided(buf516, (128, 156, 14, 14), (122304, 196, 14, 1), 61152) # alias | |
triton_poi_fused_cat_127.run(buf508, buf514, 3913728, grid=grid(3913728), stream=stream0) | |
buf515 = as_strided(buf516, (128, 156, 14, 14), (122304, 196, 14, 1), 91728) # alias | |
triton_poi_fused_cat_127.run(buf511, buf515, 3913728, grid=grid(3913728), stream=stream0) | |
buf517 = buf495; del buf495 # reuse | |
buf518 = buf517; del buf517 # reuse | |
buf521 = empty_strided((624, ), (1, ), device='cuda', dtype=torch.float32) | |
buf519 = empty_strided((1, 624, 1, 1), (624, 1, 624, 624), device='cuda', dtype=torch.float32) | |
buf520 = empty_strided((624, ), (1, ), device='cuda', dtype=torch.float32) | |
buf522 = empty_strided((624, ), (1, ), device='cuda', dtype=torch.float32) | |
triton_red_fused__native_batch_norm_legit_functional_117.run(buf518, buf516, primals_391, primals_392, buf521, buf519, buf520, buf522, 624, 25088, grid=grid(624), stream=stream0) | |
del primals_391 | |
del primals_392 | |
buf523 = empty_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda', dtype=torch.float16) | |
buf525 = empty_strided((128, 624, 1, 1), (624, 1, 1, 1), device='cuda', dtype=torch.float16) | |
triton_per_fused__native_batch_norm_legit_functional_mean_silu_128.run(buf516, buf518, buf519, primals_57, primals_58, buf523, buf525, 79872, 196, grid=grid(79872), stream=stream0) | |
del primals_58 | |
buf526 = empty_strided((26, 624, 1, 1), (624, 1, 1, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__to_copy_129.run(primals_199, buf526, 16224, grid=grid(16224), stream=stream0) | |
del primals_199 | |
buf527 = buf459; del buf459 # reuse | |
triton_poi_fused__to_copy_convolution_130.run(primals_200, buf527, 26, grid=grid(26), stream=stream0) | |
del primals_200 | |
buf528 = extern_kernels.convolution(buf525, buf526, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None) | |
assert_size_stride(buf528, (128, 26, 1, 1), (26, 1, 1, 1)) | |
buf529 = buf528; del buf528 # reuse | |
buf530 = empty_strided((128, 26, 1, 1), (26, 1, 1, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__to_copy_convolution_silu_131.run(buf529, buf527, buf530, 3328, grid=grid(3328), stream=stream0) | |
buf531 = empty_strided((624, 26, 1, 1), (26, 1, 1, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__to_copy_132.run(primals_201, buf531, 16224, grid=grid(16224), stream=stream0) | |
del primals_201 | |
buf532 = buf464; del buf464 # reuse | |
triton_poi_fused__to_copy_convolution_133.run(primals_202, buf532, 624, grid=grid(624), stream=stream0) | |
del primals_202 | |
buf533 = extern_kernels.convolution(buf530, buf531, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None) | |
assert_size_stride(buf533, (128, 624, 1, 1), (624, 1, 1, 1)) | |
buf534 = buf533; del buf533 # reuse | |
triton_poi_fused__to_copy_convolution_134.run(buf534, buf532, 79872, grid=grid(79872), stream=stream0) | |
buf535 = empty_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused_mul_sigmoid_silu_135.run(buf523, buf534, buf535, 15654912, grid=grid(15654912), stream=stream0) | |
buf536 = buf489; del buf489 # reuse | |
triton_poi_fused_split_with_sizes_136.run(buf535, buf536, 7827456, grid=grid(7827456), stream=stream0) | |
buf537 = buf487; del buf487 # reuse | |
triton_poi_fused_split_with_sizes_137.run(buf535, buf537, 7827456, grid=grid(7827456), stream=stream0) | |
buf538 = empty_strided((52, 312, 1, 1), (312, 1, 1, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__to_copy_138.run(primals_203, buf538, 16224, grid=grid(16224), stream=stream0) | |
del primals_203 | |
buf539 = extern_kernels.convolution(buf536, buf538, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None) | |
assert_size_stride(buf539, (128, 52, 14, 14), (10192, 196, 14, 1)) | |
buf540 = empty_strided((52, 312, 1, 1), (312, 1, 1, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__to_copy_138.run(primals_204, buf540, 16224, grid=grid(16224), stream=stream0) | |
del primals_204 | |
buf541 = extern_kernels.convolution(buf537, buf540, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None) | |
assert_size_stride(buf541, (128, 52, 14, 14), (10192, 196, 14, 1)) | |
buf544 = empty_strided((128, 104, 14, 14), (20384, 196, 14, 1), device='cuda', dtype=torch.float16) | |
buf542 = as_strided(buf544, (128, 52, 14, 14), (20384, 196, 14, 1)) # alias | |
triton_poi_fused_cat_139.run(buf539, buf542, 1304576, grid=grid(1304576), stream=stream0) | |
del buf539 | |
buf543 = as_strided(buf544, (128, 52, 14, 14), (20384, 196, 14, 1), 10192) # alias | |
triton_poi_fused_cat_139.run(buf541, buf543, 1304576, grid=grid(1304576), stream=stream0) | |
del buf541 | |
buf545 = buf480; del buf480 # reuse | |
triton_red_fused__native_batch_norm_legit_functional_110.run(buf544, buf545, 416, 6272, grid=grid(416), stream=stream0) | |
buf546 = buf481; del buf481 # reuse | |
buf547 = buf546; del buf546 # reuse | |
buf551 = empty_strided((104, ), (1, ), device='cuda', dtype=torch.float32) | |
triton_per_fused__native_batch_norm_legit_functional_111.run(buf547, buf545, primals_394, buf551, 104, 4, grid=grid(104), stream=stream0) | |
del primals_394 | |
buf548 = buf545; del buf545 # reuse | |
triton_red_fused__native_batch_norm_legit_functional_112.run(buf544, buf547, buf548, 416, 6272, grid=grid(416), stream=stream0) | |
buf549 = empty_strided((1, 104, 1, 1), (104, 1, 104, 104), device='cuda', dtype=torch.float32) | |
buf550 = empty_strided((104, ), (1, ), device='cuda', dtype=torch.float32) | |
buf552 = empty_strided((104, ), (1, ), device='cuda', dtype=torch.float32) | |
triton_per_fused__native_batch_norm_legit_functional_113.run(buf548, primals_395, buf549, buf550, buf552, 104, 4, grid=grid(104), stream=stream0) | |
del primals_395 | |
buf553 = empty_strided((128, 104, 14, 14), (20384, 196, 14, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__native_batch_norm_legit_functional_add_140.run(buf544, buf547, buf549, primals_59, primals_60, buf485, buf553, 2609152, grid=grid(2609152), stream=stream0) | |
del primals_60 | |
buf554 = empty_strided((312, 52, 1, 1), (52, 1, 1, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__to_copy_115.run(primals_205, buf554, 16224, grid=grid(16224), stream=stream0) | |
del primals_205 | |
buf555 = extern_kernels.convolution(as_strided(buf553, (128, 52, 14, 14), (20384, 196, 14, 1)), buf554, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None) | |
assert_size_stride(buf555, (128, 312, 14, 14), (61152, 196, 14, 1)) | |
buf556 = empty_strided((312, 52, 1, 1), (52, 1, 1, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__to_copy_115.run(primals_206, buf556, 16224, grid=grid(16224), stream=stream0) | |
del primals_206 | |
buf557 = extern_kernels.convolution(as_strided(buf553, (128, 52, 14, 14), (20384, 196, 14, 1), 10192), buf556, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None) | |
assert_size_stride(buf557, (128, 312, 14, 14), (61152, 196, 14, 1)) | |
buf560 = buf535; del buf535 # reuse | |
buf558 = as_strided(buf560, (128, 312, 14, 14), (122304, 196, 14, 1)) # alias | |
triton_poi_fused_cat_116.run(buf555, buf558, 7827456, grid=grid(7827456), stream=stream0) | |
buf559 = as_strided(buf560, (128, 312, 14, 14), (122304, 196, 14, 1), 61152) # alias | |
triton_poi_fused_cat_116.run(buf557, buf559, 7827456, grid=grid(7827456), stream=stream0) | |
buf561 = buf519; del buf519 # reuse | |
buf562 = buf561; del buf561 # reuse | |
buf565 = empty_strided((624, ), (1, ), device='cuda', dtype=torch.float32) | |
buf563 = empty_strided((1, 624, 1, 1), (624, 1, 624, 624), device='cuda', dtype=torch.float32) | |
buf564 = empty_strided((624, ), (1, ), device='cuda', dtype=torch.float32) | |
buf566 = empty_strided((624, ), (1, ), device='cuda', dtype=torch.float32) | |
triton_red_fused__native_batch_norm_legit_functional_117.run(buf562, buf560, primals_397, primals_398, buf565, buf563, buf564, buf566, 624, 25088, grid=grid(624), stream=stream0) | |
del primals_397 | |
del primals_398 | |
buf567 = empty_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda', dtype=torch.float16) | |
buf1138 = empty_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__native_batch_norm_legit_functional_add_clone_fill_mul_sigmoid_sub_118.run(buf560, buf562, buf563, primals_61, primals_62, buf567, buf1138, 15654912, grid=grid(15654912), stream=stream0) | |
del primals_62 | |
buf568 = empty_strided((156, 1, 3, 3), (9, 9, 3, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__to_copy_119.run(primals_207, buf568, 1404, grid=grid(1404), stream=stream0) | |
del primals_207 | |
buf569 = buf511; del buf511 # reuse | |
triton_poi_fused_split_with_sizes_120.run(buf567, buf569, 3913728, grid=grid(3913728), stream=stream0) | |
buf570 = extern_kernels.convolution(buf569, buf568, stride=(1, 1), padding=(1, 1), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=156, bias=None) | |
assert_size_stride(buf570, (128, 156, 14, 14), (30576, 196, 14, 1)) | |
buf571 = empty_strided((156, 1, 5, 5), (25, 25, 5, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__to_copy_121.run(primals_208, buf571, 3900, grid=grid(3900), stream=stream0) | |
del primals_208 | |
buf572 = buf508; del buf508 # reuse | |
triton_poi_fused_split_with_sizes_122.run(buf567, buf572, 3913728, grid=grid(3913728), stream=stream0) | |
buf573 = extern_kernels.convolution(buf572, buf571, stride=(1, 1), padding=(2, 2), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=156, bias=None) | |
assert_size_stride(buf573, (128, 156, 14, 14), (30576, 196, 14, 1)) | |
buf574 = empty_strided((156, 1, 7, 7), (49, 49, 7, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__to_copy_123.run(primals_209, buf574, 7644, grid=grid(7644), stream=stream0) | |
del primals_209 | |
buf575 = buf505; del buf505 # reuse | |
triton_poi_fused_split_with_sizes_124.run(buf567, buf575, 3913728, grid=grid(3913728), stream=stream0) | |
buf576 = extern_kernels.convolution(buf575, buf574, stride=(1, 1), padding=(3, 3), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=156, bias=None) | |
assert_size_stride(buf576, (128, 156, 14, 14), (30576, 196, 14, 1)) | |
buf577 = empty_strided((156, 1, 9, 9), (81, 81, 9, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__to_copy_125.run(primals_210, buf577, 12636, grid=grid(12636), stream=stream0) | |
del primals_210 | |
buf578 = buf502; del buf502 # reuse | |
triton_poi_fused_split_with_sizes_126.run(buf567, buf578, 3913728, grid=grid(3913728), stream=stream0) | |
buf579 = extern_kernels.convolution(buf578, buf577, stride=(1, 1), padding=(4, 4), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=156, bias=None) | |
assert_size_stride(buf579, (128, 156, 14, 14), (30576, 196, 14, 1)) | |
buf584 = buf567; del buf567 # reuse | |
buf580 = as_strided(buf584, (128, 156, 14, 14), (122304, 196, 14, 1)) # alias | |
triton_poi_fused_cat_127.run(buf570, buf580, 3913728, grid=grid(3913728), stream=stream0) | |
del buf570 | |
buf581 = as_strided(buf584, (128, 156, 14, 14), (122304, 196, 14, 1), 30576) # alias | |
triton_poi_fused_cat_127.run(buf573, buf581, 3913728, grid=grid(3913728), stream=stream0) | |
del buf573 | |
buf582 = as_strided(buf584, (128, 156, 14, 14), (122304, 196, 14, 1), 61152) # alias | |
triton_poi_fused_cat_127.run(buf576, buf582, 3913728, grid=grid(3913728), stream=stream0) | |
del buf576 | |
buf583 = as_strided(buf584, (128, 156, 14, 14), (122304, 196, 14, 1), 91728) # alias | |
triton_poi_fused_cat_127.run(buf579, buf583, 3913728, grid=grid(3913728), stream=stream0) | |
del buf579 | |
buf585 = buf563; del buf563 # reuse | |
buf586 = buf585; del buf585 # reuse | |
buf589 = empty_strided((624, ), (1, ), device='cuda', dtype=torch.float32) | |
buf587 = empty_strided((1, 624, 1, 1), (624, 1, 624, 624), device='cuda', dtype=torch.float32) | |
buf588 = empty_strided((624, ), (1, ), device='cuda', dtype=torch.float32) | |
buf590 = empty_strided((624, ), (1, ), device='cuda', dtype=torch.float32) | |
triton_red_fused__native_batch_norm_legit_functional_117.run(buf586, buf584, primals_400, primals_401, buf589, buf587, buf588, buf590, 624, 25088, grid=grid(624), stream=stream0) | |
del primals_400 | |
del primals_401 | |
buf591 = empty_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda', dtype=torch.float16) | |
buf593 = empty_strided((128, 624, 1, 1), (624, 1, 1, 1), device='cuda', dtype=torch.float16) | |
triton_per_fused__native_batch_norm_legit_functional_mean_silu_128.run(buf584, buf586, buf587, primals_63, primals_64, buf591, buf593, 79872, 196, grid=grid(79872), stream=stream0) | |
del primals_64 | |
buf594 = empty_strided((26, 624, 1, 1), (624, 1, 1, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__to_copy_129.run(primals_211, buf594, 16224, grid=grid(16224), stream=stream0) | |
del primals_211 | |
buf595 = buf527; del buf527 # reuse | |
triton_poi_fused__to_copy_convolution_130.run(primals_212, buf595, 26, grid=grid(26), stream=stream0) | |
del primals_212 | |
buf596 = extern_kernels.convolution(buf593, buf594, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None) | |
assert_size_stride(buf596, (128, 26, 1, 1), (26, 1, 1, 1)) | |
buf597 = buf596; del buf596 # reuse | |
buf598 = empty_strided((128, 26, 1, 1), (26, 1, 1, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__to_copy_convolution_silu_131.run(buf597, buf595, buf598, 3328, grid=grid(3328), stream=stream0) | |
del buf595 | |
buf599 = empty_strided((624, 26, 1, 1), (26, 1, 1, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__to_copy_132.run(primals_213, buf599, 16224, grid=grid(16224), stream=stream0) | |
del primals_213 | |
buf600 = buf532; del buf532 # reuse | |
triton_poi_fused__to_copy_convolution_133.run(primals_214, buf600, 624, grid=grid(624), stream=stream0) | |
del primals_214 | |
buf601 = extern_kernels.convolution(buf598, buf599, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None) | |
assert_size_stride(buf601, (128, 624, 1, 1), (624, 1, 1, 1)) | |
buf602 = buf601; del buf601 # reuse | |
triton_poi_fused__to_copy_convolution_134.run(buf602, buf600, 79872, grid=grid(79872), stream=stream0) | |
buf603 = empty_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused_mul_sigmoid_silu_135.run(buf591, buf602, buf603, 15654912, grid=grid(15654912), stream=stream0) | |
buf604 = buf557; del buf557 # reuse | |
triton_poi_fused_split_with_sizes_136.run(buf603, buf604, 7827456, grid=grid(7827456), stream=stream0) | |
buf605 = buf555; del buf555 # reuse | |
triton_poi_fused_split_with_sizes_137.run(buf603, buf605, 7827456, grid=grid(7827456), stream=stream0) | |
buf606 = empty_strided((52, 312, 1, 1), (312, 1, 1, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__to_copy_138.run(primals_215, buf606, 16224, grid=grid(16224), stream=stream0) | |
del primals_215 | |
buf607 = extern_kernels.convolution(buf604, buf606, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None) | |
assert_size_stride(buf607, (128, 52, 14, 14), (10192, 196, 14, 1)) | |
buf608 = empty_strided((52, 312, 1, 1), (312, 1, 1, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__to_copy_138.run(primals_216, buf608, 16224, grid=grid(16224), stream=stream0) | |
del primals_216 | |
buf609 = extern_kernels.convolution(buf605, buf608, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None) | |
assert_size_stride(buf609, (128, 52, 14, 14), (10192, 196, 14, 1)) | |
buf612 = empty_strided((128, 104, 14, 14), (20384, 196, 14, 1), device='cuda', dtype=torch.float16) | |
buf610 = as_strided(buf612, (128, 52, 14, 14), (20384, 196, 14, 1)) # alias | |
triton_poi_fused_cat_139.run(buf607, buf610, 1304576, grid=grid(1304576), stream=stream0) | |
del buf607 | |
buf611 = as_strided(buf612, (128, 52, 14, 14), (20384, 196, 14, 1), 10192) # alias | |
triton_poi_fused_cat_139.run(buf609, buf611, 1304576, grid=grid(1304576), stream=stream0) | |
del buf609 | |
buf613 = buf548; del buf548 # reuse | |
triton_red_fused__native_batch_norm_legit_functional_110.run(buf612, buf613, 416, 6272, grid=grid(416), stream=stream0) | |
buf614 = buf549; del buf549 # reuse | |
buf615 = buf614; del buf614 # reuse | |
buf619 = empty_strided((104, ), (1, ), device='cuda', dtype=torch.float32) | |
triton_per_fused__native_batch_norm_legit_functional_111.run(buf615, buf613, primals_403, buf619, 104, 4, grid=grid(104), stream=stream0) | |
del primals_403 | |
buf616 = buf613; del buf613 # reuse | |
triton_red_fused__native_batch_norm_legit_functional_112.run(buf612, buf615, buf616, 416, 6272, grid=grid(416), stream=stream0) | |
buf617 = empty_strided((1, 104, 1, 1), (104, 1, 104, 104), device='cuda', dtype=torch.float32) | |
buf618 = empty_strided((104, ), (1, ), device='cuda', dtype=torch.float32) | |
buf620 = empty_strided((104, ), (1, ), device='cuda', dtype=torch.float32) | |
triton_per_fused__native_batch_norm_legit_functional_113.run(buf616, primals_404, buf617, buf618, buf620, 104, 4, grid=grid(104), stream=stream0) | |
del buf616 | |
del primals_404 | |
buf621 = empty_strided((128, 104, 14, 14), (20384, 196, 14, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__native_batch_norm_legit_functional_add_140.run(buf612, buf615, buf617, primals_65, primals_66, buf553, buf621, 2609152, grid=grid(2609152), stream=stream0) | |
del buf617 | |
del primals_66 | |
buf622 = empty_strided((624, 104, 1, 1), (104, 1, 1, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__to_copy_141.run(primals_217, buf622, 64896, grid=grid(64896), stream=stream0) | |
del primals_217 | |
buf623 = extern_kernels.convolution(buf621, buf622, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None) | |
assert_size_stride(buf623, (128, 624, 14, 14), (122304, 196, 14, 1)) | |
buf624 = buf587; del buf587 # reuse | |
buf625 = buf624; del buf624 # reuse | |
buf628 = empty_strided((624, ), (1, ), device='cuda', dtype=torch.float32) | |
buf626 = empty_strided((1, 624, 1, 1), (624, 1, 624, 624), device='cuda', dtype=torch.float32) | |
buf627 = empty_strided((624, ), (1, ), device='cuda', dtype=torch.float32) | |
buf629 = empty_strided((624, ), (1, ), device='cuda', dtype=torch.float32) | |
triton_red_fused__native_batch_norm_legit_functional_117.run(buf625, buf623, primals_406, primals_407, buf628, buf626, buf627, buf629, 624, 25088, grid=grid(624), stream=stream0) | |
del primals_406 | |
del primals_407 | |
buf631 = buf603; del buf603 # reuse | |
buf1136 = empty_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__native_batch_norm_legit_functional_add_clone_fill_mul_sigmoid_silu_sub_142.run(buf623, buf625, buf626, primals_67, primals_68, buf631, buf1136, 15654912, grid=grid(15654912), stream=stream0) | |
del primals_68 | |
buf632 = empty_strided((624, 1, 3, 3), (9, 9, 3, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__to_copy_143.run(primals_218, buf632, 5616, grid=grid(5616), stream=stream0) | |
del primals_218 | |
buf633 = extern_kernels.convolution(buf631, buf632, stride=(1, 1), padding=(1, 1), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=624, bias=None) | |
assert_size_stride(buf633, (128, 624, 14, 14), (122304, 196, 14, 1)) | |
buf634 = buf626; del buf626 # reuse | |
buf635 = buf634; del buf634 # reuse | |
buf638 = empty_strided((624, ), (1, ), device='cuda', dtype=torch.float32) | |
buf636 = empty_strided((1, 624, 1, 1), (624, 1, 624, 624), device='cuda', dtype=torch.float32) | |
buf637 = empty_strided((624, ), (1, ), device='cuda', dtype=torch.float32) | |
buf639 = empty_strided((624, ), (1, ), device='cuda', dtype=torch.float32) | |
triton_red_fused__native_batch_norm_legit_functional_117.run(buf635, buf633, primals_409, primals_410, buf638, buf636, buf637, buf639, 624, 25088, grid=grid(624), stream=stream0) | |
del primals_409 | |
del primals_410 | |
buf640 = empty_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda', dtype=torch.float16) | |
buf642 = empty_strided((128, 624, 1, 1), (624, 1, 1, 1), device='cuda', dtype=torch.float16) | |
triton_per_fused__native_batch_norm_legit_functional_mean_silu_128.run(buf633, buf635, buf636, primals_69, primals_70, buf640, buf642, 79872, 196, grid=grid(79872), stream=stream0) | |
del buf636 | |
del primals_70 | |
buf643 = empty_strided((52, 624, 1, 1), (624, 1, 1, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__to_copy_144.run(primals_219, buf643, 32448, grid=grid(32448), stream=stream0) | |
del primals_219 | |
buf644 = empty_strided((52, ), (1, ), device='cuda', dtype=torch.float16) | |
triton_poi_fused__to_copy_convolution_145.run(primals_220, buf644, 52, grid=grid(52), stream=stream0) | |
del primals_220 | |
buf645 = extern_kernels.convolution(buf642, buf643, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None) | |
assert_size_stride(buf645, (128, 52, 1, 1), (52, 1, 1, 1)) | |
buf646 = buf645; del buf645 # reuse | |
buf647 = empty_strided((128, 52, 1, 1), (52, 1, 1, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__to_copy_convolution_silu_146.run(buf646, buf644, buf647, 6656, grid=grid(6656), stream=stream0) | |
del buf644 | |
buf648 = empty_strided((624, 52, 1, 1), (52, 1, 1, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__to_copy_147.run(primals_221, buf648, 32448, grid=grid(32448), stream=stream0) | |
del primals_221 | |
buf649 = buf600; del buf600 # reuse | |
triton_poi_fused__to_copy_convolution_133.run(primals_222, buf649, 624, grid=grid(624), stream=stream0) | |
del primals_222 | |
buf650 = extern_kernels.convolution(buf647, buf648, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None) | |
assert_size_stride(buf650, (128, 624, 1, 1), (624, 1, 1, 1)) | |
buf651 = buf650; del buf650 # reuse | |
triton_poi_fused__to_copy_convolution_134.run(buf651, buf649, 79872, grid=grid(79872), stream=stream0) | |
del buf649 | |
buf652 = empty_strided((128, 624, 14, 14), (122304, 196, 14, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused_mul_sigmoid_silu_135.run(buf640, buf651, buf652, 15654912, grid=grid(15654912), stream=stream0) | |
buf653 = empty_strided((160, 624, 1, 1), (624, 1, 1, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__to_copy_148.run(primals_223, buf653, 99840, grid=grid(99840), stream=stream0) | |
del primals_223 | |
buf654 = extern_kernels.convolution(buf652, buf653, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None) | |
assert_size_stride(buf654, (128, 160, 14, 14), (31360, 196, 14, 1)) | |
buf655 = empty_strided((1, 160, 1, 1, 4), (640, 1, 640, 640, 160), device='cuda', dtype=torch.float32) | |
triton_red_fused__native_batch_norm_legit_functional_149.run(buf654, buf655, 640, 6272, grid=grid(640), stream=stream0) | |
buf656 = empty_strided((1, 160, 1, 1), (160, 1, 160, 160), device='cuda', dtype=torch.float32) | |
buf657 = buf656; del buf656 # reuse | |
buf661 = empty_strided((160, ), (1, ), device='cuda', dtype=torch.float32) | |
triton_per_fused__native_batch_norm_legit_functional_150.run(buf657, buf655, primals_412, buf661, 160, 4, grid=grid(160), stream=stream0) | |
del primals_412 | |
buf658 = buf655; del buf655 # reuse | |
triton_red_fused__native_batch_norm_legit_functional_151.run(buf654, buf657, buf658, 640, 6272, grid=grid(640), stream=stream0) | |
buf659 = empty_strided((1, 160, 1, 1), (160, 1, 160, 160), device='cuda', dtype=torch.float32) | |
buf660 = empty_strided((160, ), (1, ), device='cuda', dtype=torch.float32) | |
buf662 = empty_strided((160, ), (1, ), device='cuda', dtype=torch.float32) | |
triton_per_fused__native_batch_norm_legit_functional_152.run(buf658, primals_413, buf659, buf660, buf662, 160, 4, grid=grid(160), stream=stream0) | |
del primals_413 | |
buf663 = empty_strided((128, 160, 14, 14), (31360, 196, 14, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__native_batch_norm_legit_functional_153.run(buf654, buf657, buf659, primals_71, primals_72, buf663, 4014080, grid=grid(4014080), stream=stream0) | |
del primals_72 | |
buf664 = empty_strided((240, 80, 1, 1), (80, 1, 1, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__to_copy_154.run(primals_224, buf664, 19200, grid=grid(19200), stream=stream0) | |
del primals_224 | |
buf665 = extern_kernels.convolution(as_strided(buf663, (128, 80, 14, 14), (31360, 196, 14, 1)), buf664, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None) | |
assert_size_stride(buf665, (128, 240, 14, 14), (47040, 196, 14, 1)) | |
buf666 = empty_strided((240, 80, 1, 1), (80, 1, 1, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__to_copy_154.run(primals_225, buf666, 19200, grid=grid(19200), stream=stream0) | |
del primals_225 | |
buf667 = extern_kernels.convolution(as_strided(buf663, (128, 80, 14, 14), (31360, 196, 14, 1), 15680), buf666, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None) | |
assert_size_stride(buf667, (128, 240, 14, 14), (47040, 196, 14, 1)) | |
buf670 = empty_strided((128, 480, 14, 14), (94080, 196, 14, 1), device='cuda', dtype=torch.float16) | |
buf668 = as_strided(buf670, (128, 240, 14, 14), (94080, 196, 14, 1)) # alias | |
triton_poi_fused_cat_155.run(buf665, buf668, 6021120, grid=grid(6021120), stream=stream0) | |
buf669 = as_strided(buf670, (128, 240, 14, 14), (94080, 196, 14, 1), 47040) # alias | |
triton_poi_fused_cat_155.run(buf667, buf669, 6021120, grid=grid(6021120), stream=stream0) | |
buf671 = as_strided(buf108, (1, 480, 1, 1), (480, 1, 480, 480)); del buf108 # reuse | |
buf672 = buf671; del buf671 # reuse | |
buf675 = empty_strided((480, ), (1, ), device='cuda', dtype=torch.float32) | |
buf673 = empty_strided((1, 480, 1, 1), (480, 1, 480, 480), device='cuda', dtype=torch.float32) | |
buf674 = empty_strided((480, ), (1, ), device='cuda', dtype=torch.float32) | |
buf676 = empty_strided((480, ), (1, ), device='cuda', dtype=torch.float32) | |
triton_red_fused__native_batch_norm_legit_functional_156.run(buf672, buf670, primals_415, primals_416, buf675, buf673, buf674, buf676, 480, 25088, grid=grid(480), stream=stream0) | |
del primals_415 | |
del primals_416 | |
buf677 = empty_strided((128, 480, 14, 14), (94080, 196, 14, 1), device='cuda', dtype=torch.float16) | |
buf1134 = empty_strided((128, 480, 14, 14), (94080, 196, 14, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__native_batch_norm_legit_functional_add_clone_fill_mul_sigmoid_sub_157.run(buf670, buf672, buf673, primals_73, primals_74, buf677, buf1134, 12042240, grid=grid(12042240), stream=stream0) | |
del primals_74 | |
buf678 = empty_strided((120, 1, 3, 3), (9, 9, 3, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__to_copy_42.run(primals_226, buf678, 1080, grid=grid(1080), stream=stream0) | |
del primals_226 | |
buf679 = empty_strided((128, 120, 14, 14), (23520, 196, 14, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused_split_with_sizes_158.run(buf677, buf679, 3010560, grid=grid(3010560), stream=stream0) | |
buf680 = extern_kernels.convolution(buf679, buf678, stride=(1, 1), padding=(1, 1), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=120, bias=None) | |
assert_size_stride(buf680, (128, 120, 14, 14), (23520, 196, 14, 1)) | |
buf681 = empty_strided((120, 1, 5, 5), (25, 25, 5, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__to_copy_159.run(primals_227, buf681, 3000, grid=grid(3000), stream=stream0) | |
del primals_227 | |
buf682 = empty_strided((128, 120, 14, 14), (23520, 196, 14, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused_split_with_sizes_160.run(buf677, buf682, 3010560, grid=grid(3010560), stream=stream0) | |
buf683 = extern_kernels.convolution(buf682, buf681, stride=(1, 1), padding=(2, 2), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=120, bias=None) | |
assert_size_stride(buf683, (128, 120, 14, 14), (23520, 196, 14, 1)) | |
buf684 = empty_strided((120, 1, 7, 7), (49, 49, 7, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__to_copy_161.run(primals_228, buf684, 5880, grid=grid(5880), stream=stream0) | |
del primals_228 | |
buf685 = empty_strided((128, 120, 14, 14), (23520, 196, 14, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused_split_with_sizes_162.run(buf677, buf685, 3010560, grid=grid(3010560), stream=stream0) | |
buf686 = extern_kernels.convolution(buf685, buf684, stride=(1, 1), padding=(3, 3), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=120, bias=None) | |
assert_size_stride(buf686, (128, 120, 14, 14), (23520, 196, 14, 1)) | |
buf687 = empty_strided((120, 1, 9, 9), (81, 81, 9, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__to_copy_163.run(primals_229, buf687, 9720, grid=grid(9720), stream=stream0) | |
del primals_229 | |
buf688 = empty_strided((128, 120, 14, 14), (23520, 196, 14, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused_split_with_sizes_164.run(buf677, buf688, 3010560, grid=grid(3010560), stream=stream0) | |
buf689 = extern_kernels.convolution(buf688, buf687, stride=(1, 1), padding=(4, 4), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=120, bias=None) | |
assert_size_stride(buf689, (128, 120, 14, 14), (23520, 196, 14, 1)) | |
buf694 = buf677; del buf677 # reuse | |
buf690 = as_strided(buf694, (128, 120, 14, 14), (94080, 196, 14, 1)) # alias | |
triton_poi_fused_cat_165.run(buf680, buf690, 3010560, grid=grid(3010560), stream=stream0) | |
buf691 = as_strided(buf694, (128, 120, 14, 14), (94080, 196, 14, 1), 23520) # alias | |
triton_poi_fused_cat_165.run(buf683, buf691, 3010560, grid=grid(3010560), stream=stream0) | |
buf692 = as_strided(buf694, (128, 120, 14, 14), (94080, 196, 14, 1), 47040) # alias | |
triton_poi_fused_cat_165.run(buf686, buf692, 3010560, grid=grid(3010560), stream=stream0) | |
buf693 = as_strided(buf694, (128, 120, 14, 14), (94080, 196, 14, 1), 70560) # alias | |
triton_poi_fused_cat_165.run(buf689, buf693, 3010560, grid=grid(3010560), stream=stream0) | |
buf695 = buf673; del buf673 # reuse | |
buf696 = buf695; del buf695 # reuse | |
buf699 = empty_strided((480, ), (1, ), device='cuda', dtype=torch.float32) | |
buf697 = empty_strided((1, 480, 1, 1), (480, 1, 480, 480), device='cuda', dtype=torch.float32) | |
buf698 = empty_strided((480, ), (1, ), device='cuda', dtype=torch.float32) | |
buf700 = empty_strided((480, ), (1, ), device='cuda', dtype=torch.float32) | |
triton_red_fused__native_batch_norm_legit_functional_156.run(buf696, buf694, primals_418, primals_419, buf699, buf697, buf698, buf700, 480, 25088, grid=grid(480), stream=stream0) | |
del primals_418 | |
del primals_419 | |
buf701 = empty_strided((128, 480, 14, 14), (94080, 196, 14, 1), device='cuda', dtype=torch.float16) | |
buf703 = empty_strided((128, 480, 1, 1), (480, 1, 1, 1), device='cuda', dtype=torch.float16) | |
triton_per_fused__native_batch_norm_legit_functional_mean_silu_166.run(buf694, buf696, buf697, primals_75, primals_76, buf701, buf703, 61440, 196, grid=grid(61440), stream=stream0) | |
del primals_76 | |
buf704 = empty_strided((80, 480, 1, 1), (480, 1, 1, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__to_copy_167.run(primals_230, buf704, 38400, grid=grid(38400), stream=stream0) | |
del primals_230 | |
buf705 = empty_strided((80, ), (1, ), device='cuda', dtype=torch.float16) | |
triton_poi_fused__to_copy_convolution_168.run(primals_231, buf705, 80, grid=grid(80), stream=stream0) | |
del primals_231 | |
buf706 = extern_kernels.convolution(buf703, buf704, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None) | |
assert_size_stride(buf706, (128, 80, 1, 1), (80, 1, 1, 1)) | |
buf707 = buf706; del buf706 # reuse | |
buf708 = empty_strided((128, 80, 1, 1), (80, 1, 1, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__to_copy_convolution_silu_169.run(buf707, buf705, buf708, 10240, grid=grid(10240), stream=stream0) | |
buf709 = empty_strided((480, 80, 1, 1), (80, 1, 1, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__to_copy_170.run(primals_232, buf709, 38400, grid=grid(38400), stream=stream0) | |
del primals_232 | |
buf710 = empty_strided((480, ), (1, ), device='cuda', dtype=torch.float16) | |
triton_poi_fused__to_copy_convolution_171.run(primals_233, buf710, 480, grid=grid(480), stream=stream0) | |
del primals_233 | |
buf711 = extern_kernels.convolution(buf708, buf709, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None) | |
assert_size_stride(buf711, (128, 480, 1, 1), (480, 1, 1, 1)) | |
buf712 = buf711; del buf711 # reuse | |
triton_poi_fused__to_copy_convolution_172.run(buf712, buf710, 61440, grid=grid(61440), stream=stream0) | |
buf713 = empty_strided((128, 480, 14, 14), (94080, 196, 14, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused_mul_sigmoid_silu_173.run(buf701, buf712, buf713, 12042240, grid=grid(12042240), stream=stream0) | |
buf714 = buf667; del buf667 # reuse | |
triton_poi_fused_split_with_sizes_174.run(buf713, buf714, 6021120, grid=grid(6021120), stream=stream0) | |
buf715 = buf665; del buf665 # reuse | |
triton_poi_fused_split_with_sizes_175.run(buf713, buf715, 6021120, grid=grid(6021120), stream=stream0) | |
buf716 = empty_strided((80, 240, 1, 1), (240, 1, 1, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__to_copy_176.run(primals_234, buf716, 19200, grid=grid(19200), stream=stream0) | |
del primals_234 | |
buf717 = extern_kernels.convolution(buf714, buf716, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None) | |
assert_size_stride(buf717, (128, 80, 14, 14), (15680, 196, 14, 1)) | |
buf718 = empty_strided((80, 240, 1, 1), (240, 1, 1, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__to_copy_176.run(primals_235, buf718, 19200, grid=grid(19200), stream=stream0) | |
del primals_235 | |
buf719 = extern_kernels.convolution(buf715, buf718, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None) | |
assert_size_stride(buf719, (128, 80, 14, 14), (15680, 196, 14, 1)) | |
buf722 = empty_strided((128, 160, 14, 14), (31360, 196, 14, 1), device='cuda', dtype=torch.float16) | |
buf720 = as_strided(buf722, (128, 80, 14, 14), (31360, 196, 14, 1)) # alias | |
triton_poi_fused_cat_177.run(buf717, buf720, 2007040, grid=grid(2007040), stream=stream0) | |
del buf717 | |
buf721 = as_strided(buf722, (128, 80, 14, 14), (31360, 196, 14, 1), 15680) # alias | |
triton_poi_fused_cat_177.run(buf719, buf721, 2007040, grid=grid(2007040), stream=stream0) | |
del buf719 | |
buf723 = buf658; del buf658 # reuse | |
triton_red_fused__native_batch_norm_legit_functional_149.run(buf722, buf723, 640, 6272, grid=grid(640), stream=stream0) | |
buf724 = buf659; del buf659 # reuse | |
buf725 = buf724; del buf724 # reuse | |
buf729 = empty_strided((160, ), (1, ), device='cuda', dtype=torch.float32) | |
triton_per_fused__native_batch_norm_legit_functional_150.run(buf725, buf723, primals_421, buf729, 160, 4, grid=grid(160), stream=stream0) | |
del primals_421 | |
buf726 = buf723; del buf723 # reuse | |
triton_red_fused__native_batch_norm_legit_functional_151.run(buf722, buf725, buf726, 640, 6272, grid=grid(640), stream=stream0) | |
buf727 = empty_strided((1, 160, 1, 1), (160, 1, 160, 160), device='cuda', dtype=torch.float32) | |
buf728 = empty_strided((160, ), (1, ), device='cuda', dtype=torch.float32) | |
buf730 = empty_strided((160, ), (1, ), device='cuda', dtype=torch.float32) | |
triton_per_fused__native_batch_norm_legit_functional_152.run(buf726, primals_422, buf727, buf728, buf730, 160, 4, grid=grid(160), stream=stream0) | |
del primals_422 | |
buf731 = empty_strided((128, 160, 14, 14), (31360, 196, 14, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__native_batch_norm_legit_functional_add_178.run(buf722, buf725, buf727, primals_77, primals_78, buf663, buf731, 4014080, grid=grid(4014080), stream=stream0) | |
del primals_78 | |
buf732 = empty_strided((240, 80, 1, 1), (80, 1, 1, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__to_copy_154.run(primals_236, buf732, 19200, grid=grid(19200), stream=stream0) | |
del primals_236 | |
buf733 = extern_kernels.convolution(as_strided(buf731, (128, 80, 14, 14), (31360, 196, 14, 1)), buf732, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None) | |
assert_size_stride(buf733, (128, 240, 14, 14), (47040, 196, 14, 1)) | |
buf734 = empty_strided((240, 80, 1, 1), (80, 1, 1, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__to_copy_154.run(primals_237, buf734, 19200, grid=grid(19200), stream=stream0) | |
del primals_237 | |
buf735 = extern_kernels.convolution(as_strided(buf731, (128, 80, 14, 14), (31360, 196, 14, 1), 15680), buf734, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None) | |
assert_size_stride(buf735, (128, 240, 14, 14), (47040, 196, 14, 1)) | |
buf738 = buf713; del buf713 # reuse | |
buf736 = as_strided(buf738, (128, 240, 14, 14), (94080, 196, 14, 1)) # alias | |
triton_poi_fused_cat_155.run(buf733, buf736, 6021120, grid=grid(6021120), stream=stream0) | |
buf737 = as_strided(buf738, (128, 240, 14, 14), (94080, 196, 14, 1), 47040) # alias | |
triton_poi_fused_cat_155.run(buf735, buf737, 6021120, grid=grid(6021120), stream=stream0) | |
buf739 = buf697; del buf697 # reuse | |
buf740 = buf739; del buf739 # reuse | |
buf743 = empty_strided((480, ), (1, ), device='cuda', dtype=torch.float32) | |
buf741 = empty_strided((1, 480, 1, 1), (480, 1, 480, 480), device='cuda', dtype=torch.float32) | |
buf742 = empty_strided((480, ), (1, ), device='cuda', dtype=torch.float32) | |
buf744 = empty_strided((480, ), (1, ), device='cuda', dtype=torch.float32) | |
triton_red_fused__native_batch_norm_legit_functional_156.run(buf740, buf738, primals_424, primals_425, buf743, buf741, buf742, buf744, 480, 25088, grid=grid(480), stream=stream0) | |
del primals_424 | |
del primals_425 | |
buf745 = empty_strided((128, 480, 14, 14), (94080, 196, 14, 1), device='cuda', dtype=torch.float16) | |
buf1132 = empty_strided((128, 480, 14, 14), (94080, 196, 14, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__native_batch_norm_legit_functional_add_clone_fill_mul_sigmoid_sub_157.run(buf738, buf740, buf741, primals_79, primals_80, buf745, buf1132, 12042240, grid=grid(12042240), stream=stream0) | |
del primals_80 | |
buf746 = empty_strided((120, 1, 3, 3), (9, 9, 3, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__to_copy_42.run(primals_238, buf746, 1080, grid=grid(1080), stream=stream0) | |
del primals_238 | |
buf747 = buf689; del buf689 # reuse | |
triton_poi_fused_split_with_sizes_158.run(buf745, buf747, 3010560, grid=grid(3010560), stream=stream0) | |
buf748 = extern_kernels.convolution(buf747, buf746, stride=(1, 1), padding=(1, 1), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=120, bias=None) | |
assert_size_stride(buf748, (128, 120, 14, 14), (23520, 196, 14, 1)) | |
buf749 = empty_strided((120, 1, 5, 5), (25, 25, 5, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__to_copy_159.run(primals_239, buf749, 3000, grid=grid(3000), stream=stream0) | |
del primals_239 | |
buf750 = buf686; del buf686 # reuse | |
triton_poi_fused_split_with_sizes_160.run(buf745, buf750, 3010560, grid=grid(3010560), stream=stream0) | |
buf751 = extern_kernels.convolution(buf750, buf749, stride=(1, 1), padding=(2, 2), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=120, bias=None) | |
assert_size_stride(buf751, (128, 120, 14, 14), (23520, 196, 14, 1)) | |
buf752 = empty_strided((120, 1, 7, 7), (49, 49, 7, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__to_copy_161.run(primals_240, buf752, 5880, grid=grid(5880), stream=stream0) | |
del primals_240 | |
buf753 = buf683; del buf683 # reuse | |
triton_poi_fused_split_with_sizes_162.run(buf745, buf753, 3010560, grid=grid(3010560), stream=stream0) | |
buf754 = extern_kernels.convolution(buf753, buf752, stride=(1, 1), padding=(3, 3), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=120, bias=None) | |
assert_size_stride(buf754, (128, 120, 14, 14), (23520, 196, 14, 1)) | |
buf755 = empty_strided((120, 1, 9, 9), (81, 81, 9, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__to_copy_163.run(primals_241, buf755, 9720, grid=grid(9720), stream=stream0) | |
del primals_241 | |
buf756 = buf680; del buf680 # reuse | |
triton_poi_fused_split_with_sizes_164.run(buf745, buf756, 3010560, grid=grid(3010560), stream=stream0) | |
buf757 = extern_kernels.convolution(buf756, buf755, stride=(1, 1), padding=(4, 4), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=120, bias=None) | |
assert_size_stride(buf757, (128, 120, 14, 14), (23520, 196, 14, 1)) | |
buf762 = buf745; del buf745 # reuse | |
buf758 = as_strided(buf762, (128, 120, 14, 14), (94080, 196, 14, 1)) # alias | |
triton_poi_fused_cat_165.run(buf748, buf758, 3010560, grid=grid(3010560), stream=stream0) | |
buf759 = as_strided(buf762, (128, 120, 14, 14), (94080, 196, 14, 1), 23520) # alias | |
triton_poi_fused_cat_165.run(buf751, buf759, 3010560, grid=grid(3010560), stream=stream0) | |
buf760 = as_strided(buf762, (128, 120, 14, 14), (94080, 196, 14, 1), 47040) # alias | |
triton_poi_fused_cat_165.run(buf754, buf760, 3010560, grid=grid(3010560), stream=stream0) | |
buf761 = as_strided(buf762, (128, 120, 14, 14), (94080, 196, 14, 1), 70560) # alias | |
triton_poi_fused_cat_165.run(buf757, buf761, 3010560, grid=grid(3010560), stream=stream0) | |
buf763 = buf741; del buf741 # reuse | |
buf764 = buf763; del buf763 # reuse | |
buf767 = empty_strided((480, ), (1, ), device='cuda', dtype=torch.float32) | |
buf765 = empty_strided((1, 480, 1, 1), (480, 1, 480, 480), device='cuda', dtype=torch.float32) | |
buf766 = empty_strided((480, ), (1, ), device='cuda', dtype=torch.float32) | |
buf768 = empty_strided((480, ), (1, ), device='cuda', dtype=torch.float32) | |
triton_red_fused__native_batch_norm_legit_functional_156.run(buf764, buf762, primals_427, primals_428, buf767, buf765, buf766, buf768, 480, 25088, grid=grid(480), stream=stream0) | |
del primals_427 | |
del primals_428 | |
buf769 = empty_strided((128, 480, 14, 14), (94080, 196, 14, 1), device='cuda', dtype=torch.float16) | |
buf771 = empty_strided((128, 480, 1, 1), (480, 1, 1, 1), device='cuda', dtype=torch.float16) | |
triton_per_fused__native_batch_norm_legit_functional_mean_silu_166.run(buf762, buf764, buf765, primals_81, primals_82, buf769, buf771, 61440, 196, grid=grid(61440), stream=stream0) | |
del primals_82 | |
buf772 = empty_strided((80, 480, 1, 1), (480, 1, 1, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__to_copy_167.run(primals_242, buf772, 38400, grid=grid(38400), stream=stream0) | |
del primals_242 | |
buf773 = buf705; del buf705 # reuse | |
triton_poi_fused__to_copy_convolution_168.run(primals_243, buf773, 80, grid=grid(80), stream=stream0) | |
del primals_243 | |
buf774 = extern_kernels.convolution(buf771, buf772, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None) | |
assert_size_stride(buf774, (128, 80, 1, 1), (80, 1, 1, 1)) | |
buf775 = buf774; del buf774 # reuse | |
buf776 = empty_strided((128, 80, 1, 1), (80, 1, 1, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__to_copy_convolution_silu_169.run(buf775, buf773, buf776, 10240, grid=grid(10240), stream=stream0) | |
buf777 = empty_strided((480, 80, 1, 1), (80, 1, 1, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__to_copy_170.run(primals_244, buf777, 38400, grid=grid(38400), stream=stream0) | |
del primals_244 | |
buf778 = buf710; del buf710 # reuse | |
triton_poi_fused__to_copy_convolution_171.run(primals_245, buf778, 480, grid=grid(480), stream=stream0) | |
del primals_245 | |
buf779 = extern_kernels.convolution(buf776, buf777, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None) | |
assert_size_stride(buf779, (128, 480, 1, 1), (480, 1, 1, 1)) | |
buf780 = buf779; del buf779 # reuse | |
triton_poi_fused__to_copy_convolution_172.run(buf780, buf778, 61440, grid=grid(61440), stream=stream0) | |
buf781 = empty_strided((128, 480, 14, 14), (94080, 196, 14, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused_mul_sigmoid_silu_173.run(buf769, buf780, buf781, 12042240, grid=grid(12042240), stream=stream0) | |
buf782 = buf735; del buf735 # reuse | |
triton_poi_fused_split_with_sizes_174.run(buf781, buf782, 6021120, grid=grid(6021120), stream=stream0) | |
buf783 = buf733; del buf733 # reuse | |
triton_poi_fused_split_with_sizes_175.run(buf781, buf783, 6021120, grid=grid(6021120), stream=stream0) | |
buf784 = empty_strided((80, 240, 1, 1), (240, 1, 1, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__to_copy_176.run(primals_246, buf784, 19200, grid=grid(19200), stream=stream0) | |
del primals_246 | |
buf785 = extern_kernels.convolution(buf782, buf784, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None) | |
assert_size_stride(buf785, (128, 80, 14, 14), (15680, 196, 14, 1)) | |
buf786 = empty_strided((80, 240, 1, 1), (240, 1, 1, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__to_copy_176.run(primals_247, buf786, 19200, grid=grid(19200), stream=stream0) | |
del primals_247 | |
buf787 = extern_kernels.convolution(buf783, buf786, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None) | |
assert_size_stride(buf787, (128, 80, 14, 14), (15680, 196, 14, 1)) | |
buf790 = empty_strided((128, 160, 14, 14), (31360, 196, 14, 1), device='cuda', dtype=torch.float16) | |
buf788 = as_strided(buf790, (128, 80, 14, 14), (31360, 196, 14, 1)) # alias | |
triton_poi_fused_cat_177.run(buf785, buf788, 2007040, grid=grid(2007040), stream=stream0) | |
del buf785 | |
buf789 = as_strided(buf790, (128, 80, 14, 14), (31360, 196, 14, 1), 15680) # alias | |
triton_poi_fused_cat_177.run(buf787, buf789, 2007040, grid=grid(2007040), stream=stream0) | |
del buf787 | |
buf791 = buf726; del buf726 # reuse | |
triton_red_fused__native_batch_norm_legit_functional_149.run(buf790, buf791, 640, 6272, grid=grid(640), stream=stream0) | |
buf792 = buf727; del buf727 # reuse | |
buf793 = buf792; del buf792 # reuse | |
buf797 = empty_strided((160, ), (1, ), device='cuda', dtype=torch.float32) | |
triton_per_fused__native_batch_norm_legit_functional_150.run(buf793, buf791, primals_430, buf797, 160, 4, grid=grid(160), stream=stream0) | |
del primals_430 | |
buf794 = buf791; del buf791 # reuse | |
triton_red_fused__native_batch_norm_legit_functional_151.run(buf790, buf793, buf794, 640, 6272, grid=grid(640), stream=stream0) | |
buf795 = empty_strided((1, 160, 1, 1), (160, 1, 160, 160), device='cuda', dtype=torch.float32) | |
buf796 = empty_strided((160, ), (1, ), device='cuda', dtype=torch.float32) | |
buf798 = empty_strided((160, ), (1, ), device='cuda', dtype=torch.float32) | |
triton_per_fused__native_batch_norm_legit_functional_152.run(buf794, primals_431, buf795, buf796, buf798, 160, 4, grid=grid(160), stream=stream0) | |
del primals_431 | |
buf799 = empty_strided((128, 160, 14, 14), (31360, 196, 14, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__native_batch_norm_legit_functional_add_178.run(buf790, buf793, buf795, primals_83, primals_84, buf731, buf799, 4014080, grid=grid(4014080), stream=stream0) | |
del primals_84 | |
buf800 = empty_strided((240, 80, 1, 1), (80, 1, 1, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__to_copy_154.run(primals_248, buf800, 19200, grid=grid(19200), stream=stream0) | |
del primals_248 | |
buf801 = extern_kernels.convolution(as_strided(buf799, (128, 80, 14, 14), (31360, 196, 14, 1)), buf800, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None) | |
assert_size_stride(buf801, (128, 240, 14, 14), (47040, 196, 14, 1)) | |
buf802 = empty_strided((240, 80, 1, 1), (80, 1, 1, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__to_copy_154.run(primals_249, buf802, 19200, grid=grid(19200), stream=stream0) | |
del primals_249 | |
buf803 = extern_kernels.convolution(as_strided(buf799, (128, 80, 14, 14), (31360, 196, 14, 1), 15680), buf802, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None) | |
assert_size_stride(buf803, (128, 240, 14, 14), (47040, 196, 14, 1)) | |
buf806 = buf781; del buf781 # reuse | |
buf804 = as_strided(buf806, (128, 240, 14, 14), (94080, 196, 14, 1)) # alias | |
triton_poi_fused_cat_155.run(buf801, buf804, 6021120, grid=grid(6021120), stream=stream0) | |
buf805 = as_strided(buf806, (128, 240, 14, 14), (94080, 196, 14, 1), 47040) # alias | |
triton_poi_fused_cat_155.run(buf803, buf805, 6021120, grid=grid(6021120), stream=stream0) | |
buf807 = buf765; del buf765 # reuse | |
buf808 = buf807; del buf807 # reuse | |
buf811 = empty_strided((480, ), (1, ), device='cuda', dtype=torch.float32) | |
buf809 = empty_strided((1, 480, 1, 1), (480, 1, 480, 480), device='cuda', dtype=torch.float32) | |
buf810 = empty_strided((480, ), (1, ), device='cuda', dtype=torch.float32) | |
buf812 = empty_strided((480, ), (1, ), device='cuda', dtype=torch.float32) | |
triton_red_fused__native_batch_norm_legit_functional_156.run(buf808, buf806, primals_433, primals_434, buf811, buf809, buf810, buf812, 480, 25088, grid=grid(480), stream=stream0) | |
del primals_433 | |
del primals_434 | |
buf813 = empty_strided((128, 480, 14, 14), (94080, 196, 14, 1), device='cuda', dtype=torch.float16) | |
buf1130 = empty_strided((128, 480, 14, 14), (94080, 196, 14, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__native_batch_norm_legit_functional_add_clone_fill_mul_sigmoid_sub_157.run(buf806, buf808, buf809, primals_85, primals_86, buf813, buf1130, 12042240, grid=grid(12042240), stream=stream0) | |
del primals_86 | |
buf814 = empty_strided((120, 1, 3, 3), (9, 9, 3, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__to_copy_42.run(primals_250, buf814, 1080, grid=grid(1080), stream=stream0) | |
del primals_250 | |
buf815 = buf757; del buf757 # reuse | |
triton_poi_fused_split_with_sizes_158.run(buf813, buf815, 3010560, grid=grid(3010560), stream=stream0) | |
buf816 = extern_kernels.convolution(buf815, buf814, stride=(1, 1), padding=(1, 1), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=120, bias=None) | |
assert_size_stride(buf816, (128, 120, 14, 14), (23520, 196, 14, 1)) | |
buf817 = empty_strided((120, 1, 5, 5), (25, 25, 5, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__to_copy_159.run(primals_251, buf817, 3000, grid=grid(3000), stream=stream0) | |
del primals_251 | |
buf818 = buf754; del buf754 # reuse | |
triton_poi_fused_split_with_sizes_160.run(buf813, buf818, 3010560, grid=grid(3010560), stream=stream0) | |
buf819 = extern_kernels.convolution(buf818, buf817, stride=(1, 1), padding=(2, 2), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=120, bias=None) | |
assert_size_stride(buf819, (128, 120, 14, 14), (23520, 196, 14, 1)) | |
buf820 = empty_strided((120, 1, 7, 7), (49, 49, 7, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__to_copy_161.run(primals_252, buf820, 5880, grid=grid(5880), stream=stream0) | |
del primals_252 | |
buf821 = buf751; del buf751 # reuse | |
triton_poi_fused_split_with_sizes_162.run(buf813, buf821, 3010560, grid=grid(3010560), stream=stream0) | |
buf822 = extern_kernels.convolution(buf821, buf820, stride=(1, 1), padding=(3, 3), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=120, bias=None) | |
assert_size_stride(buf822, (128, 120, 14, 14), (23520, 196, 14, 1)) | |
buf823 = empty_strided((120, 1, 9, 9), (81, 81, 9, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__to_copy_163.run(primals_253, buf823, 9720, grid=grid(9720), stream=stream0) | |
del primals_253 | |
buf824 = buf748; del buf748 # reuse | |
triton_poi_fused_split_with_sizes_164.run(buf813, buf824, 3010560, grid=grid(3010560), stream=stream0) | |
buf825 = extern_kernels.convolution(buf824, buf823, stride=(1, 1), padding=(4, 4), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=120, bias=None) | |
assert_size_stride(buf825, (128, 120, 14, 14), (23520, 196, 14, 1)) | |
buf830 = buf813; del buf813 # reuse | |
buf826 = as_strided(buf830, (128, 120, 14, 14), (94080, 196, 14, 1)) # alias | |
triton_poi_fused_cat_165.run(buf816, buf826, 3010560, grid=grid(3010560), stream=stream0) | |
del buf816 | |
buf827 = as_strided(buf830, (128, 120, 14, 14), (94080, 196, 14, 1), 23520) # alias | |
triton_poi_fused_cat_165.run(buf819, buf827, 3010560, grid=grid(3010560), stream=stream0) | |
del buf819 | |
buf828 = as_strided(buf830, (128, 120, 14, 14), (94080, 196, 14, 1), 47040) # alias | |
triton_poi_fused_cat_165.run(buf822, buf828, 3010560, grid=grid(3010560), stream=stream0) | |
del buf822 | |
buf829 = as_strided(buf830, (128, 120, 14, 14), (94080, 196, 14, 1), 70560) # alias | |
triton_poi_fused_cat_165.run(buf825, buf829, 3010560, grid=grid(3010560), stream=stream0) | |
del buf825 | |
buf831 = buf809; del buf809 # reuse | |
buf832 = buf831; del buf831 # reuse | |
buf835 = empty_strided((480, ), (1, ), device='cuda', dtype=torch.float32) | |
buf833 = empty_strided((1, 480, 1, 1), (480, 1, 480, 480), device='cuda', dtype=torch.float32) | |
buf834 = empty_strided((480, ), (1, ), device='cuda', dtype=torch.float32) | |
buf836 = empty_strided((480, ), (1, ), device='cuda', dtype=torch.float32) | |
triton_red_fused__native_batch_norm_legit_functional_156.run(buf832, buf830, primals_436, primals_437, buf835, buf833, buf834, buf836, 480, 25088, grid=grid(480), stream=stream0) | |
del primals_436 | |
del primals_437 | |
buf837 = empty_strided((128, 480, 14, 14), (94080, 196, 14, 1), device='cuda', dtype=torch.float16) | |
buf839 = empty_strided((128, 480, 1, 1), (480, 1, 1, 1), device='cuda', dtype=torch.float16) | |
triton_per_fused__native_batch_norm_legit_functional_mean_silu_166.run(buf830, buf832, buf833, primals_87, primals_88, buf837, buf839, 61440, 196, grid=grid(61440), stream=stream0) | |
del buf833 | |
del primals_88 | |
buf840 = empty_strided((80, 480, 1, 1), (480, 1, 1, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__to_copy_167.run(primals_254, buf840, 38400, grid=grid(38400), stream=stream0) | |
del primals_254 | |
buf841 = buf773; del buf773 # reuse | |
triton_poi_fused__to_copy_convolution_168.run(primals_255, buf841, 80, grid=grid(80), stream=stream0) | |
del primals_255 | |
buf842 = extern_kernels.convolution(buf839, buf840, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None) | |
assert_size_stride(buf842, (128, 80, 1, 1), (80, 1, 1, 1)) | |
buf843 = buf842; del buf842 # reuse | |
buf844 = empty_strided((128, 80, 1, 1), (80, 1, 1, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__to_copy_convolution_silu_169.run(buf843, buf841, buf844, 10240, grid=grid(10240), stream=stream0) | |
buf845 = empty_strided((480, 80, 1, 1), (80, 1, 1, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__to_copy_170.run(primals_256, buf845, 38400, grid=grid(38400), stream=stream0) | |
del primals_256 | |
buf846 = buf778; del buf778 # reuse | |
triton_poi_fused__to_copy_convolution_171.run(primals_257, buf846, 480, grid=grid(480), stream=stream0) | |
del primals_257 | |
buf847 = extern_kernels.convolution(buf844, buf845, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None) | |
assert_size_stride(buf847, (128, 480, 1, 1), (480, 1, 1, 1)) | |
buf848 = buf847; del buf847 # reuse | |
triton_poi_fused__to_copy_convolution_172.run(buf848, buf846, 61440, grid=grid(61440), stream=stream0) | |
del buf846 | |
buf849 = empty_strided((128, 480, 14, 14), (94080, 196, 14, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused_mul_sigmoid_silu_173.run(buf837, buf848, buf849, 12042240, grid=grid(12042240), stream=stream0) | |
buf850 = buf803; del buf803 # reuse | |
triton_poi_fused_split_with_sizes_174.run(buf849, buf850, 6021120, grid=grid(6021120), stream=stream0) | |
buf851 = buf801; del buf801 # reuse | |
triton_poi_fused_split_with_sizes_175.run(buf849, buf851, 6021120, grid=grid(6021120), stream=stream0) | |
del buf849 | |
buf852 = empty_strided((80, 240, 1, 1), (240, 1, 1, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__to_copy_176.run(primals_258, buf852, 19200, grid=grid(19200), stream=stream0) | |
del primals_258 | |
buf853 = extern_kernels.convolution(buf850, buf852, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None) | |
assert_size_stride(buf853, (128, 80, 14, 14), (15680, 196, 14, 1)) | |
buf854 = empty_strided((80, 240, 1, 1), (240, 1, 1, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__to_copy_176.run(primals_259, buf854, 19200, grid=grid(19200), stream=stream0) | |
del primals_259 | |
buf855 = extern_kernels.convolution(buf851, buf854, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None) | |
assert_size_stride(buf855, (128, 80, 14, 14), (15680, 196, 14, 1)) | |
buf858 = empty_strided((128, 160, 14, 14), (31360, 196, 14, 1), device='cuda', dtype=torch.float16) | |
buf856 = as_strided(buf858, (128, 80, 14, 14), (31360, 196, 14, 1)) # alias | |
triton_poi_fused_cat_177.run(buf853, buf856, 2007040, grid=grid(2007040), stream=stream0) | |
del buf853 | |
buf857 = as_strided(buf858, (128, 80, 14, 14), (31360, 196, 14, 1), 15680) # alias | |
triton_poi_fused_cat_177.run(buf855, buf857, 2007040, grid=grid(2007040), stream=stream0) | |
del buf855 | |
buf859 = buf794; del buf794 # reuse | |
triton_red_fused__native_batch_norm_legit_functional_149.run(buf858, buf859, 640, 6272, grid=grid(640), stream=stream0) | |
buf860 = buf795; del buf795 # reuse | |
buf861 = buf860; del buf860 # reuse | |
buf865 = empty_strided((160, ), (1, ), device='cuda', dtype=torch.float32) | |
triton_per_fused__native_batch_norm_legit_functional_150.run(buf861, buf859, primals_439, buf865, 160, 4, grid=grid(160), stream=stream0) | |
del primals_439 | |
buf862 = buf859; del buf859 # reuse | |
triton_red_fused__native_batch_norm_legit_functional_151.run(buf858, buf861, buf862, 640, 6272, grid=grid(640), stream=stream0) | |
buf863 = empty_strided((1, 160, 1, 1), (160, 1, 160, 160), device='cuda', dtype=torch.float32) | |
buf864 = empty_strided((160, ), (1, ), device='cuda', dtype=torch.float32) | |
buf866 = empty_strided((160, ), (1, ), device='cuda', dtype=torch.float32) | |
triton_per_fused__native_batch_norm_legit_functional_152.run(buf862, primals_440, buf863, buf864, buf866, 160, 4, grid=grid(160), stream=stream0) | |
del buf862 | |
del primals_440 | |
buf867 = empty_strided((128, 160, 14, 14), (31360, 196, 14, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__native_batch_norm_legit_functional_add_178.run(buf858, buf861, buf863, primals_89, primals_90, buf799, buf867, 4014080, grid=grid(4014080), stream=stream0) | |
del buf863 | |
del primals_90 | |
buf868 = empty_strided((960, 160, 1, 1), (160, 1, 1, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__to_copy_179.run(primals_260, buf868, 153600, grid=grid(153600), stream=stream0) | |
del primals_260 | |
buf869 = extern_kernels.convolution(buf867, buf868, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None) | |
assert_size_stride(buf869, (128, 960, 14, 14), (188160, 196, 14, 1)) | |
buf870 = empty_strided((1, 960, 1, 1), (960, 1, 960, 960), device='cuda', dtype=torch.float32) | |
buf871 = buf870; del buf870 # reuse | |
buf874 = empty_strided((960, ), (1, ), device='cuda', dtype=torch.float32) | |
buf872 = empty_strided((1, 960, 1, 1), (960, 1, 960, 960), device='cuda', dtype=torch.float32) | |
buf873 = empty_strided((960, ), (1, ), device='cuda', dtype=torch.float32) | |
buf875 = empty_strided((960, ), (1, ), device='cuda', dtype=torch.float32) | |
triton_red_fused__native_batch_norm_legit_functional_180.run(buf871, buf869, primals_442, primals_443, buf874, buf872, buf873, buf875, 960, 25088, grid=grid(960), stream=stream0) | |
del primals_442 | |
del primals_443 | |
buf876 = empty_strided((128, 960, 14, 14), (188160, 196, 14, 1), device='cuda', dtype=torch.float16) | |
buf1128 = empty_strided((128, 960, 14, 14), (188160, 196, 14, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__native_batch_norm_legit_functional_add_clone_fill_mul_sigmoid_sub_181.run(buf869, buf871, buf872, primals_91, primals_92, buf876, buf1128, 24084480, grid=grid(24084480), stream=stream0) | |
del primals_92 | |
buf877 = empty_strided((240, 1, 3, 3), (9, 9, 3, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__to_copy_182.run(primals_261, buf877, 2160, grid=grid(2160), stream=stream0) | |
del primals_261 | |
buf878 = as_strided(buf150, (128, 240, 14, 14), (47040, 196, 14, 1)); del buf150 # reuse | |
triton_poi_fused_split_with_sizes_183.run(buf876, buf878, 6021120, grid=grid(6021120), stream=stream0) | |
buf879 = extern_kernels.convolution(buf878, buf877, stride=(2, 2), padding=(1, 1), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=240, bias=None) | |
assert_size_stride(buf879, (128, 240, 7, 7), (11760, 49, 7, 1)) | |
buf880 = empty_strided((240, 1, 5, 5), (25, 25, 5, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__to_copy_184.run(primals_262, buf880, 6000, grid=grid(6000), stream=stream0) | |
del primals_262 | |
buf881 = as_strided(buf147, (128, 240, 14, 14), (47040, 196, 14, 1)); del buf147 # reuse | |
triton_poi_fused_split_with_sizes_185.run(buf876, buf881, 6021120, grid=grid(6021120), stream=stream0) | |
buf882 = extern_kernels.convolution(buf881, buf880, stride=(2, 2), padding=(2, 2), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=240, bias=None) | |
assert_size_stride(buf882, (128, 240, 7, 7), (11760, 49, 7, 1)) | |
buf883 = empty_strided((240, 1, 7, 7), (49, 49, 7, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__to_copy_186.run(primals_263, buf883, 11760, grid=grid(11760), stream=stream0) | |
del primals_263 | |
buf884 = as_strided(buf144, (128, 240, 14, 14), (47040, 196, 14, 1)); del buf144 # reuse | |
triton_poi_fused_split_with_sizes_187.run(buf876, buf884, 6021120, grid=grid(6021120), stream=stream0) | |
buf885 = extern_kernels.convolution(buf884, buf883, stride=(2, 2), padding=(3, 3), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=240, bias=None) | |
assert_size_stride(buf885, (128, 240, 7, 7), (11760, 49, 7, 1)) | |
buf886 = empty_strided((240, 1, 9, 9), (81, 81, 9, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__to_copy_188.run(primals_264, buf886, 19440, grid=grid(19440), stream=stream0) | |
del primals_264 | |
buf887 = as_strided(buf141, (128, 240, 14, 14), (47040, 196, 14, 1)); del buf141 # reuse | |
triton_poi_fused_split_with_sizes_189.run(buf876, buf887, 6021120, grid=grid(6021120), stream=stream0) | |
del buf876 | |
buf888 = extern_kernels.convolution(buf887, buf886, stride=(2, 2), padding=(4, 4), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=240, bias=None) | |
assert_size_stride(buf888, (128, 240, 7, 7), (11760, 49, 7, 1)) | |
buf893 = empty_strided((128, 960, 7, 7), (47040, 49, 7, 1), device='cuda', dtype=torch.float16) | |
buf889 = as_strided(buf893, (128, 240, 7, 7), (47040, 49, 7, 1)) # alias | |
triton_poi_fused_cat_190.run(buf879, buf889, 1505280, grid=grid(1505280), stream=stream0) | |
del buf879 | |
buf890 = as_strided(buf893, (128, 240, 7, 7), (47040, 49, 7, 1), 11760) # alias | |
triton_poi_fused_cat_190.run(buf882, buf890, 1505280, grid=grid(1505280), stream=stream0) | |
del buf882 | |
buf891 = as_strided(buf893, (128, 240, 7, 7), (47040, 49, 7, 1), 23520) # alias | |
triton_poi_fused_cat_190.run(buf885, buf891, 1505280, grid=grid(1505280), stream=stream0) | |
del buf885 | |
buf892 = as_strided(buf893, (128, 240, 7, 7), (47040, 49, 7, 1), 35280) # alias | |
triton_poi_fused_cat_190.run(buf888, buf892, 1505280, grid=grid(1505280), stream=stream0) | |
del buf888 | |
buf894 = buf872; del buf872 # reuse | |
buf895 = buf894; del buf894 # reuse | |
buf898 = empty_strided((960, ), (1, ), device='cuda', dtype=torch.float32) | |
buf896 = empty_strided((1, 960, 1, 1), (960, 1, 960, 960), device='cuda', dtype=torch.float32) | |
buf897 = empty_strided((960, ), (1, ), device='cuda', dtype=torch.float32) | |
buf899 = empty_strided((960, ), (1, ), device='cuda', dtype=torch.float32) | |
triton_red_fused__native_batch_norm_legit_functional_191.run(buf895, buf893, primals_445, primals_446, buf898, buf896, buf897, buf899, 960, 6272, grid=grid(960), stream=stream0) | |
del primals_445 | |
del primals_446 | |
buf900 = empty_strided((128, 960, 7, 7), (47040, 49, 7, 1), device='cuda', dtype=torch.float16) | |
buf902 = empty_strided((128, 960, 1, 1), (960, 1, 1, 1), device='cuda', dtype=torch.float16) | |
triton_per_fused__native_batch_norm_legit_functional_mean_silu_192.run(buf893, buf895, buf896, primals_93, primals_94, buf900, buf902, 122880, 49, grid=grid(122880), stream=stream0) | |
del buf896 | |
del primals_94 | |
buf903 = empty_strided((80, 960, 1, 1), (960, 1, 1, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__to_copy_193.run(primals_265, buf903, 76800, grid=grid(76800), stream=stream0) | |
del primals_265 | |
buf904 = buf841; del buf841 # reuse | |
triton_poi_fused__to_copy_convolution_168.run(primals_266, buf904, 80, grid=grid(80), stream=stream0) | |
del primals_266 | |
buf905 = extern_kernels.convolution(buf902, buf903, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None) | |
assert_size_stride(buf905, (128, 80, 1, 1), (80, 1, 1, 1)) | |
buf906 = buf905; del buf905 # reuse | |
buf907 = empty_strided((128, 80, 1, 1), (80, 1, 1, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__to_copy_convolution_silu_169.run(buf906, buf904, buf907, 10240, grid=grid(10240), stream=stream0) | |
del buf904 | |
buf908 = empty_strided((960, 80, 1, 1), (80, 1, 1, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__to_copy_194.run(primals_267, buf908, 76800, grid=grid(76800), stream=stream0) | |
del primals_267 | |
buf909 = empty_strided((960, ), (1, ), device='cuda', dtype=torch.float16) | |
triton_poi_fused__to_copy_convolution_195.run(primals_268, buf909, 960, grid=grid(960), stream=stream0) | |
del primals_268 | |
buf910 = extern_kernels.convolution(buf907, buf908, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None) | |
assert_size_stride(buf910, (128, 960, 1, 1), (960, 1, 1, 1)) | |
buf911 = buf910; del buf910 # reuse | |
triton_poi_fused__to_copy_convolution_196.run(buf911, buf909, 122880, grid=grid(122880), stream=stream0) | |
del buf909 | |
buf912 = empty_strided((128, 960, 7, 7), (47040, 49, 7, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused_mul_sigmoid_silu_197.run(buf900, buf911, buf912, 6021120, grid=grid(6021120), stream=stream0) | |
buf913 = empty_strided((264, 960, 1, 1), (960, 1, 1, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__to_copy_198.run(primals_269, buf913, 253440, grid=grid(253440), stream=stream0) | |
del primals_269 | |
buf914 = extern_kernels.convolution(buf912, buf913, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None) | |
assert_size_stride(buf914, (128, 264, 7, 7), (12936, 49, 7, 1)) | |
buf915 = empty_strided((1, 264, 1, 1), (264, 1, 264, 264), device='cuda', dtype=torch.float32) | |
buf916 = buf915; del buf915 # reuse | |
buf919 = empty_strided((264, ), (1, ), device='cuda', dtype=torch.float32) | |
buf917 = empty_strided((1, 264, 1, 1), (264, 1, 264, 264), device='cuda', dtype=torch.float32) | |
buf918 = empty_strided((264, ), (1, ), device='cuda', dtype=torch.float32) | |
buf920 = empty_strided((264, ), (1, ), device='cuda', dtype=torch.float32) | |
triton_red_fused__native_batch_norm_legit_functional_199.run(buf916, buf914, primals_448, primals_449, buf919, buf917, buf918, buf920, 264, 6272, grid=grid(264), stream=stream0) | |
del primals_448 | |
del primals_449 | |
buf921 = empty_strided((128, 264, 7, 7), (12936, 49, 7, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__native_batch_norm_legit_functional_200.run(buf914, buf916, buf917, primals_95, primals_96, buf921, 1655808, grid=grid(1655808), stream=stream0) | |
del primals_96 | |
buf922 = empty_strided((1584, 264, 1, 1), (264, 1, 1, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__to_copy_201.run(primals_270, buf922, 418176, grid=grid(418176), stream=stream0) | |
del primals_270 | |
buf923 = extern_kernels.convolution(buf921, buf922, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None) | |
assert_size_stride(buf923, (128, 1584, 7, 7), (77616, 49, 7, 1)) | |
buf924 = empty_strided((1, 1584, 1, 1), (1584, 1, 1584, 1584), device='cuda', dtype=torch.float32) | |
buf925 = buf924; del buf924 # reuse | |
buf928 = empty_strided((1584, ), (1, ), device='cuda', dtype=torch.float32) | |
buf926 = empty_strided((1, 1584, 1, 1), (1584, 1, 1584, 1584), device='cuda', dtype=torch.float32) | |
buf927 = empty_strided((1584, ), (1, ), device='cuda', dtype=torch.float32) | |
buf929 = empty_strided((1584, ), (1, ), device='cuda', dtype=torch.float32) | |
triton_red_fused__native_batch_norm_legit_functional_202.run(buf925, buf923, primals_451, primals_452, buf928, buf926, buf927, buf929, 1584, 6272, grid=grid(1584), stream=stream0) | |
del primals_451 | |
del primals_452 | |
buf930 = empty_strided((128, 1584, 7, 7), (77616, 49, 7, 1), device='cuda', dtype=torch.float16) | |
buf1126 = empty_strided((128, 1584, 7, 7), (77616, 49, 7, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__native_batch_norm_legit_functional_add_clone_fill_mul_sigmoid_sub_203.run(buf923, buf925, buf926, primals_97, primals_98, buf930, buf1126, 9934848, grid=grid(9934848), stream=stream0) | |
del primals_98 | |
buf931 = empty_strided((396, 1, 3, 3), (9, 9, 3, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__to_copy_204.run(primals_271, buf931, 3564, grid=grid(3564), stream=stream0) | |
del primals_271 | |
buf932 = empty_strided((128, 396, 7, 7), (19404, 49, 7, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused_split_with_sizes_205.run(buf930, buf932, 2483712, grid=grid(2483712), stream=stream0) | |
buf933 = extern_kernels.convolution(buf932, buf931, stride=(1, 1), padding=(1, 1), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=396, bias=None) | |
assert_size_stride(buf933, (128, 396, 7, 7), (19404, 49, 7, 1)) | |
buf934 = empty_strided((396, 1, 5, 5), (25, 25, 5, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__to_copy_206.run(primals_272, buf934, 9900, grid=grid(9900), stream=stream0) | |
del primals_272 | |
buf935 = empty_strided((128, 396, 7, 7), (19404, 49, 7, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused_split_with_sizes_207.run(buf930, buf935, 2483712, grid=grid(2483712), stream=stream0) | |
buf936 = extern_kernels.convolution(buf935, buf934, stride=(1, 1), padding=(2, 2), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=396, bias=None) | |
assert_size_stride(buf936, (128, 396, 7, 7), (19404, 49, 7, 1)) | |
buf937 = empty_strided((396, 1, 7, 7), (49, 49, 7, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__to_copy_208.run(primals_273, buf937, 19404, grid=grid(19404), stream=stream0) | |
del primals_273 | |
buf938 = empty_strided((128, 396, 7, 7), (19404, 49, 7, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused_split_with_sizes_209.run(buf930, buf938, 2483712, grid=grid(2483712), stream=stream0) | |
buf939 = extern_kernels.convolution(buf938, buf937, stride=(1, 1), padding=(3, 3), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=396, bias=None) | |
assert_size_stride(buf939, (128, 396, 7, 7), (19404, 49, 7, 1)) | |
buf940 = empty_strided((396, 1, 9, 9), (81, 81, 9, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__to_copy_210.run(primals_274, buf940, 32076, grid=grid(32076), stream=stream0) | |
del primals_274 | |
buf941 = empty_strided((128, 396, 7, 7), (19404, 49, 7, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused_split_with_sizes_211.run(buf930, buf941, 2483712, grid=grid(2483712), stream=stream0) | |
buf942 = extern_kernels.convolution(buf941, buf940, stride=(1, 1), padding=(4, 4), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=396, bias=None) | |
assert_size_stride(buf942, (128, 396, 7, 7), (19404, 49, 7, 1)) | |
buf947 = buf930; del buf930 # reuse | |
buf943 = as_strided(buf947, (128, 396, 7, 7), (77616, 49, 7, 1)) # alias | |
triton_poi_fused_cat_212.run(buf933, buf943, 2483712, grid=grid(2483712), stream=stream0) | |
buf944 = as_strided(buf947, (128, 396, 7, 7), (77616, 49, 7, 1), 19404) # alias | |
triton_poi_fused_cat_213.run(buf936, buf944, 2483712, grid=grid(2483712), stream=stream0) | |
buf945 = as_strided(buf947, (128, 396, 7, 7), (77616, 49, 7, 1), 38808) # alias | |
triton_poi_fused_cat_213.run(buf939, buf945, 2483712, grid=grid(2483712), stream=stream0) | |
buf946 = as_strided(buf947, (128, 396, 7, 7), (77616, 49, 7, 1), 58212) # alias | |
triton_poi_fused_cat_213.run(buf942, buf946, 2483712, grid=grid(2483712), stream=stream0) | |
buf948 = buf926; del buf926 # reuse | |
buf949 = buf948; del buf948 # reuse | |
buf952 = empty_strided((1584, ), (1, ), device='cuda', dtype=torch.float32) | |
buf950 = empty_strided((1, 1584, 1, 1), (1584, 1, 1584, 1584), device='cuda', dtype=torch.float32) | |
buf951 = empty_strided((1584, ), (1, ), device='cuda', dtype=torch.float32) | |
buf953 = empty_strided((1584, ), (1, ), device='cuda', dtype=torch.float32) | |
triton_red_fused__native_batch_norm_legit_functional_202.run(buf949, buf947, primals_454, primals_455, buf952, buf950, buf951, buf953, 1584, 6272, grid=grid(1584), stream=stream0) | |
del primals_454 | |
del primals_455 | |
buf954 = empty_strided((128, 1584, 7, 7), (77616, 49, 7, 1), device='cuda', dtype=torch.float16) | |
buf956 = empty_strided((128, 1584, 1, 1), (1584, 1, 1, 1), device='cuda', dtype=torch.float16) | |
triton_per_fused__native_batch_norm_legit_functional_mean_silu_214.run(buf947, buf949, buf950, primals_99, primals_100, buf954, buf956, 202752, 49, grid=grid(202752), stream=stream0) | |
del primals_100 | |
buf957 = empty_strided((132, 1584, 1, 1), (1584, 1, 1, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__to_copy_215.run(primals_275, buf957, 209088, grid=grid(209088), stream=stream0) | |
del primals_275 | |
buf958 = empty_strided((132, ), (1, ), device='cuda', dtype=torch.float16) | |
triton_poi_fused__to_copy_convolution_216.run(primals_276, buf958, 132, grid=grid(132), stream=stream0) | |
del primals_276 | |
buf959 = extern_kernels.convolution(buf956, buf957, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None) | |
assert_size_stride(buf959, (128, 132, 1, 1), (132, 1, 1, 1)) | |
buf960 = buf959; del buf959 # reuse | |
buf961 = empty_strided((128, 132, 1, 1), (132, 1, 1, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__to_copy_convolution_silu_217.run(buf960, buf958, buf961, 16896, grid=grid(16896), stream=stream0) | |
buf962 = empty_strided((1584, 132, 1, 1), (132, 1, 1, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__to_copy_218.run(primals_277, buf962, 209088, grid=grid(209088), stream=stream0) | |
del primals_277 | |
buf963 = empty_strided((1584, ), (1, ), device='cuda', dtype=torch.float16) | |
triton_poi_fused__to_copy_convolution_219.run(primals_278, buf963, 1584, grid=grid(1584), stream=stream0) | |
del primals_278 | |
buf964 = extern_kernels.convolution(buf961, buf962, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None) | |
assert_size_stride(buf964, (128, 1584, 1, 1), (1584, 1, 1, 1)) | |
buf965 = buf964; del buf964 # reuse | |
triton_poi_fused__to_copy_convolution_220.run(buf965, buf963, 202752, grid=grid(202752), stream=stream0) | |
buf966 = empty_strided((128, 1584, 7, 7), (77616, 49, 7, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused_mul_sigmoid_silu_221.run(buf954, buf965, buf966, 9934848, grid=grid(9934848), stream=stream0) | |
buf967 = empty_strided((128, 792, 7, 7), (38808, 49, 7, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused_split_with_sizes_222.run(buf966, buf967, 4967424, grid=grid(4967424), stream=stream0) | |
buf968 = empty_strided((128, 792, 7, 7), (38808, 49, 7, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused_split_with_sizes_223.run(buf966, buf968, 4967424, grid=grid(4967424), stream=stream0) | |
buf969 = empty_strided((132, 792, 1, 1), (792, 1, 1, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__to_copy_224.run(primals_279, buf969, 104544, grid=grid(104544), stream=stream0) | |
del primals_279 | |
buf970 = extern_kernels.convolution(buf967, buf969, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None) | |
assert_size_stride(buf970, (128, 132, 7, 7), (6468, 49, 7, 1)) | |
buf971 = empty_strided((132, 792, 1, 1), (792, 1, 1, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__to_copy_224.run(primals_280, buf971, 104544, grid=grid(104544), stream=stream0) | |
del primals_280 | |
buf972 = extern_kernels.convolution(buf968, buf971, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None) | |
assert_size_stride(buf972, (128, 132, 7, 7), (6468, 49, 7, 1)) | |
buf975 = empty_strided((128, 264, 7, 7), (12936, 49, 7, 1), device='cuda', dtype=torch.float16) | |
buf973 = as_strided(buf975, (128, 132, 7, 7), (12936, 49, 7, 1)) # alias | |
triton_poi_fused_cat_225.run(buf970, buf973, 827904, grid=grid(827904), stream=stream0) | |
del buf970 | |
buf974 = as_strided(buf975, (128, 132, 7, 7), (12936, 49, 7, 1), 6468) # alias | |
triton_poi_fused_cat_226.run(buf972, buf974, 827904, grid=grid(827904), stream=stream0) | |
del buf972 | |
buf976 = buf917; del buf917 # reuse | |
buf977 = buf976; del buf976 # reuse | |
buf980 = empty_strided((264, ), (1, ), device='cuda', dtype=torch.float32) | |
buf978 = empty_strided((1, 264, 1, 1), (264, 1, 264, 264), device='cuda', dtype=torch.float32) | |
buf979 = empty_strided((264, ), (1, ), device='cuda', dtype=torch.float32) | |
buf981 = empty_strided((264, ), (1, ), device='cuda', dtype=torch.float32) | |
triton_red_fused__native_batch_norm_legit_functional_199.run(buf977, buf975, primals_457, primals_458, buf980, buf978, buf979, buf981, 264, 6272, grid=grid(264), stream=stream0) | |
del primals_457 | |
del primals_458 | |
buf982 = empty_strided((128, 264, 7, 7), (12936, 49, 7, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__native_batch_norm_legit_functional_add_227.run(buf975, buf977, buf978, primals_101, primals_102, buf921, buf982, 1655808, grid=grid(1655808), stream=stream0) | |
del primals_102 | |
buf983 = empty_strided((1584, 264, 1, 1), (264, 1, 1, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__to_copy_201.run(primals_281, buf983, 418176, grid=grid(418176), stream=stream0) | |
del primals_281 | |
buf984 = extern_kernels.convolution(buf982, buf983, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None) | |
assert_size_stride(buf984, (128, 1584, 7, 7), (77616, 49, 7, 1)) | |
buf985 = buf950; del buf950 # reuse | |
buf986 = buf985; del buf985 # reuse | |
buf989 = empty_strided((1584, ), (1, ), device='cuda', dtype=torch.float32) | |
buf987 = empty_strided((1, 1584, 1, 1), (1584, 1, 1584, 1584), device='cuda', dtype=torch.float32) | |
buf988 = empty_strided((1584, ), (1, ), device='cuda', dtype=torch.float32) | |
buf990 = empty_strided((1584, ), (1, ), device='cuda', dtype=torch.float32) | |
triton_red_fused__native_batch_norm_legit_functional_202.run(buf986, buf984, primals_460, primals_461, buf989, buf987, buf988, buf990, 1584, 6272, grid=grid(1584), stream=stream0) | |
del primals_460 | |
del primals_461 | |
buf991 = buf966; del buf966 # reuse | |
buf1124 = empty_strided((128, 1584, 7, 7), (77616, 49, 7, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__native_batch_norm_legit_functional_add_clone_fill_mul_sigmoid_sub_203.run(buf984, buf986, buf987, primals_103, primals_104, buf991, buf1124, 9934848, grid=grid(9934848), stream=stream0) | |
del primals_104 | |
buf992 = empty_strided((396, 1, 3, 3), (9, 9, 3, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__to_copy_204.run(primals_282, buf992, 3564, grid=grid(3564), stream=stream0) | |
del primals_282 | |
buf993 = buf942; del buf942 # reuse | |
triton_poi_fused_split_with_sizes_205.run(buf991, buf993, 2483712, grid=grid(2483712), stream=stream0) | |
buf994 = extern_kernels.convolution(buf993, buf992, stride=(1, 1), padding=(1, 1), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=396, bias=None) | |
assert_size_stride(buf994, (128, 396, 7, 7), (19404, 49, 7, 1)) | |
buf995 = empty_strided((396, 1, 5, 5), (25, 25, 5, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__to_copy_206.run(primals_283, buf995, 9900, grid=grid(9900), stream=stream0) | |
del primals_283 | |
buf996 = buf939; del buf939 # reuse | |
triton_poi_fused_split_with_sizes_207.run(buf991, buf996, 2483712, grid=grid(2483712), stream=stream0) | |
buf997 = extern_kernels.convolution(buf996, buf995, stride=(1, 1), padding=(2, 2), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=396, bias=None) | |
assert_size_stride(buf997, (128, 396, 7, 7), (19404, 49, 7, 1)) | |
buf998 = empty_strided((396, 1, 7, 7), (49, 49, 7, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__to_copy_208.run(primals_284, buf998, 19404, grid=grid(19404), stream=stream0) | |
del primals_284 | |
buf999 = buf936; del buf936 # reuse | |
triton_poi_fused_split_with_sizes_209.run(buf991, buf999, 2483712, grid=grid(2483712), stream=stream0) | |
buf1000 = extern_kernels.convolution(buf999, buf998, stride=(1, 1), padding=(3, 3), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=396, bias=None) | |
assert_size_stride(buf1000, (128, 396, 7, 7), (19404, 49, 7, 1)) | |
buf1001 = empty_strided((396, 1, 9, 9), (81, 81, 9, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__to_copy_210.run(primals_285, buf1001, 32076, grid=grid(32076), stream=stream0) | |
del primals_285 | |
buf1002 = buf933; del buf933 # reuse | |
triton_poi_fused_split_with_sizes_211.run(buf991, buf1002, 2483712, grid=grid(2483712), stream=stream0) | |
buf1003 = extern_kernels.convolution(buf1002, buf1001, stride=(1, 1), padding=(4, 4), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=396, bias=None) | |
assert_size_stride(buf1003, (128, 396, 7, 7), (19404, 49, 7, 1)) | |
buf1008 = buf991; del buf991 # reuse | |
buf1004 = as_strided(buf1008, (128, 396, 7, 7), (77616, 49, 7, 1)) # alias | |
triton_poi_fused_cat_212.run(buf994, buf1004, 2483712, grid=grid(2483712), stream=stream0) | |
buf1005 = as_strided(buf1008, (128, 396, 7, 7), (77616, 49, 7, 1), 19404) # alias | |
triton_poi_fused_cat_213.run(buf997, buf1005, 2483712, grid=grid(2483712), stream=stream0) | |
buf1006 = as_strided(buf1008, (128, 396, 7, 7), (77616, 49, 7, 1), 38808) # alias | |
triton_poi_fused_cat_213.run(buf1000, buf1006, 2483712, grid=grid(2483712), stream=stream0) | |
buf1007 = as_strided(buf1008, (128, 396, 7, 7), (77616, 49, 7, 1), 58212) # alias | |
triton_poi_fused_cat_213.run(buf1003, buf1007, 2483712, grid=grid(2483712), stream=stream0) | |
buf1009 = buf987; del buf987 # reuse | |
buf1010 = buf1009; del buf1009 # reuse | |
buf1013 = empty_strided((1584, ), (1, ), device='cuda', dtype=torch.float32) | |
buf1011 = empty_strided((1, 1584, 1, 1), (1584, 1, 1584, 1584), device='cuda', dtype=torch.float32) | |
buf1012 = empty_strided((1584, ), (1, ), device='cuda', dtype=torch.float32) | |
buf1014 = empty_strided((1584, ), (1, ), device='cuda', dtype=torch.float32) | |
triton_red_fused__native_batch_norm_legit_functional_202.run(buf1010, buf1008, primals_463, primals_464, buf1013, buf1011, buf1012, buf1014, 1584, 6272, grid=grid(1584), stream=stream0) | |
del primals_463 | |
del primals_464 | |
buf1015 = empty_strided((128, 1584, 7, 7), (77616, 49, 7, 1), device='cuda', dtype=torch.float16) | |
buf1017 = empty_strided((128, 1584, 1, 1), (1584, 1, 1, 1), device='cuda', dtype=torch.float16) | |
triton_per_fused__native_batch_norm_legit_functional_mean_silu_214.run(buf1008, buf1010, buf1011, primals_105, primals_106, buf1015, buf1017, 202752, 49, grid=grid(202752), stream=stream0) | |
del primals_106 | |
buf1018 = empty_strided((132, 1584, 1, 1), (1584, 1, 1, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__to_copy_215.run(primals_286, buf1018, 209088, grid=grid(209088), stream=stream0) | |
del primals_286 | |
buf1019 = buf958; del buf958 # reuse | |
triton_poi_fused__to_copy_convolution_216.run(primals_287, buf1019, 132, grid=grid(132), stream=stream0) | |
del primals_287 | |
buf1020 = extern_kernels.convolution(buf1017, buf1018, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None) | |
assert_size_stride(buf1020, (128, 132, 1, 1), (132, 1, 1, 1)) | |
buf1021 = buf1020; del buf1020 # reuse | |
buf1022 = empty_strided((128, 132, 1, 1), (132, 1, 1, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__to_copy_convolution_silu_217.run(buf1021, buf1019, buf1022, 16896, grid=grid(16896), stream=stream0) | |
buf1023 = empty_strided((1584, 132, 1, 1), (132, 1, 1, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__to_copy_218.run(primals_288, buf1023, 209088, grid=grid(209088), stream=stream0) | |
del primals_288 | |
buf1024 = buf963; del buf963 # reuse | |
triton_poi_fused__to_copy_convolution_219.run(primals_289, buf1024, 1584, grid=grid(1584), stream=stream0) | |
del primals_289 | |
buf1025 = extern_kernels.convolution(buf1022, buf1023, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None) | |
assert_size_stride(buf1025, (128, 1584, 1, 1), (1584, 1, 1, 1)) | |
buf1026 = buf1025; del buf1025 # reuse | |
triton_poi_fused__to_copy_convolution_220.run(buf1026, buf1024, 202752, grid=grid(202752), stream=stream0) | |
buf1027 = empty_strided((128, 1584, 7, 7), (77616, 49, 7, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused_mul_sigmoid_silu_221.run(buf1015, buf1026, buf1027, 9934848, grid=grid(9934848), stream=stream0) | |
buf1028 = empty_strided((128, 792, 7, 7), (38808, 49, 7, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused_split_with_sizes_222.run(buf1027, buf1028, 4967424, grid=grid(4967424), stream=stream0) | |
buf1029 = empty_strided((128, 792, 7, 7), (38808, 49, 7, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused_split_with_sizes_223.run(buf1027, buf1029, 4967424, grid=grid(4967424), stream=stream0) | |
buf1030 = empty_strided((132, 792, 1, 1), (792, 1, 1, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__to_copy_224.run(primals_290, buf1030, 104544, grid=grid(104544), stream=stream0) | |
del primals_290 | |
buf1031 = extern_kernels.convolution(buf1028, buf1030, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None) | |
assert_size_stride(buf1031, (128, 132, 7, 7), (6468, 49, 7, 1)) | |
buf1032 = empty_strided((132, 792, 1, 1), (792, 1, 1, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__to_copy_224.run(primals_291, buf1032, 104544, grid=grid(104544), stream=stream0) | |
del primals_291 | |
buf1033 = extern_kernels.convolution(buf1029, buf1032, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None) | |
assert_size_stride(buf1033, (128, 132, 7, 7), (6468, 49, 7, 1)) | |
buf1036 = empty_strided((128, 264, 7, 7), (12936, 49, 7, 1), device='cuda', dtype=torch.float16) | |
buf1034 = as_strided(buf1036, (128, 132, 7, 7), (12936, 49, 7, 1)) # alias | |
triton_poi_fused_cat_225.run(buf1031, buf1034, 827904, grid=grid(827904), stream=stream0) | |
del buf1031 | |
buf1035 = as_strided(buf1036, (128, 132, 7, 7), (12936, 49, 7, 1), 6468) # alias | |
triton_poi_fused_cat_226.run(buf1033, buf1035, 827904, grid=grid(827904), stream=stream0) | |
del buf1033 | |
buf1037 = buf978; del buf978 # reuse | |
buf1038 = buf1037; del buf1037 # reuse | |
buf1041 = empty_strided((264, ), (1, ), device='cuda', dtype=torch.float32) | |
buf1039 = empty_strided((1, 264, 1, 1), (264, 1, 264, 264), device='cuda', dtype=torch.float32) | |
buf1040 = empty_strided((264, ), (1, ), device='cuda', dtype=torch.float32) | |
buf1042 = empty_strided((264, ), (1, ), device='cuda', dtype=torch.float32) | |
triton_red_fused__native_batch_norm_legit_functional_199.run(buf1038, buf1036, primals_466, primals_467, buf1041, buf1039, buf1040, buf1042, 264, 6272, grid=grid(264), stream=stream0) | |
del primals_466 | |
del primals_467 | |
buf1043 = empty_strided((128, 264, 7, 7), (12936, 49, 7, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__native_batch_norm_legit_functional_add_227.run(buf1036, buf1038, buf1039, primals_107, primals_108, buf982, buf1043, 1655808, grid=grid(1655808), stream=stream0) | |
del primals_108 | |
buf1044 = empty_strided((1584, 264, 1, 1), (264, 1, 1, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__to_copy_201.run(primals_292, buf1044, 418176, grid=grid(418176), stream=stream0) | |
del primals_292 | |
buf1045 = extern_kernels.convolution(buf1043, buf1044, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None) | |
assert_size_stride(buf1045, (128, 1584, 7, 7), (77616, 49, 7, 1)) | |
buf1046 = buf1011; del buf1011 # reuse | |
buf1047 = buf1046; del buf1046 # reuse | |
buf1050 = empty_strided((1584, ), (1, ), device='cuda', dtype=torch.float32) | |
buf1048 = empty_strided((1, 1584, 1, 1), (1584, 1, 1584, 1584), device='cuda', dtype=torch.float32) | |
buf1049 = empty_strided((1584, ), (1, ), device='cuda', dtype=torch.float32) | |
buf1051 = empty_strided((1584, ), (1, ), device='cuda', dtype=torch.float32) | |
triton_red_fused__native_batch_norm_legit_functional_202.run(buf1047, buf1045, primals_469, primals_470, buf1050, buf1048, buf1049, buf1051, 1584, 6272, grid=grid(1584), stream=stream0) | |
del primals_469 | |
del primals_470 | |
buf1052 = buf1027; del buf1027 # reuse | |
buf1122 = empty_strided((128, 1584, 7, 7), (77616, 49, 7, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__native_batch_norm_legit_functional_add_clone_fill_mul_sigmoid_sub_203.run(buf1045, buf1047, buf1048, primals_109, primals_110, buf1052, buf1122, 9934848, grid=grid(9934848), stream=stream0) | |
del primals_110 | |
buf1053 = empty_strided((396, 1, 3, 3), (9, 9, 3, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__to_copy_204.run(primals_293, buf1053, 3564, grid=grid(3564), stream=stream0) | |
del primals_293 | |
buf1054 = buf1003; del buf1003 # reuse | |
triton_poi_fused_split_with_sizes_205.run(buf1052, buf1054, 2483712, grid=grid(2483712), stream=stream0) | |
buf1055 = extern_kernels.convolution(buf1054, buf1053, stride=(1, 1), padding=(1, 1), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=396, bias=None) | |
assert_size_stride(buf1055, (128, 396, 7, 7), (19404, 49, 7, 1)) | |
buf1056 = empty_strided((396, 1, 5, 5), (25, 25, 5, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__to_copy_206.run(primals_294, buf1056, 9900, grid=grid(9900), stream=stream0) | |
del primals_294 | |
buf1057 = buf1000; del buf1000 # reuse | |
triton_poi_fused_split_with_sizes_207.run(buf1052, buf1057, 2483712, grid=grid(2483712), stream=stream0) | |
buf1058 = extern_kernels.convolution(buf1057, buf1056, stride=(1, 1), padding=(2, 2), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=396, bias=None) | |
assert_size_stride(buf1058, (128, 396, 7, 7), (19404, 49, 7, 1)) | |
buf1059 = empty_strided((396, 1, 7, 7), (49, 49, 7, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__to_copy_208.run(primals_295, buf1059, 19404, grid=grid(19404), stream=stream0) | |
del primals_295 | |
buf1060 = buf997; del buf997 # reuse | |
triton_poi_fused_split_with_sizes_209.run(buf1052, buf1060, 2483712, grid=grid(2483712), stream=stream0) | |
buf1061 = extern_kernels.convolution(buf1060, buf1059, stride=(1, 1), padding=(3, 3), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=396, bias=None) | |
assert_size_stride(buf1061, (128, 396, 7, 7), (19404, 49, 7, 1)) | |
buf1062 = empty_strided((396, 1, 9, 9), (81, 81, 9, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__to_copy_210.run(primals_296, buf1062, 32076, grid=grid(32076), stream=stream0) | |
del primals_296 | |
buf1063 = buf994; del buf994 # reuse | |
triton_poi_fused_split_with_sizes_211.run(buf1052, buf1063, 2483712, grid=grid(2483712), stream=stream0) | |
buf1064 = extern_kernels.convolution(buf1063, buf1062, stride=(1, 1), padding=(4, 4), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=396, bias=None) | |
assert_size_stride(buf1064, (128, 396, 7, 7), (19404, 49, 7, 1)) | |
buf1069 = buf1052; del buf1052 # reuse | |
buf1065 = as_strided(buf1069, (128, 396, 7, 7), (77616, 49, 7, 1)) # alias | |
triton_poi_fused_cat_212.run(buf1055, buf1065, 2483712, grid=grid(2483712), stream=stream0) | |
del buf1055 | |
buf1066 = as_strided(buf1069, (128, 396, 7, 7), (77616, 49, 7, 1), 19404) # alias | |
triton_poi_fused_cat_213.run(buf1058, buf1066, 2483712, grid=grid(2483712), stream=stream0) | |
del buf1058 | |
buf1067 = as_strided(buf1069, (128, 396, 7, 7), (77616, 49, 7, 1), 38808) # alias | |
triton_poi_fused_cat_213.run(buf1061, buf1067, 2483712, grid=grid(2483712), stream=stream0) | |
del buf1061 | |
buf1068 = as_strided(buf1069, (128, 396, 7, 7), (77616, 49, 7, 1), 58212) # alias | |
triton_poi_fused_cat_213.run(buf1064, buf1068, 2483712, grid=grid(2483712), stream=stream0) | |
del buf1064 | |
buf1070 = buf1048; del buf1048 # reuse | |
buf1071 = buf1070; del buf1070 # reuse | |
buf1074 = empty_strided((1584, ), (1, ), device='cuda', dtype=torch.float32) | |
buf1072 = empty_strided((1, 1584, 1, 1), (1584, 1, 1584, 1584), device='cuda', dtype=torch.float32) | |
buf1073 = empty_strided((1584, ), (1, ), device='cuda', dtype=torch.float32) | |
buf1075 = empty_strided((1584, ), (1, ), device='cuda', dtype=torch.float32) | |
triton_red_fused__native_batch_norm_legit_functional_202.run(buf1071, buf1069, primals_472, primals_473, buf1074, buf1072, buf1073, buf1075, 1584, 6272, grid=grid(1584), stream=stream0) | |
del primals_472 | |
del primals_473 | |
buf1076 = empty_strided((128, 1584, 7, 7), (77616, 49, 7, 1), device='cuda', dtype=torch.float16) | |
buf1078 = empty_strided((128, 1584, 1, 1), (1584, 1, 1, 1), device='cuda', dtype=torch.float16) | |
triton_per_fused__native_batch_norm_legit_functional_mean_silu_214.run(buf1069, buf1071, buf1072, primals_111, primals_112, buf1076, buf1078, 202752, 49, grid=grid(202752), stream=stream0) | |
del buf1072 | |
del primals_112 | |
buf1079 = empty_strided((132, 1584, 1, 1), (1584, 1, 1, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__to_copy_215.run(primals_297, buf1079, 209088, grid=grid(209088), stream=stream0) | |
del primals_297 | |
buf1080 = buf1019; del buf1019 # reuse | |
triton_poi_fused__to_copy_convolution_216.run(primals_298, buf1080, 132, grid=grid(132), stream=stream0) | |
del primals_298 | |
buf1081 = extern_kernels.convolution(buf1078, buf1079, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None) | |
assert_size_stride(buf1081, (128, 132, 1, 1), (132, 1, 1, 1)) | |
buf1082 = buf1081; del buf1081 # reuse | |
buf1083 = empty_strided((128, 132, 1, 1), (132, 1, 1, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__to_copy_convolution_silu_217.run(buf1082, buf1080, buf1083, 16896, grid=grid(16896), stream=stream0) | |
del buf1080 | |
buf1084 = empty_strided((1584, 132, 1, 1), (132, 1, 1, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__to_copy_218.run(primals_299, buf1084, 209088, grid=grid(209088), stream=stream0) | |
del primals_299 | |
buf1085 = buf1024; del buf1024 # reuse | |
triton_poi_fused__to_copy_convolution_219.run(primals_300, buf1085, 1584, grid=grid(1584), stream=stream0) | |
del primals_300 | |
buf1086 = extern_kernels.convolution(buf1083, buf1084, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None) | |
assert_size_stride(buf1086, (128, 1584, 1, 1), (1584, 1, 1, 1)) | |
buf1087 = buf1086; del buf1086 # reuse | |
triton_poi_fused__to_copy_convolution_220.run(buf1087, buf1085, 202752, grid=grid(202752), stream=stream0) | |
del buf1085 | |
buf1088 = empty_strided((128, 1584, 7, 7), (77616, 49, 7, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused_mul_sigmoid_silu_221.run(buf1076, buf1087, buf1088, 9934848, grid=grid(9934848), stream=stream0) | |
buf1089 = empty_strided((128, 792, 7, 7), (38808, 49, 7, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused_split_with_sizes_222.run(buf1088, buf1089, 4967424, grid=grid(4967424), stream=stream0) | |
buf1090 = empty_strided((128, 792, 7, 7), (38808, 49, 7, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused_split_with_sizes_223.run(buf1088, buf1090, 4967424, grid=grid(4967424), stream=stream0) | |
del buf1088 | |
buf1091 = empty_strided((132, 792, 1, 1), (792, 1, 1, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__to_copy_224.run(primals_301, buf1091, 104544, grid=grid(104544), stream=stream0) | |
del primals_301 | |
buf1092 = extern_kernels.convolution(buf1089, buf1091, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None) | |
assert_size_stride(buf1092, (128, 132, 7, 7), (6468, 49, 7, 1)) | |
buf1093 = empty_strided((132, 792, 1, 1), (792, 1, 1, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__to_copy_224.run(primals_302, buf1093, 104544, grid=grid(104544), stream=stream0) | |
del primals_302 | |
buf1094 = extern_kernels.convolution(buf1090, buf1093, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None) | |
assert_size_stride(buf1094, (128, 132, 7, 7), (6468, 49, 7, 1)) | |
buf1097 = empty_strided((128, 264, 7, 7), (12936, 49, 7, 1), device='cuda', dtype=torch.float16) | |
buf1095 = as_strided(buf1097, (128, 132, 7, 7), (12936, 49, 7, 1)) # alias | |
triton_poi_fused_cat_225.run(buf1092, buf1095, 827904, grid=grid(827904), stream=stream0) | |
del buf1092 | |
buf1096 = as_strided(buf1097, (128, 132, 7, 7), (12936, 49, 7, 1), 6468) # alias | |
triton_poi_fused_cat_226.run(buf1094, buf1096, 827904, grid=grid(827904), stream=stream0) | |
del buf1094 | |
buf1098 = buf1039; del buf1039 # reuse | |
buf1099 = buf1098; del buf1098 # reuse | |
buf1102 = empty_strided((264, ), (1, ), device='cuda', dtype=torch.float32) | |
buf1100 = empty_strided((1, 264, 1, 1), (264, 1, 264, 264), device='cuda', dtype=torch.float32) | |
buf1101 = empty_strided((264, ), (1, ), device='cuda', dtype=torch.float32) | |
buf1103 = empty_strided((264, ), (1, ), device='cuda', dtype=torch.float32) | |
triton_red_fused__native_batch_norm_legit_functional_199.run(buf1099, buf1097, primals_475, primals_476, buf1102, buf1100, buf1101, buf1103, 264, 6272, grid=grid(264), stream=stream0) | |
del primals_475 | |
del primals_476 | |
buf1104 = empty_strided((128, 264, 7, 7), (12936, 49, 7, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__native_batch_norm_legit_functional_add_227.run(buf1097, buf1099, buf1100, primals_113, primals_114, buf1043, buf1104, 1655808, grid=grid(1655808), stream=stream0) | |
del buf1100 | |
del primals_114 | |
buf1105 = empty_strided((1536, 264, 1, 1), (264, 1, 1, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__to_copy_228.run(primals_303, buf1105, 405504, grid=grid(405504), stream=stream0) | |
del primals_303 | |
buf1106 = extern_kernels.convolution(buf1104, buf1105, stride=(1, 1), padding=(0, 0), dilation=(1, 1), transposed=False, output_padding=(0, 0), groups=1, bias=None) | |
assert_size_stride(buf1106, (128, 1536, 7, 7), (75264, 49, 7, 1)) | |
buf1107 = empty_strided((1, 1536, 1, 1), (1536, 1, 1536, 1536), device='cuda', dtype=torch.float32) | |
buf1108 = buf1107; del buf1107 # reuse | |
buf1111 = empty_strided((1536, ), (1, ), device='cuda', dtype=torch.float32) | |
buf1109 = empty_strided((1, 1536, 1, 1), (1536, 1, 1536, 1536), device='cuda', dtype=torch.float32) | |
buf1110 = empty_strided((1536, ), (1, ), device='cuda', dtype=torch.float32) | |
buf1112 = empty_strided((1536, ), (1, ), device='cuda', dtype=torch.float32) | |
triton_red_fused__native_batch_norm_legit_functional_229.run(buf1108, buf1106, primals_478, primals_479, buf1111, buf1109, buf1110, buf1112, 1536, 6272, grid=grid(1536), stream=stream0) | |
del primals_478 | |
del primals_479 | |
buf1120 = empty_strided((128, 1536, 7, 7), (75264, 49, 7, 1), device='cuda', dtype=torch.bool) | |
buf1115 = empty_strided((128, 1536), (1536, 1), device='cuda', dtype=torch.float16) | |
triton_per_fused__native_batch_norm_legit_functional_mean_relu_threshold_backward_view_230.run(buf1106, buf1108, buf1109, primals_115, primals_116, buf1120, buf1115, 196608, 49, grid=grid(196608), stream=stream0) | |
del buf1109 | |
del primals_116 | |
buf1116 = empty_strided((1000, 1536), (1536, 1), device='cuda', dtype=torch.float16) | |
buf1119 = empty_strided((1000, 1536), (1536, 1), device='cuda', dtype=torch.float16) | |
triton_poi_fused__to_copy_t_231.run(primals_304, buf1116, buf1119, 1536000, grid=grid(1536000), stream=stream0) | |
del primals_304 | |
buf1117 = empty_strided((1000, ), (1, ), device='cuda', dtype=torch.float16) | |
triton_poi_fused__to_copy_232.run(primals_305, buf1117, 1000, grid=grid(1000), stream=stream0) | |
del primals_305 | |
buf1118 = empty_strided((128, 1000), (1000, 1), device='cuda', dtype=torch.float16) | |
extern_kernels.addmm(buf1117, buf1115, as_strided(buf1116, (1536, 1000), (1, 1536)), alpha=1, beta=1, out=buf1118) | |
del buf1116 | |
del buf1117 | |
buf1156 = empty_strided((), (), device='cuda', dtype=torch.int64) | |
triton_poi_fused_add_233.run(primals_306, buf1156, 1, grid=grid(1), stream=stream0) | |
del primals_306 | |
buf1157 = empty_strided((), (), device='cuda', dtype=torch.int64) | |
triton_poi_fused_add_233.run(primals_309, buf1157, 1, grid=grid(1), stream=stream0) | |
del primals_309 | |
buf1158 = empty_strided((), (), device='cuda', dtype=torch.int64) | |
triton_poi_fused_add_233.run(primals_312, buf1158, 1, grid=grid(1), stream=stream0) | |
del primals_312 | |
buf1159 = empty_strided((), (), device='cuda', dtype=torch.int64) | |
triton_poi_fused_add_233.run(primals_315, buf1159, 1, grid=grid(1), stream=stream0) | |
del primals_315 | |
buf1160 = empty_strided((), (), device='cuda', dtype=torch.int64) | |
triton_poi_fused_add_233.run(primals_318, buf1160, 1, grid=grid(1), stream=stream0) | |
del primals_318 | |
buf1161 = empty_strided((), (), device='cuda', dtype=torch.int64) | |
triton_poi_fused_add_233.run(primals_321, buf1161, 1, grid=grid(1), stream=stream0) | |
del primals_321 | |
buf1162 = empty_strided((), (), device='cuda', dtype=torch.int64) | |
triton_poi_fused_add_233.run(primals_324, buf1162, 1, grid=grid(1), stream=stream0) | |
del primals_324 | |
buf1163 = empty_strided((), (), device='cuda', dtype=torch.int64) | |
triton_poi_fused_add_233.run(primals_327, buf1163, 1, grid=grid(1), stream=stream0) | |
del primals_327 | |
buf1164 = empty_strided((), (), device='cuda', dtype=torch.int64) | |
triton_poi_fused_add_233.run(primals_330, buf1164, 1, grid=grid(1), stream=stream0) | |
del primals_330 | |
buf1165 = empty_strided((), (), device='cuda', dtype=torch.int64) | |
triton_poi_fused_add_233.run(primals_333, buf1165, 1, grid=grid(1), stream=stream0) | |
del primals_333 | |
buf1166 = empty_strided((), (), device='cuda', dtype=torch.int64) | |
triton_poi_fused_add_233.run(primals_336, buf1166, 1, grid=grid(1), stream=stream0) | |
del primals_336 | |
buf1167 = empty_strided((), (), device='cuda', dtype=torch.int64) | |
triton_poi_fused_add_233.run(primals_339, buf1167, 1, grid=grid(1), stream=stream0) | |
del primals_339 | |
buf1168 = empty_strided((), (), device='cuda', dtype=torch.int64) | |
triton_poi_fused_add_233.run(primals_342, buf1168, 1, grid=grid(1), stream=stream0) | |
del primals_342 | |
buf1169 = empty_strided((), (), device='cuda', dtype=torch.int64) | |
triton_poi_fused_add_233.run(primals_345, buf1169, 1, grid=grid(1), stream=stream0) | |
del primals_345 | |
buf1170 = empty_strided((), (), device='cuda', dtype=torch.int64) | |
triton_poi_fused_add_233.run(primals_348, buf1170, 1, grid=grid(1), stream=stream0) | |
del primals_348 | |
buf1171 = empty_strided((), (), device='cuda', dtype=torch.int64) | |
triton_poi_fused_add_233.run(primals_351, buf1171, 1, grid=grid(1), stream=stream0) | |
del primals_351 | |
buf1172 = empty_strided((), (), device='cuda', dtype=torch.int64) | |
triton_poi_fused_add_233.run(primals_354, buf1172, 1, grid=grid(1), stream=stream0) | |
del primals_354 | |
buf1173 = empty_strided((), (), device='cuda', dtype=torch.int64) | |
triton_poi_fused_add_233.run(primals_357, buf1173, 1, grid=grid(1), stream=stream0) | |
del primals_357 | |
buf1174 = empty_strided((), (), device='cuda', dtype=torch.int64) | |
triton_poi_fused_add_233.run(primals_360, buf1174, 1, grid=grid(1), stream=stream0) | |
del primals_360 | |
buf1175 = empty_strided((), (), device='cuda', dtype=torch.int64) | |
triton_poi_fused_add_233.run(primals_363, buf1175, 1, grid=grid(1), stream=stream0) | |
del primals_363 | |
buf1176 = empty_strided((), (), device='cuda', dtype=torch.int64) | |
triton_poi_fused_add_233.run(primals_366, buf1176, 1, grid=grid(1), stream=stream0) | |
del primals_366 | |
buf1177 = empty_strided((), (), device='cuda', dtype=torch.int64) | |
triton_poi_fused_add_233.run(primals_369, buf1177, 1, grid=grid(1), stream=stream0) | |
del primals_369 | |
buf1178 = empty_strided((), (), device='cuda', dtype=torch.int64) | |
triton_poi_fused_add_233.run(primals_372, buf1178, 1, grid=grid(1), stream=stream0) | |
del primals_372 | |
buf1179 = empty_strided((), (), device='cuda', dtype=torch.int64) | |
triton_poi_fused_add_233.run(primals_375, buf1179, 1, grid=grid(1), stream=stream0) | |
del primals_375 | |
buf1180 = empty_strided((), (), device='cuda', dtype=torch.int64) | |
triton_poi_fused_add_233.run(primals_378, buf1180, 1, grid=grid(1), stream=stream0) | |
del primals_378 | |
buf1181 = empty_strided((), (), device='cuda', dtype=torch.int64) | |
triton_poi_fused_add_233.run(primals_381, buf1181, 1, grid=grid(1), stream=stream0) | |
del primals_381 | |
buf1182 = empty_strided((), (), device='cuda', dtype=torch.int64) | |
triton_poi_fused_add_233.run(primals_384, buf1182, 1, grid=grid(1), stream=stream0) | |
del primals_384 | |
buf1183 = empty_strided((), (), device='cuda', dtype=torch.int64) | |
triton_poi_fused_add_233.run(primals_387, buf1183, 1, grid=grid(1), stream=stream0) | |
del primals_387 | |
buf1184 = empty_strided((), (), device='cuda', dtype=torch.int64) | |
triton_poi_fused_add_233.run(primals_390, buf1184, 1, grid=grid(1), stream=stream0) | |
del primals_390 | |
buf1185 = empty_strided((), (), device='cuda', dtype=torch.int64) | |
triton_poi_fused_add_233.run(primals_393, buf1185, 1, grid=grid(1), stream=stream0) | |
del primals_393 | |
buf1186 = empty_strided((), (), device='cuda', dtype=torch.int64) | |
triton_poi_fused_add_233.run(primals_396, buf1186, 1, grid=grid(1), stream=stream0) | |
del primals_396 | |
buf1187 = empty_strided((), (), device='cuda', dtype=torch.int64) | |
triton_poi_fused_add_233.run(primals_399, buf1187, 1, grid=grid(1), stream=stream0) | |
del primals_399 | |
buf1188 = empty_strided((), (), device='cuda', dtype=torch.int64) | |
triton_poi_fused_add_233.run(primals_402, buf1188, 1, grid=grid(1), stream=stream0) | |
del primals_402 | |
buf1189 = empty_strided((), (), device='cuda', dtype=torch.int64) | |
triton_poi_fused_add_233.run(primals_405, buf1189, 1, grid=grid(1), stream=stream0) | |
del primals_405 | |
buf1190 = empty_strided((), (), device='cuda', dtype=torch.int64) | |
triton_poi_fused_add_233.run(primals_408, buf1190, 1, grid=grid(1), stream=stream0) | |
del primals_408 | |
buf1191 = empty_strided((), (), device='cuda', dtype=torch.int64) | |
triton_poi_fused_add_233.run(primals_411, buf1191, 1, grid=grid(1), stream=stream0) | |
del primals_411 | |
buf1192 = empty_strided((), (), device='cuda', dtype=torch.int64) | |
triton_poi_fused_add_233.run(primals_414, buf1192, 1, grid=grid(1), stream=stream0) | |
del primals_414 | |
buf1193 = empty_strided((), (), device='cuda', dtype=torch.int64) | |
triton_poi_fused_add_233.run(primals_417, buf1193, 1, grid=grid(1), stream=stream0) | |
del primals_417 | |
buf1194 = empty_strided((), (), device='cuda', dtype=torch.int64) | |
triton_poi_fused_add_233.run(primals_420, buf1194, 1, grid=grid(1), stream=stream0) | |
del primals_420 | |
buf1195 = empty_strided((), (), device='cuda', dtype=torch.int64) | |
triton_poi_fused_add_233.run(primals_423, buf1195, 1, grid=grid(1), stream=stream0) | |
del primals_423 | |
buf1196 = empty_strided((), (), device='cuda', dtype=torch.int64) | |
triton_poi_fused_add_233.run(primals_426, buf1196, 1, grid=grid(1), stream=stream0) | |
del primals_426 | |
buf1197 = empty_strided((), (), device='cuda', dtype=torch.int64) | |
triton_poi_fused_add_233.run(primals_429, buf1197, 1, grid=grid(1), stream=stream0) | |
del primals_429 | |
buf1198 = empty_strided((), (), device='cuda', dtype=torch.int64) | |
triton_poi_fused_add_233.run(primals_432, buf1198, 1, grid=grid(1), stream=stream0) | |
del primals_432 | |
buf1199 = empty_strided((), (), device='cuda', dtype=torch.int64) | |
triton_poi_fused_add_233.run(primals_435, buf1199, 1, grid=grid(1), stream=stream0) | |
del primals_435 | |
buf1200 = empty_strided((), (), device='cuda', dtype=torch.int64) | |
triton_poi_fused_add_233.run(primals_438, buf1200, 1, grid=grid(1), stream=stream0) | |
del primals_438 | |
buf1201 = empty_strided((), (), device='cuda', dtype=torch.int64) | |
triton_poi_fused_add_233.run(primals_441, buf1201, 1, grid=grid(1), stream=stream0) | |
del primals_441 | |
buf1202 = empty_strided((), (), device='cuda', dtype=torch.int64) | |
triton_poi_fused_add_233.run(primals_444, buf1202, 1, grid=grid(1), stream=stream0) | |
del primals_444 | |
buf1203 = empty_strided((), (), device='cuda', dtype=torch.int64) | |
triton_poi_fused_add_233.run(primals_447, buf1203, 1, grid=grid(1), stream=stream0) | |
del primals_447 | |
buf1204 = empty_strided((), (), device='cuda', dtype=torch.int64) | |
triton_poi_fused_add_233.run(primals_450, buf1204, 1, grid=grid(1), stream=stream0) | |
del primals_450 | |
buf1205 = empty_strided((), (), device='cuda', dtype=torch.int64) | |
triton_poi_fused_add_233.run(primals_453, buf1205, 1, grid=grid(1), stream=stream0) | |
del primals_453 | |
buf1206 = empty_strided((), (), device='cuda', dtype=torch.int64) | |
triton_poi_fused_add_233.run(primals_456, buf1206, 1, grid=grid(1), stream=stream0) | |
del primals_456 | |
buf1207 = empty_strided((), (), device='cuda', dtype=torch.int64) | |
triton_poi_fused_add_233.run(primals_459, buf1207, 1, grid=grid(1), stream=stream0) | |
del primals_459 | |
buf1208 = empty_strided((), (), device='cuda', dtype=torch.int64) | |
triton_poi_fused_add_233.run(primals_462, buf1208, 1, grid=grid(1), stream=stream0) | |
del primals_462 | |
buf1209 = empty_strided((), (), device='cuda', dtype=torch.int64) | |
triton_poi_fused_add_233.run(primals_465, buf1209, 1, grid=grid(1), stream=stream0) | |
del primals_465 | |
buf1210 = empty_strided((), (), device='cuda', dtype=torch.int64) | |
triton_poi_fused_add_233.run(primals_468, buf1210, 1, grid=grid(1), stream=stream0) | |
del primals_468 | |
buf1211 = empty_strided((), (), device='cuda', dtype=torch.int64) | |
triton_poi_fused_add_233.run(primals_471, buf1211, 1, grid=grid(1), stream=stream0) | |
del primals_471 | |
buf1212 = empty_strided((), (), device='cuda', dtype=torch.int64) | |
triton_poi_fused_add_233.run(primals_474, buf1212, 1, grid=grid(1), stream=stream0) | |
del primals_474 | |
buf1213 = empty_strided((), (), device='cuda', dtype=torch.int64) | |
triton_poi_fused_add_233.run(primals_477, buf1213, 1, grid=grid(1), stream=stream0) | |
del primals_477 | |
return (buf1156, buf9, buf10, buf1157, buf20, buf21, buf1158, buf31, buf32, buf1159, buf49, buf50, buf1160, buf68, buf69, buf1161, buf84, buf85, buf1162, buf100, buf101, buf1163, buf111, buf112, buf1164, buf127, buf128, buf1165, buf136, buf137, buf1166, buf160, buf161, buf1167, buf183, buf184, buf1168, buf197, buf198, buf1169, buf213, buf214, buf1170, buf243, buf244, buf1171, buf257, buf258, buf1172, buf273, buf274, buf1173, buf303, buf304, buf1174, buf317, buf318, buf1175, buf333, buf334, buf1176, buf363, buf364, buf1177, buf372, buf373, buf1178, buf392, buf393, buf1179, buf415, buf416, buf1180, buf429, buf430, buf1181, buf453, buf454, buf1182, buf483, buf484, buf1183, buf497, buf498, buf1184, buf521, buf522, buf1185, buf551, buf552, buf1186, buf565, buf566, buf1187, buf589, buf590, buf1188, buf619, buf620, buf1189, buf628, buf629, buf1190, buf638, buf639, buf1191, buf661, buf662, buf1192, buf675, buf676, buf1193, buf699, buf700, buf1194, buf729, buf730, buf1195, buf743, buf744, buf1196, buf767, buf768, buf1197, buf797, buf798, buf1198, buf811, buf812, buf1199, buf835, buf836, buf1200, buf865, buf866, buf1201, buf874, buf875, buf1202, buf898, buf899, buf1203, buf919, buf920, buf1204, buf928, buf929, buf1205, buf952, buf953, buf1206, buf980, buf981, buf1207, buf989, buf990, buf1208, buf1013, buf1014, buf1209, buf1041, buf1042, buf1210, buf1050, buf1051, buf1211, buf1074, buf1075, buf1212, buf1102, buf1103, buf1213, buf1111, buf1112, buf1118, primals_1, primals_3, primals_5, primals_7, primals_9, primals_11, primals_13, primals_15, primals_17, primals_19, primals_21, primals_23, primals_25, primals_27, primals_29, primals_31, primals_33, primals_35, primals_37, primals_39, primals_41, primals_43, primals_45, primals_47, primals_49, primals_51, primals_53, primals_55, primals_57, primals_59, primals_61, primals_63, primals_65, primals_67, primals_69, primals_71, primals_73, primals_75, primals_77, primals_79, primals_81, primals_83, primals_85, primals_87, 
primals_89, primals_91, primals_93, primals_95, primals_97, primals_99, primals_101, primals_103, primals_105, primals_107, primals_109, primals_111, primals_113, primals_115, buf0, buf1, buf2, buf8, buf11, buf12, buf13, buf19, buf22, buf23, buf24, buf30, buf34, buf35, buf36, buf38, buf42, buf48, buf52, as_strided(buf51, (128, 64, 112, 112), (2408448, 12544, 112, 1)), buf54, as_strided(buf51, (128, 64, 112, 112), (2408448, 12544, 112, 1), 802816), buf56, as_strided(buf51, (128, 64, 112, 112), (2408448, 12544, 112, 1), 1605632), buf61, buf67, buf71, as_strided(buf70, (128, 96, 56, 56), (602112, 3136, 56, 1)), buf73, as_strided(buf70, (128, 96, 56, 56), (602112, 3136, 56, 1), 301056), buf77, buf83, as_strided(buf86, (128, 20, 56, 56), (125440, 3136, 56, 1)), as_strided(buf86, (128, 20, 56, 56), (125440, 3136, 56, 1), 62720), buf87, buf89, buf93, buf99, buf102, buf103, buf104, buf110, buf114, as_strided(buf113, (128, 60, 56, 56), (376320, 3136, 56, 1)), buf116, as_strided(buf113, (128, 60, 56, 56), (376320, 3136, 56, 1), 188160), buf120, buf126, buf129, buf130, buf131, buf135, buf139, buf140, buf142, buf143, buf145, buf146, buf148, buf149, buf155, buf159, buf162, buf164, buf165, buf168, buf169, buf170, buf173, buf174, buf175, buf176, buf182, as_strided(buf185, (128, 28, 28, 28), (43904, 784, 28, 1)), as_strided(buf185, (128, 28, 28, 28), (43904, 784, 28, 1), 21952), buf186, buf188, buf192, buf196, buf200, buf201, buf203, buf204, buf208, buf212, buf215, buf217, buf218, buf221, buf222, buf223, buf226, buf228, buf229, buf230, buf232, buf236, buf242, as_strided(buf245, (128, 28, 28, 28), (43904, 784, 28, 1)), as_strided(buf245, (128, 28, 28, 28), (43904, 784, 28, 1), 21952), buf246, buf248, buf252, buf256, buf260, buf261, buf263, buf264, buf268, buf272, buf275, buf277, buf278, buf281, buf282, buf283, buf286, buf288, buf289, buf290, buf292, buf296, buf302, as_strided(buf305, (128, 28, 28, 28), (43904, 784, 28, 1)), as_strided(buf305, (128, 28, 28, 28), (43904, 784, 28, 1), 
21952), buf306, buf308, buf312, buf316, buf320, buf321, buf323, buf324, buf328, buf332, buf335, buf337, buf338, buf341, buf342, buf343, buf346, buf348, buf349, buf350, buf352, buf356, buf362, buf365, buf366, buf367, buf371, buf375, buf376, buf378, buf379, buf381, buf382, buf387, buf391, buf394, buf396, buf397, buf400, buf401, buf402, buf405, buf406, buf407, buf408, buf414, as_strided(buf417, (128, 52, 14, 14), (20384, 196, 14, 1)), as_strided(buf417, (128, 52, 14, 14), (20384, 196, 14, 1), 10192), buf418, buf420, buf424, buf428, buf432, buf433, buf435, buf436, buf438, buf439, buf441, buf442, buf448, buf452, buf455, buf457, buf458, buf461, buf462, buf463, buf466, buf468, buf469, buf470, buf472, buf476, buf482, as_strided(buf485, (128, 52, 14, 14), (20384, 196, 14, 1)), as_strided(buf485, (128, 52, 14, 14), (20384, 196, 14, 1), 10192), buf486, buf488, buf492, buf496, buf500, buf501, buf503, buf504, buf506, buf507, buf509, buf510, buf516, buf520, buf523, buf525, buf526, buf529, buf530, buf531, buf534, buf536, buf537, buf538, buf540, buf544, buf550, as_strided(buf553, (128, 52, 14, 14), (20384, 196, 14, 1)), as_strided(buf553, (128, 52, 14, 14), (20384, 196, 14, 1), 10192), buf554, buf556, buf560, buf564, buf568, buf569, buf571, buf572, buf574, buf575, buf577, buf578, buf584, buf588, buf591, buf593, buf594, buf597, buf598, buf599, buf602, buf604, buf605, buf606, buf608, buf612, buf618, buf621, buf622, buf623, buf627, buf631, buf632, buf633, buf637, buf640, buf642, buf643, buf646, buf647, buf648, buf651, buf652, buf653, buf654, buf660, as_strided(buf663, (128, 80, 14, 14), (31360, 196, 14, 1)), as_strided(buf663, (128, 80, 14, 14), (31360, 196, 14, 1), 15680), buf664, buf666, buf670, buf674, buf678, buf679, buf681, buf682, buf684, buf685, buf687, buf688, buf694, buf698, buf701, buf703, buf704, buf707, buf708, buf709, buf712, buf714, buf715, buf716, buf718, buf722, buf728, as_strided(buf731, (128, 80, 14, 14), (31360, 196, 14, 1)), as_strided(buf731, (128, 80, 14, 14), 
(31360, 196, 14, 1), 15680), buf732, buf734, buf738, buf742, buf746, buf747, buf749, buf750, buf752, buf753, buf755, buf756, buf762, buf766, buf769, buf771, buf772, buf775, buf776, buf777, buf780, buf782, buf783, buf784, buf786, buf790, buf796, as_strided(buf799, (128, 80, 14, 14), (31360, 196, 14, 1)), as_strided(buf799, (128, 80, 14, 14), (31360, 196, 14, 1), 15680), buf800, buf802, buf806, buf810, buf814, buf815, buf817, buf818, buf820, buf821, buf823, buf824, buf830, buf834, buf837, buf839, buf840, buf843, buf844, buf845, buf848, buf850, buf851, buf852, buf854, buf858, buf864, buf867, buf868, buf869, buf873, buf877, buf878, buf880, buf881, buf883, buf884, buf886, buf887, buf893, buf897, buf900, buf902, buf903, buf906, buf907, buf908, buf911, buf912, buf913, buf914, buf918, buf921, buf922, buf923, buf927, buf931, buf932, buf934, buf935, buf937, buf938, buf940, buf941, buf947, buf951, buf954, buf956, buf957, buf960, buf961, buf962, buf965, buf967, buf968, buf969, buf971, buf975, buf979, buf982, buf983, buf984, buf988, buf992, buf993, buf995, buf996, buf998, buf999, buf1001, buf1002, buf1008, buf1012, buf1015, buf1017, buf1018, buf1021, buf1022, buf1023, buf1026, buf1028, buf1029, buf1030, buf1032, buf1036, buf1040, buf1043, buf1044, buf1045, buf1049, buf1053, buf1054, buf1056, buf1057, buf1059, buf1060, buf1062, buf1063, buf1069, buf1073, buf1076, buf1078, buf1079, buf1082, buf1083, buf1084, buf1087, buf1089, buf1090, buf1091, buf1093, buf1097, buf1101, buf1104, buf1105, buf1106, buf1110, buf1115, buf1119, buf1120, as_strided(buf1108, (1, 1536, 1, 1), (1536, 1, 1, 1)), as_strided(buf1099, (1, 264, 1, 1), (264, 1, 1, 1)), as_strided(buf1071, (1, 1584, 1, 1), (1584, 1, 1, 1)), buf1122, as_strided(buf1047, (1, 1584, 1, 1), (1584, 1, 1, 1)), as_strided(buf1038, (1, 264, 1, 1), (264, 1, 1, 1)), as_strided(buf1010, (1, 1584, 1, 1), (1584, 1, 1, 1)), buf1124, as_strided(buf986, (1, 1584, 1, 1), (1584, 1, 1, 1)), as_strided(buf977, (1, 264, 1, 1), (264, 1, 1, 1)), 
as_strided(buf949, (1, 1584, 1, 1), (1584, 1, 1, 1)), buf1126, as_strided(buf925, (1, 1584, 1, 1), (1584, 1, 1, 1)), as_strided(buf916, (1, 264, 1, 1), (264, 1, 1, 1)), as_strided(buf895, (1, 960, 1, 1), (960, 1, 1, 1)), buf1128, as_strided(buf871, (1, 960, 1, 1), (960, 1, 1, 1)), as_strided(buf861, (1, 160, 1, 1), (160, 1, 1, 1)), as_strided(buf832, (1, 480, 1, 1), (480, 1, 1, 1)), buf1130, as_strided(buf808, (1, 480, 1, 1), (480, 1, 1, 1)), as_strided(buf793, (1, 160, 1, 1), (160, 1, 1, 1)), as_strided(buf764, (1, 480, 1, 1), (480, 1, 1, 1)), buf1132, as_strided(buf740, (1, 480, 1, 1), (480, 1, 1, 1)), as_strided(buf725, (1, 160, 1, 1), (160, 1, 1, 1)), as_strided(buf696, (1, 480, 1, 1), (480, 1, 1, 1)), buf1134, as_strided(buf672, (1, 480, 1, 1), (480, 1, 1, 1)), as_strided(buf657, (1, 160, 1, 1), (160, 1, 1, 1)), as_strided(buf635, (1, 624, 1, 1), (624, 1, 1, 1)), buf1136, as_strided(buf625, (1, 624, 1, 1), (624, 1, 1, 1)), as_strided(buf615, (1, 104, 1, 1), (104, 1, 1, 1)), as_strided(buf586, (1, 624, 1, 1), (624, 1, 1, 1)), buf1138, as_strided(buf562, (1, 624, 1, 1), (624, 1, 1, 1)), as_strided(buf547, (1, 104, 1, 1), (104, 1, 1, 1)), as_strided(buf518, (1, 624, 1, 1), (624, 1, 1, 1)), buf1140, as_strided(buf494, (1, 624, 1, 1), (624, 1, 1, 1)), as_strided(buf479, (1, 104, 1, 1), (104, 1, 1, 1)), as_strided(buf450, (1, 624, 1, 1), (624, 1, 1, 1)), buf1142, as_strided(buf426, (1, 624, 1, 1), (624, 1, 1, 1)), as_strided(buf411, (1, 104, 1, 1), (104, 1, 1, 1)), as_strided(buf389, (1, 336, 1, 1), (336, 1, 1, 1)), buf1144, as_strided(buf369, (1, 336, 1, 1), (336, 1, 1, 1)), as_strided(buf359, (1, 56, 1, 1), (56, 1, 1, 1)), as_strided(buf330, (1, 336, 1, 1), (336, 1, 1, 1)), buf1146, as_strided(buf314, (1, 336, 1, 1), (336, 1, 1, 1)), as_strided(buf299, (1, 56, 1, 1), (56, 1, 1, 1)), as_strided(buf270, (1, 336, 1, 1), (336, 1, 1, 1)), buf1148, as_strided(buf254, (1, 336, 1, 1), (336, 1, 1, 1)), as_strided(buf239, (1, 56, 1, 1), (56, 1, 1, 1)), as_strided(buf210, 
(1, 336, 1, 1), (336, 1, 1, 1)), buf1150, as_strided(buf194, (1, 336, 1, 1), (336, 1, 1, 1)), as_strided(buf179, (1, 56, 1, 1), (56, 1, 1, 1)), as_strided(buf157, (1, 240, 1, 1), (240, 1, 1, 1)), buf1152, as_strided(buf133, (1, 240, 1, 1), (240, 1, 1, 1)), as_strided(buf123, (1, 40, 1, 1), (40, 1, 1, 1)), buf1153, as_strided(buf107, (1, 120, 1, 1), (120, 1, 1, 1)), as_strided(buf96, (1, 120, 1, 1), (120, 1, 1, 1)), as_strided(buf80, (1, 40, 1, 1), (40, 1, 1, 1)), buf1154, as_strided(buf64, (1, 192, 1, 1), (192, 1, 1, 1)), buf1155, as_strided(buf45, (1, 192, 1, 1), (192, 1, 1, 1)), as_strided(buf27, (1, 32, 1, 1), (32, 1, 1, 1)), as_strided(buf16, (1, 32, 1, 1), (32, 1, 1, 1)), as_strided(buf5, (1, 32, 1, 1), (32, 1, 1, 1)), ) | |
def benchmark_compiled_module(times=10, repeat=10): | |
from torch._dynamo.testing import rand_strided | |
from torch._inductor.utils import print_performance | |
primals_1 = rand_strided((32, ), (1, ), device='cuda:0', dtype=torch.float32) | |
primals_2 = rand_strided((32, ), (1, ), device='cuda:0', dtype=torch.float32) | |
primals_3 = rand_strided((32, ), (1, ), device='cuda:0', dtype=torch.float32) | |
primals_4 = rand_strided((32, ), (1, ), device='cuda:0', dtype=torch.float32) | |
primals_5 = rand_strided((32, ), (1, ), device='cuda:0', dtype=torch.float32) | |
primals_6 = rand_strided((32, ), (1, ), device='cuda:0', dtype=torch.float32) | |
primals_7 = rand_strided((192, ), (1, ), device='cuda:0', dtype=torch.float32) | |
primals_8 = rand_strided((192, ), (1, ), device='cuda:0', dtype=torch.float32) | |
primals_9 = rand_strided((192, ), (1, ), device='cuda:0', dtype=torch.float32) | |
primals_10 = rand_strided((192, ), (1, ), device='cuda:0', dtype=torch.float32) | |
primals_11 = rand_strided((40, ), (1, ), device='cuda:0', dtype=torch.float32) | |
primals_12 = rand_strided((40, ), (1, ), device='cuda:0', dtype=torch.float32) | |
primals_13 = rand_strided((120, ), (1, ), device='cuda:0', dtype=torch.float32) | |
primals_14 = rand_strided((120, ), (1, ), device='cuda:0', dtype=torch.float32) | |
primals_15 = rand_strided((120, ), (1, ), device='cuda:0', dtype=torch.float32) | |
primals_16 = rand_strided((120, ), (1, ), device='cuda:0', dtype=torch.float32) | |
primals_17 = rand_strided((40, ), (1, ), device='cuda:0', dtype=torch.float32) | |
primals_18 = rand_strided((40, ), (1, ), device='cuda:0', dtype=torch.float32) | |
primals_19 = rand_strided((240, ), (1, ), device='cuda:0', dtype=torch.float32) | |
primals_20 = rand_strided((240, ), (1, ), device='cuda:0', dtype=torch.float32) | |
primals_21 = rand_strided((240, ), (1, ), device='cuda:0', dtype=torch.float32) | |
primals_22 = rand_strided((240, ), (1, ), device='cuda:0', dtype=torch.float32) | |
primals_23 = rand_strided((56, ), (1, ), device='cuda:0', dtype=torch.float32) | |
primals_24 = rand_strided((56, ), (1, ), device='cuda:0', dtype=torch.float32) | |
primals_25 = rand_strided((336, ), (1, ), device='cuda:0', dtype=torch.float32) | |
primals_26 = rand_strided((336, ), (1, ), device='cuda:0', dtype=torch.float32) | |
primals_27 = rand_strided((336, ), (1, ), device='cuda:0', dtype=torch.float32) | |
primals_28 = rand_strided((336, ), (1, ), device='cuda:0', dtype=torch.float32) | |
primals_29 = rand_strided((56, ), (1, ), device='cuda:0', dtype=torch.float32) | |
primals_30 = rand_strided((56, ), (1, ), device='cuda:0', dtype=torch.float32) | |
primals_31 = rand_strided((336, ), (1, ), device='cuda:0', dtype=torch.float32) | |
primals_32 = rand_strided((336, ), (1, ), device='cuda:0', dtype=torch.float32) | |
primals_33 = rand_strided((336, ), (1, ), device='cuda:0', dtype=torch.float32) | |
primals_34 = rand_strided((336, ), (1, ), device='cuda:0', dtype=torch.float32) | |
primals_35 = rand_strided((56, ), (1, ), device='cuda:0', dtype=torch.float32) | |
primals_36 = rand_strided((56, ), (1, ), device='cuda:0', dtype=torch.float32) | |
primals_37 = rand_strided((336, ), (1, ), device='cuda:0', dtype=torch.float32) | |
primals_38 = rand_strided((336, ), (1, ), device='cuda:0', dtype=torch.float32) | |
primals_39 = rand_strided((336, ), (1, ), device='cuda:0', dtype=torch.float32) | |
primals_40 = rand_strided((336, ), (1, ), device='cuda:0', dtype=torch.float32) | |
primals_41 = rand_strided((56, ), (1, ), device='cuda:0', dtype=torch.float32) | |
primals_42 = rand_strided((56, ), (1, ), device='cuda:0', dtype=torch.float32) | |
primals_43 = rand_strided((336, ), (1, ), device='cuda:0', dtype=torch.float32) | |
primals_44 = rand_strided((336, ), (1, ), device='cuda:0', dtype=torch.float32) | |
primals_45 = rand_strided((336, ), (1, ), device='cuda:0', dtype=torch.float32) | |
primals_46 = rand_strided((336, ), (1, ), device='cuda:0', dtype=torch.float32) | |
primals_47 = rand_strided((104, ), (1, ), device='cuda:0', dtype=torch.float32) | |
primals_48 = rand_strided((104, ), (1, ), device='cuda:0', dtype=torch.float32) | |
primals_49 = rand_strided((624, ), (1, ), device='cuda:0', dtype=torch.float32) | |
primals_50 = rand_strided((624, ), (1, ), device='cuda:0', dtype=torch.float32) | |
primals_51 = rand_strided((624, ), (1, ), device='cuda:0', dtype=torch.float32) | |
primals_52 = rand_strided((624, ), (1, ), device='cuda:0', dtype=torch.float32) | |
primals_53 = rand_strided((104, ), (1, ), device='cuda:0', dtype=torch.float32) | |
primals_54 = rand_strided((104, ), (1, ), device='cuda:0', dtype=torch.float32) | |
primals_55 = rand_strided((624, ), (1, ), device='cuda:0', dtype=torch.float32) | |
primals_56 = rand_strided((624, ), (1, ), device='cuda:0', dtype=torch.float32) | |
primals_57 = rand_strided((624, ), (1, ), device='cuda:0', dtype=torch.float32) | |
primals_58 = rand_strided((624, ), (1, ), device='cuda:0', dtype=torch.float32) | |
primals_59 = rand_strided((104, ), (1, ), device='cuda:0', dtype=torch.float32) | |
primals_60 = rand_strided((104, ), (1, ), device='cuda:0', dtype=torch.float32) | |
primals_61 = rand_strided((624, ), (1, ), device='cuda:0', dtype=torch.float32) | |
primals_62 = rand_strided((624, ), (1, ), device='cuda:0', dtype=torch.float32) | |
primals_63 = rand_strided((624, ), (1, ), device='cuda:0', dtype=torch.float32) | |
primals_64 = rand_strided((624, ), (1, ), device='cuda:0', dtype=torch.float32) | |
primals_65 = rand_strided((104, ), (1, ), device='cuda:0', dtype=torch.float32) | |
primals_66 = rand_strided((104, ), (1, ), device='cuda:0', dtype=torch.float32) | |
primals_67 = rand_strided((624, ), (1, ), device='cuda:0', dtype=torch.float32) | |
primals_68 = rand_strided((624, ), (1, ), device='cuda:0', dtype=torch.float32) | |
primals_69 = rand_strided((624, ), (1, ), device='cuda:0', dtype=torch.float32) | |
primals_70 = rand_strided((624, ), (1, ), device='cuda:0', dtype=torch.float32) | |
primals_71 = rand_strided((160, ), (1, ), device='cuda:0', dtype=torch.float32) | |
primals_72 = rand_strided((160, ), (1, ), device='cuda:0', dtype=torch.float32) | |
primals_73 = rand_strided((480, ), (1, ), device='cuda:0', dtype=torch.float32) | |
primals_74 = rand_strided((480, ), (1, ), device='cuda:0', dtype=torch.float32) | |
primals_75 = rand_strided((480, ), (1, ), device='cuda:0', dtype=torch.float32) | |
primals_76 = rand_strided((480, ), (1, ), device='cuda:0', dtype=torch.float32) | |
primals_77 = rand_strided((160, ), (1, ), device='cuda:0', dtype=torch.float32) | |
primals_78 = rand_strided((160, ), (1, ), device='cuda:0', dtype=torch.float32) | |
primals_79 = rand_strided((480, ), (1, ), device='cuda:0', dtype=torch.float32) | |
primals_80 = rand_strided((480, ), (1, ), device='cuda:0', dtype=torch.float32) | |
primals_81 = rand_strided((480, ), (1, ), device='cuda:0', dtype=torch.float32) | |
primals_82 = rand_strided((480, ), (1, ), device='cuda:0', dtype=torch.float32) | |
primals_83 = rand_strided((160, ), (1, ), device='cuda:0', dtype=torch.float32) | |
primals_84 = rand_strided((160, ), (1, ), device='cuda:0', dtype=torch.float32) | |
primals_85 = rand_strided((480, ), (1, ), device='cuda:0', dtype=torch.float32) | |
primals_86 = rand_strided((480, ), (1, ), device='cuda:0', dtype=torch.float32) | |
primals_87 = rand_strided((480, ), (1, ), device='cuda:0', dtype=torch.float32) | |
primals_88 = rand_strided((480, ), (1, ), device='cuda:0', dtype=torch.float32) | |
primals_89 = rand_strided((160, ), (1, ), device='cuda:0', dtype=torch.float32) | |
primals_90 = rand_strided((160, ), (1, ), device='cuda:0', dtype=torch.float32) | |
primals_91 = rand_strided((960, ), (1, ), device='cuda:0', dtype=torch.float32) | |
primals_92 = rand_strided((960, ), (1, ), device='cuda:0', dtype=torch.float32) | |
primals_93 = rand_strided((960, ), (1, ), device='cuda:0', dtype=torch.float32) | |
primals_94 = rand_strided((960, ), (1, ), device='cuda:0', dtype=torch.float32) | |
primals_95 = rand_strided((264, ), (1, ), device='cuda:0', dtype=torch.float32) | |
primals_96 = rand_strided((264, ), (1, ), device='cuda:0', dtype=torch.float32) | |
primals_97 = rand_strided((1584, ), (1, ), device='cuda:0', dtype=torch.float32) | |
primals_98 = rand_strided((1584, ), (1, ), device='cuda:0', dtype=torch.float32) | |
primals_99 = rand_strided((1584, ), (1, ), device='cuda:0', dtype=torch.float32) | |
primals_100 = rand_strided((1584, ), (1, ), device='cuda:0', dtype=torch.float32) | |
primals_101 = rand_strided((264, ), (1, ), device='cuda:0', dtype=torch.float32) | |
primals_102 = rand_strided((264, ), (1, ), device='cuda:0', dtype=torch.float32) | |
primals_103 = rand_strided((1584, ), (1, ), device='cuda:0', dtype=torch.float32) | |
primals_104 = rand_strided((1584, ), (1, ), device='cuda:0', dtype=torch.float32) | |
primals_105 = rand_strided((1584, ), (1, ), device='cuda:0', dtype=torch.float32) | |
primals_106 = rand_strided((1584, ), (1, ), device='cuda:0', dtype=torch.float32) | |
primals_107 = rand_strided((264, ), (1, ), device='cuda:0', dtype=torch.float32) | |
primals_108 = rand_strided((264, ), (1, ), device='cuda:0', dtype=torch.float32) | |
primals_109 = rand_strided((1584, ), (1, ), device='cuda:0', dtype=torch.float32) | |
primals_110 = rand_strided((1584, ), (1, ), device='cuda:0', dtype=torch.float32) | |
primals_111 = rand_strided((1584, ), (1, ), device='cuda:0', dtype=torch.float32) | |
primals_112 = rand_strided((1584, ), (1, ), device='cuda:0', dtype=torch.float32) | |
primals_113 = rand_strided((264, ), (1, ), device='cuda:0', dtype=torch.float32) | |
primals_114 = rand_strided((264, ), (1, ), device='cuda:0', dtype=torch.float32) | |
primals_115 = rand_strided((1536, ), (1, ), device='cuda:0', dtype=torch.float32) | |
primals_116 = rand_strided((1536, ), (1, ), device='cuda:0', dtype=torch.float32) | |
primals_117 = rand_strided((32, 3, 3, 3), (27, 9, 3, 1), device='cuda:0', dtype=torch.float32) | |
primals_118 = rand_strided((32, 1, 3, 3), (9, 9, 3, 1), device='cuda:0', dtype=torch.float32) | |
primals_119 = rand_strided((32, 32, 1, 1), (32, 1, 1, 1), device='cuda:0', dtype=torch.float32) | |
primals_120 = rand_strided((96, 16, 1, 1), (16, 1, 1, 1), device='cuda:0', dtype=torch.float32) | |
primals_121 = rand_strided((96, 16, 1, 1), (16, 1, 1, 1), device='cuda:0', dtype=torch.float32) | |
primals_122 = rand_strided((64, 1, 3, 3), (9, 9, 3, 1), device='cuda:0', dtype=torch.float32) | |
primals_123 = rand_strided((64, 1, 5, 5), (25, 25, 5, 1), device='cuda:0', dty |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment