@asmeurer
Last active November 30, 2023 23:01
take decomposition benchmark
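
This is the TORCH_LOGS="output_code" output of a benchmark comparing three ways of running torch.take: an Inductor decomposition (compiled to a Triton kernel), Inductor's default handling (which falls back to aten.take), and eager mode. The input is a float32 tensor of shape (262144,) indexed by 512 int64 indices. The benchmark script itself is not included in the gist; the sketch below is a hypothetical reproduction with illustrative names, not the actual script:

    # Hypothetical reproduction sketch -- not the script that produced this log.
    # Run under TORCH_LOGS="output_code" to get [DEBUG] dumps like the ones below.
    import torch

    def fn(x, idx):
        return torch.take(x, idx)

    x = torch.randn(262144, device="cuda", dtype=torch.float32)
    idx = torch.randint(0, 262144, (512,), device="cuda", dtype=torch.int64)

    compiled = torch.compile(fn)   # Inductor backend by default
    out = compiled(x, idx)         # triggers an output-code dump
    assert torch.equal(out, torch.take(x, idx))
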
(262144,)
[2023-11-30 15:32:07,909] torch._inductor.graph.__output_code: [DEBUG] Output code:
[2023-11-30 15:32:07,909] torch._inductor.graph.__output_code: [DEBUG]
[2023-11-30 15:32:07,909] torch._inductor.graph.__output_code: [DEBUG] from ctypes import c_void_p, c_long
[2023-11-30 15:32:07,909] torch._inductor.graph.__output_code: [DEBUG] import torch
[2023-11-30 15:32:07,909] torch._inductor.graph.__output_code: [DEBUG] import math
[2023-11-30 15:32:07,909] torch._inductor.graph.__output_code: [DEBUG] import random
[2023-11-30 15:32:07,909] torch._inductor.graph.__output_code: [DEBUG] import os
[2023-11-30 15:32:07,909] torch._inductor.graph.__output_code: [DEBUG] import tempfile
[2023-11-30 15:32:07,909] torch._inductor.graph.__output_code: [DEBUG] from math import inf, nan
[2023-11-30 15:32:07,909] torch._inductor.graph.__output_code: [DEBUG] from torch._inductor.hooks import run_intermediate_hooks
[2023-11-30 15:32:07,909] torch._inductor.graph.__output_code: [DEBUG] from torch._inductor.utils import maybe_profile
[2023-11-30 15:32:07,909] torch._inductor.graph.__output_code: [DEBUG] from torch._inductor.codegen.memory_planning import _align as align
[2023-11-30 15:32:07,909] torch._inductor.graph.__output_code: [DEBUG]
[2023-11-30 15:32:07,909] torch._inductor.graph.__output_code: [DEBUG] from torch import device, empty, empty_strided
[2023-11-30 15:32:07,909] torch._inductor.graph.__output_code: [DEBUG] from torch._inductor.codecache import AsyncCompile
[2023-11-30 15:32:07,909] torch._inductor.graph.__output_code: [DEBUG] from torch._inductor.select_algorithm import extern_kernels
[2023-11-30 15:32:07,909] torch._inductor.graph.__output_code: [DEBUG]
[2023-11-30 15:32:07,909] torch._inductor.graph.__output_code: [DEBUG] aten = torch.ops.aten
[2023-11-30 15:32:07,909] torch._inductor.graph.__output_code: [DEBUG] inductor_ops = torch.ops.inductor
[2023-11-30 15:32:07,909] torch._inductor.graph.__output_code: [DEBUG] assert_size_stride = torch._C._dynamo.guards.assert_size_stride
[2023-11-30 15:32:07,909] torch._inductor.graph.__output_code: [DEBUG] alloc_from_pool = torch.ops.inductor._alloc_from_pool
[2023-11-30 15:32:07,909] torch._inductor.graph.__output_code: [DEBUG] reinterpret_tensor = torch.ops.inductor._reinterpret_tensor
[2023-11-30 15:32:07,909] torch._inductor.graph.__output_code: [DEBUG] async_compile = AsyncCompile()
[2023-11-30 15:32:07,909] torch._inductor.graph.__output_code: [DEBUG]
[2023-11-30 15:32:07,909] torch._inductor.graph.__output_code: [DEBUG]
[2023-11-30 15:32:07,909] torch._inductor.graph.__output_code: [DEBUG] # kernel path: /tmp/torchinductor_aaronmeurer/a5/ca5pyuhgulzjwes4oyx2pviqibotyvsjtojgadhijpeyc547sgah.py
[2023-11-30 15:32:07,909] torch._inductor.graph.__output_code: [DEBUG] # Source Nodes: [], Original ATen: []
[2023-11-30 15:32:07,909] torch._inductor.graph.__output_code: [DEBUG]
[2023-11-30 15:32:07,909] torch._inductor.graph.__output_code: [DEBUG] triton_poi_fused_0 = async_compile.triton('triton_', '''
[2023-11-30 15:32:07,909] torch._inductor.graph.__output_code: [DEBUG] import triton
[2023-11-30 15:32:07,909] torch._inductor.graph.__output_code: [DEBUG] import triton.language as tl
[2023-11-30 15:32:07,909] torch._inductor.graph.__output_code: [DEBUG] from torch._inductor.ir import ReductionHint
[2023-11-30 15:32:07,909] torch._inductor.graph.__output_code: [DEBUG] from torch._inductor.ir import TileHint
[2023-11-30 15:32:07,909] torch._inductor.graph.__output_code: [DEBUG] from torch._inductor.triton_heuristics import AutotuneHint, pointwise
[2023-11-30 15:32:07,909] torch._inductor.graph.__output_code: [DEBUG] from torch._inductor.utils import instance_descriptor
[2023-11-30 15:32:07,909] torch._inductor.graph.__output_code: [DEBUG] from torch._inductor import triton_helpers
[2023-11-30 15:32:07,909] torch._inductor.graph.__output_code: [DEBUG]
[2023-11-30 15:32:07,909] torch._inductor.graph.__output_code: [DEBUG] @pointwise(
[2023-11-30 15:32:07,909] torch._inductor.graph.__output_code: [DEBUG] size_hints=[512],
[2023-11-30 15:32:07,909] torch._inductor.graph.__output_code: [DEBUG] filename=__file__,
[2023-11-30 15:32:07,909] torch._inductor.graph.__output_code: [DEBUG] triton_meta={'signature': {0: '*i64', 1: '*fp32', 2: '*fp32', 3: 'i32'}, 'device': 0, 'device_type': 'cuda', 'constants': {}, 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3), equal_to_1=(), ids_of_folded_args=(), divisible_by_8=(3,))]},
[2023-11-30 15:32:07,909] torch._inductor.graph.__output_code: [DEBUG] inductor_meta={'autotune_hints': set(), 'kernel_name': 'triton_poi_fused_0', 'mutated_arg_names': []},
[2023-11-30 15:32:07,909] torch._inductor.graph.__output_code: [DEBUG] min_elem_per_thread=0
[2023-11-30 15:32:07,909] torch._inductor.graph.__output_code: [DEBUG] )
[2023-11-30 15:32:07,909] torch._inductor.graph.__output_code: [DEBUG] @triton.jit
[2023-11-30 15:32:07,909] torch._inductor.graph.__output_code: [DEBUG] def triton_(in_ptr0, in_ptr1, out_ptr0, xnumel, XBLOCK : tl.constexpr):
[2023-11-30 15:32:07,909] torch._inductor.graph.__output_code: [DEBUG] xnumel = 512
[2023-11-30 15:32:07,909] torch._inductor.graph.__output_code: [DEBUG] xoffset = tl.program_id(0) * XBLOCK
[2023-11-30 15:32:07,909] torch._inductor.graph.__output_code: [DEBUG] xindex = xoffset + tl.arange(0, XBLOCK)[:]
[2023-11-30 15:32:07,909] torch._inductor.graph.__output_code: [DEBUG] xmask = xindex < xnumel
[2023-11-30 15:32:07,909] torch._inductor.graph.__output_code: [DEBUG] x0 = xindex
[2023-11-30 15:32:07,909] torch._inductor.graph.__output_code: [DEBUG] tmp0 = tl.load(in_ptr0 + (x0), xmask)
[2023-11-30 15:32:07,909] torch._inductor.graph.__output_code: [DEBUG] tmp1 = tl.full([1], 0, tl.int64)
[2023-11-30 15:32:07,909] torch._inductor.graph.__output_code: [DEBUG] tmp2 = tmp0 < tmp1
[2023-11-30 15:32:07,909] torch._inductor.graph.__output_code: [DEBUG] tmp3 = tl.full([1], 262144, tl.int64)
[2023-11-30 15:32:07,909] torch._inductor.graph.__output_code: [DEBUG] tmp4 = tmp0 + tmp3
[2023-11-30 15:32:07,909] torch._inductor.graph.__output_code: [DEBUG] tmp5 = tl.where(tmp2, tmp4, tmp0)
[2023-11-30 15:32:07,909] torch._inductor.graph.__output_code: [DEBUG] tmp6 = tmp5 + 262144
[2023-11-30 15:32:07,909] torch._inductor.graph.__output_code: [DEBUG] tmp7 = tmp5 < 0
[2023-11-30 15:32:07,909] torch._inductor.graph.__output_code: [DEBUG] tmp8 = tl.where(tmp7, tmp6, tmp5)
[2023-11-30 15:32:07,909] torch._inductor.graph.__output_code: [DEBUG] tl.device_assert((0 <= tmp8) & (tmp8 < 262144), "index out of bounds: 0 <= tmp8 < 262144")
[2023-11-30 15:32:07,909] torch._inductor.graph.__output_code: [DEBUG] tmp9 = tl.load(in_ptr1 + (tmp8), None, eviction_policy='evict_last')
[2023-11-30 15:32:07,909] torch._inductor.graph.__output_code: [DEBUG] tl.store(out_ptr0 + (x0), tmp9, xmask)
[2023-11-30 15:32:07,909] torch._inductor.graph.__output_code: [DEBUG] ''')
[2023-11-30 15:32:07,909] torch._inductor.graph.__output_code: [DEBUG]
[2023-11-30 15:32:07,909] torch._inductor.graph.__output_code: [DEBUG] import triton
[2023-11-30 15:32:07,909] torch._inductor.graph.__output_code: [DEBUG] import triton.language as tl
[2023-11-30 15:32:07,909] torch._inductor.graph.__output_code: [DEBUG] from torch._inductor.triton_heuristics import grid, start_graph, end_graph
[2023-11-30 15:32:07,909] torch._inductor.graph.__output_code: [DEBUG] from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
[2023-11-30 15:32:07,909] torch._inductor.graph.__output_code: [DEBUG]
[2023-11-30 15:32:07,909] torch._inductor.graph.__output_code: [DEBUG]
[2023-11-30 15:32:07,909] torch._inductor.graph.__output_code: [DEBUG] async_compile.wait(globals())
[2023-11-30 15:32:07,909] torch._inductor.graph.__output_code: [DEBUG] del async_compile
[2023-11-30 15:32:07,909] torch._inductor.graph.__output_code: [DEBUG]
[2023-11-30 15:32:07,909] torch._inductor.graph.__output_code: [DEBUG] def call(args):
[2023-11-30 15:32:07,909] torch._inductor.graph.__output_code: [DEBUG] args_1, args_2 = args
[2023-11-30 15:32:07,909] torch._inductor.graph.__output_code: [DEBUG] args.clear()
[2023-11-30 15:32:07,909] torch._inductor.graph.__output_code: [DEBUG] assert_size_stride(args_1, (262144, ), (1, ))
[2023-11-30 15:32:07,909] torch._inductor.graph.__output_code: [DEBUG] assert_size_stride(args_2, (512, ), (1, ))
[2023-11-30 15:32:07,909] torch._inductor.graph.__output_code: [DEBUG] with torch.cuda._DeviceGuard(0):
[2023-11-30 15:32:07,909] torch._inductor.graph.__output_code: [DEBUG] torch.cuda.set_device(0) # no-op to ensure context
[2023-11-30 15:32:07,909] torch._inductor.graph.__output_code: [DEBUG] buf0 = empty((512, ), device='cuda', dtype=torch.float32)
[2023-11-30 15:32:07,909] torch._inductor.graph.__output_code: [DEBUG] # Source Nodes: [], Original ATen: []
[2023-11-30 15:32:07,909] torch._inductor.graph.__output_code: [DEBUG] stream0 = get_cuda_stream(0)
[2023-11-30 15:32:07,909] torch._inductor.graph.__output_code: [DEBUG] triton_poi_fused_0.run(args_2, args_1, buf0, 512, grid=grid(512), stream=stream0)
[2023-11-30 15:32:07,909] torch._inductor.graph.__output_code: [DEBUG] del args_1
[2023-11-30 15:32:07,909] torch._inductor.graph.__output_code: [DEBUG] del args_2
[2023-11-30 15:32:07,909] torch._inductor.graph.__output_code: [DEBUG] return (buf0, )
[2023-11-30 15:32:07,909] torch._inductor.graph.__output_code: [DEBUG]
[2023-11-30 15:32:07,909] torch._inductor.graph.__output_code: [DEBUG]
[2023-11-30 15:32:07,909] torch._inductor.graph.__output_code: [DEBUG] def benchmark_compiled_module(times=10, repeat=10):
[2023-11-30 15:32:07,909] torch._inductor.graph.__output_code: [DEBUG] from torch._dynamo.testing import rand_strided
[2023-11-30 15:32:07,909] torch._inductor.graph.__output_code: [DEBUG] from torch._inductor.utils import print_performance
[2023-11-30 15:32:07,909] torch._inductor.graph.__output_code: [DEBUG] args_1 = rand_strided((262144, ), (1, ), device='cuda:0', dtype=torch.float32)
[2023-11-30 15:32:07,909] torch._inductor.graph.__output_code: [DEBUG] args_2 = rand_strided((512, ), (1, ), device='cuda:0', dtype=torch.int64)
[2023-11-30 15:32:07,909] torch._inductor.graph.__output_code: [DEBUG] return print_performance(lambda: call([args_1, args_2]), times=times, repeat=repeat)
[2023-11-30 15:32:07,909] torch._inductor.graph.__output_code: [DEBUG]
[2023-11-30 15:32:07,909] torch._inductor.graph.__output_code: [DEBUG]
[2023-11-30 15:32:07,909] torch._inductor.graph.__output_code: [DEBUG] if __name__ == "__main__":
[2023-11-30 15:32:07,909] torch._inductor.graph.__output_code: [DEBUG] from torch._inductor.wrapper_benchmark import compiled_module_main
[2023-11-30 15:32:07,909] torch._inductor.graph.__output_code: [DEBUG] compiled_module_main('None', benchmark_compiled_module)
[2023-11-30 15:32:07,909] torch._inductor.graph.__output_code: [DEBUG]
[2023-11-30 15:32:07,909] torch._inductor.graph.__output_code: [INFO] Output code written to: /tmp/torchinductor_aaronmeurer/q4/cq4acxiaqi53ixib6hylu7capcu3ezkrekx67yyivtlb3pfozlfy.py
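
The Triton kernel above (triton_poi_fused_0) is the decomposed take: it wraps negative indices into range and gathers from the flattened input. Note that this first dump normalizes the indices twice (tmp1-tmp5 wrap once, tmp6-tmp8 wrap again) and issues the device_assert and gather load unmasked. In plain PyTorch the computation amounts to the following reference sketch (for reading the kernel; this code is not from the gist):

    # Reference semantics of the kernel above, assuming a 1-D float32 input.
    import torch

    def take_reference(x, idx):
        flat = x.reshape(-1)                          # take indexes the flattened input
        n = flat.numel()                              # 262144 in this run
        wrapped = torch.where(idx < 0, idx + n, idx)  # wrap negative indices once
        assert bool(((wrapped >= 0) & (wrapped < n)).all()), \
            "index out of bounds"                     # mirrors tl.device_assert
        return flat[wrapped]                          # mirrors tl.load(in_ptr1 + tmp8)
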
[2023-11-30 15:32:10,205] torch._inductor.graph.__output_code: [DEBUG] Output code:
[2023-11-30 15:32:10,205] torch._inductor.graph.__output_code: [DEBUG]
[2023-11-30 15:32:10,205] torch._inductor.graph.__output_code: [DEBUG] from ctypes import c_void_p, c_long
[2023-11-30 15:32:10,205] torch._inductor.graph.__output_code: [DEBUG] import torch
[2023-11-30 15:32:10,205] torch._inductor.graph.__output_code: [DEBUG] import math
[2023-11-30 15:32:10,205] torch._inductor.graph.__output_code: [DEBUG] import random
[2023-11-30 15:32:10,205] torch._inductor.graph.__output_code: [DEBUG] import os
[2023-11-30 15:32:10,205] torch._inductor.graph.__output_code: [DEBUG] import tempfile
[2023-11-30 15:32:10,205] torch._inductor.graph.__output_code: [DEBUG] from math import inf, nan
[2023-11-30 15:32:10,205] torch._inductor.graph.__output_code: [DEBUG] from torch._inductor.hooks import run_intermediate_hooks
[2023-11-30 15:32:10,205] torch._inductor.graph.__output_code: [DEBUG] from torch._inductor.utils import maybe_profile
[2023-11-30 15:32:10,205] torch._inductor.graph.__output_code: [DEBUG] from torch._inductor.codegen.memory_planning import _align as align
[2023-11-30 15:32:10,205] torch._inductor.graph.__output_code: [DEBUG]
[2023-11-30 15:32:10,205] torch._inductor.graph.__output_code: [DEBUG] from torch import device, empty, empty_strided
[2023-11-30 15:32:10,205] torch._inductor.graph.__output_code: [DEBUG] from torch._inductor.codecache import AsyncCompile
[2023-11-30 15:32:10,205] torch._inductor.graph.__output_code: [DEBUG] from torch._inductor.select_algorithm import extern_kernels
[2023-11-30 15:32:10,205] torch._inductor.graph.__output_code: [DEBUG]
[2023-11-30 15:32:10,205] torch._inductor.graph.__output_code: [DEBUG] aten = torch.ops.aten
[2023-11-30 15:32:10,205] torch._inductor.graph.__output_code: [DEBUG] inductor_ops = torch.ops.inductor
[2023-11-30 15:32:10,205] torch._inductor.graph.__output_code: [DEBUG] assert_size_stride = torch._C._dynamo.guards.assert_size_stride
[2023-11-30 15:32:10,205] torch._inductor.graph.__output_code: [DEBUG] alloc_from_pool = torch.ops.inductor._alloc_from_pool
[2023-11-30 15:32:10,205] torch._inductor.graph.__output_code: [DEBUG] reinterpret_tensor = torch.ops.inductor._reinterpret_tensor
[2023-11-30 15:32:10,205] torch._inductor.graph.__output_code: [DEBUG] async_compile = AsyncCompile()
[2023-11-30 15:32:10,205] torch._inductor.graph.__output_code: [DEBUG]
[2023-11-30 15:32:10,205] torch._inductor.graph.__output_code: [DEBUG]
[2023-11-30 15:32:10,205] torch._inductor.graph.__output_code: [DEBUG] async_compile.wait(globals())
[2023-11-30 15:32:10,205] torch._inductor.graph.__output_code: [DEBUG] del async_compile
[2023-11-30 15:32:10,205] torch._inductor.graph.__output_code: [DEBUG]
[2023-11-30 15:32:10,205] torch._inductor.graph.__output_code: [DEBUG] def call(args):
[2023-11-30 15:32:10,205] torch._inductor.graph.__output_code: [DEBUG] args_1, args_2 = args
[2023-11-30 15:32:10,205] torch._inductor.graph.__output_code: [DEBUG] args.clear()
[2023-11-30 15:32:10,205] torch._inductor.graph.__output_code: [DEBUG] assert_size_stride(args_1, (262144, ), (1, ))
[2023-11-30 15:32:10,205] torch._inductor.graph.__output_code: [DEBUG] assert_size_stride(args_2, (512, ), (1, ))
[2023-11-30 15:32:10,205] torch._inductor.graph.__output_code: [DEBUG] with torch.cuda._DeviceGuard(0):
[2023-11-30 15:32:10,205] torch._inductor.graph.__output_code: [DEBUG] torch.cuda.set_device(0) # no-op to ensure context
[2023-11-30 15:32:10,205] torch._inductor.graph.__output_code: [DEBUG] # Source Nodes: [], Original ATen: []
[2023-11-30 15:32:10,205] torch._inductor.graph.__output_code: [DEBUG] buf0 = aten.take(args_1, args_2)
[2023-11-30 15:32:10,205] torch._inductor.graph.__output_code: [DEBUG] del args_1
[2023-11-30 15:32:10,205] torch._inductor.graph.__output_code: [DEBUG] del args_2
[2023-11-30 15:32:10,205] torch._inductor.graph.__output_code: [DEBUG] buf1 = buf0
[2023-11-30 15:32:10,205] torch._inductor.graph.__output_code: [DEBUG] return (buf1, )
[2023-11-30 15:32:10,205] torch._inductor.graph.__output_code: [DEBUG]
[2023-11-30 15:32:10,205] torch._inductor.graph.__output_code: [DEBUG]
[2023-11-30 15:32:10,205] torch._inductor.graph.__output_code: [DEBUG] def benchmark_compiled_module(times=10, repeat=10):
[2023-11-30 15:32:10,205] torch._inductor.graph.__output_code: [DEBUG] from torch._dynamo.testing import rand_strided
[2023-11-30 15:32:10,205] torch._inductor.graph.__output_code: [DEBUG] from torch._inductor.utils import print_performance
[2023-11-30 15:32:10,205] torch._inductor.graph.__output_code: [DEBUG] args_1 = rand_strided((262144, ), (1, ), device='cuda:0', dtype=torch.float32)
[2023-11-30 15:32:10,205] torch._inductor.graph.__output_code: [DEBUG] args_2 = rand_strided((512, ), (1, ), device='cuda:0', dtype=torch.int64)
[2023-11-30 15:32:10,205] torch._inductor.graph.__output_code: [DEBUG] return print_performance(lambda: call([args_1, args_2]), times=times, repeat=repeat)
[2023-11-30 15:32:10,205] torch._inductor.graph.__output_code: [DEBUG]
[2023-11-30 15:32:10,205] torch._inductor.graph.__output_code: [DEBUG]
[2023-11-30 15:32:10,205] torch._inductor.graph.__output_code: [DEBUG] if __name__ == "__main__":
[2023-11-30 15:32:10,205] torch._inductor.graph.__output_code: [DEBUG] from torch._inductor.wrapper_benchmark import compiled_module_main
[2023-11-30 15:32:10,205] torch._inductor.graph.__output_code: [DEBUG] compiled_module_main('None', benchmark_compiled_module)
[2023-11-30 15:32:10,205] torch._inductor.graph.__output_code: [DEBUG]
[2023-11-30 15:32:10,205] torch._inductor.graph.__output_code: [INFO] Output code written to: /tmp/torchinductor_aaronmeurer/dc/cdckfb6dlcxhn2bv6jzgsxwlp5ectha6f3dkh23i2e2whg4ztkuf.py
[----------------------- take ----------------------]
             |  Decomposed  |  Lowering  |  Eager
1 threads: ------------------------------------------
  (262144,)  |      18      |     14     |    11

Times are in microseconds (us).
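
For this size the decomposition (18 us) is slower than both the aten.take fallback (14 us) and eager (11 us). The second run below (16:00) repeats the same benchmark; its decomposed kernel differs from the first dump: it wraps negative indices only once (tmp1-tmp3) and masks both the device_assert and the gather load with xmask.
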
(262144,)
[2023-11-30 16:00:19,972] torch._inductor.graph.__output_code: [DEBUG] Output code:
[2023-11-30 16:00:19,972] torch._inductor.graph.__output_code: [DEBUG]
[2023-11-30 16:00:19,972] torch._inductor.graph.__output_code: [DEBUG] from ctypes import c_void_p, c_long
[2023-11-30 16:00:19,972] torch._inductor.graph.__output_code: [DEBUG] import torch
[2023-11-30 16:00:19,972] torch._inductor.graph.__output_code: [DEBUG] import math
[2023-11-30 16:00:19,972] torch._inductor.graph.__output_code: [DEBUG] import random
[2023-11-30 16:00:19,972] torch._inductor.graph.__output_code: [DEBUG] import os
[2023-11-30 16:00:19,972] torch._inductor.graph.__output_code: [DEBUG] import tempfile
[2023-11-30 16:00:19,972] torch._inductor.graph.__output_code: [DEBUG] from math import inf, nan
[2023-11-30 16:00:19,972] torch._inductor.graph.__output_code: [DEBUG] from torch._inductor.hooks import run_intermediate_hooks
[2023-11-30 16:00:19,972] torch._inductor.graph.__output_code: [DEBUG] from torch._inductor.utils import maybe_profile
[2023-11-30 16:00:19,972] torch._inductor.graph.__output_code: [DEBUG] from torch._inductor.codegen.memory_planning import _align as align
[2023-11-30 16:00:19,972] torch._inductor.graph.__output_code: [DEBUG]
[2023-11-30 16:00:19,972] torch._inductor.graph.__output_code: [DEBUG] from torch import device, empty, empty_strided
[2023-11-30 16:00:19,972] torch._inductor.graph.__output_code: [DEBUG] from torch._inductor.codecache import AsyncCompile
[2023-11-30 16:00:19,972] torch._inductor.graph.__output_code: [DEBUG] from torch._inductor.select_algorithm import extern_kernels
[2023-11-30 16:00:19,972] torch._inductor.graph.__output_code: [DEBUG]
[2023-11-30 16:00:19,972] torch._inductor.graph.__output_code: [DEBUG] aten = torch.ops.aten
[2023-11-30 16:00:19,972] torch._inductor.graph.__output_code: [DEBUG] inductor_ops = torch.ops.inductor
[2023-11-30 16:00:19,972] torch._inductor.graph.__output_code: [DEBUG] assert_size_stride = torch._C._dynamo.guards.assert_size_stride
[2023-11-30 16:00:19,972] torch._inductor.graph.__output_code: [DEBUG] alloc_from_pool = torch.ops.inductor._alloc_from_pool
[2023-11-30 16:00:19,972] torch._inductor.graph.__output_code: [DEBUG] reinterpret_tensor = torch.ops.inductor._reinterpret_tensor
[2023-11-30 16:00:19,972] torch._inductor.graph.__output_code: [DEBUG] async_compile = AsyncCompile()
[2023-11-30 16:00:19,972] torch._inductor.graph.__output_code: [DEBUG]
[2023-11-30 16:00:19,972] torch._inductor.graph.__output_code: [DEBUG]
[2023-11-30 16:00:19,972] torch._inductor.graph.__output_code: [DEBUG] # kernel path: /tmp/torchinductor_aaronmeurer/fw/cfw5p7jmks5a526rt52rto3m3i47anqafzqnwokhxvjqgbdkxxyu.py
[2023-11-30 16:00:19,972] torch._inductor.graph.__output_code: [DEBUG] # Source Nodes: [], Original ATen: []
[2023-11-30 16:00:19,972] torch._inductor.graph.__output_code: [DEBUG]
[2023-11-30 16:00:19,972] torch._inductor.graph.__output_code: [DEBUG] triton_poi_fused_0 = async_compile.triton('triton_', '''
[2023-11-30 16:00:19,972] torch._inductor.graph.__output_code: [DEBUG] import triton
[2023-11-30 16:00:19,972] torch._inductor.graph.__output_code: [DEBUG] import triton.language as tl
[2023-11-30 16:00:19,972] torch._inductor.graph.__output_code: [DEBUG] from torch._inductor.ir import ReductionHint
[2023-11-30 16:00:19,972] torch._inductor.graph.__output_code: [DEBUG] from torch._inductor.ir import TileHint
[2023-11-30 16:00:19,972] torch._inductor.graph.__output_code: [DEBUG] from torch._inductor.triton_heuristics import AutotuneHint, pointwise
[2023-11-30 16:00:19,972] torch._inductor.graph.__output_code: [DEBUG] from torch._inductor.utils import instance_descriptor
[2023-11-30 16:00:19,972] torch._inductor.graph.__output_code: [DEBUG] from torch._inductor import triton_helpers
[2023-11-30 16:00:19,972] torch._inductor.graph.__output_code: [DEBUG]
[2023-11-30 16:00:19,972] torch._inductor.graph.__output_code: [DEBUG] @pointwise(
[2023-11-30 16:00:19,972] torch._inductor.graph.__output_code: [DEBUG] size_hints=[512],
[2023-11-30 16:00:19,972] torch._inductor.graph.__output_code: [DEBUG] filename=__file__,
[2023-11-30 16:00:19,972] torch._inductor.graph.__output_code: [DEBUG] triton_meta={'signature': {0: '*i64', 1: '*fp32', 2: '*fp32', 3: 'i32'}, 'device': 0, 'device_type': 'cuda', 'constants': {}, 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3), equal_to_1=(), ids_of_folded_args=(), divisible_by_8=(3,))]},
[2023-11-30 16:00:19,972] torch._inductor.graph.__output_code: [DEBUG] inductor_meta={'autotune_hints': set(), 'kernel_name': 'triton_poi_fused_0', 'mutated_arg_names': []},
[2023-11-30 16:00:19,972] torch._inductor.graph.__output_code: [DEBUG] min_elem_per_thread=0
[2023-11-30 16:00:19,972] torch._inductor.graph.__output_code: [DEBUG] )
[2023-11-30 16:00:19,972] torch._inductor.graph.__output_code: [DEBUG] @triton.jit
[2023-11-30 16:00:19,972] torch._inductor.graph.__output_code: [DEBUG] def triton_(in_ptr0, in_ptr1, out_ptr0, xnumel, XBLOCK : tl.constexpr):
[2023-11-30 16:00:19,972] torch._inductor.graph.__output_code: [DEBUG] xnumel = 512
[2023-11-30 16:00:19,972] torch._inductor.graph.__output_code: [DEBUG] xoffset = tl.program_id(0) * XBLOCK
[2023-11-30 16:00:19,972] torch._inductor.graph.__output_code: [DEBUG] xindex = xoffset + tl.arange(0, XBLOCK)[:]
[2023-11-30 16:00:19,972] torch._inductor.graph.__output_code: [DEBUG] xmask = xindex < xnumel
[2023-11-30 16:00:19,972] torch._inductor.graph.__output_code: [DEBUG] x0 = xindex
[2023-11-30 16:00:19,972] torch._inductor.graph.__output_code: [DEBUG] tmp0 = tl.load(in_ptr0 + (x0), xmask)
[2023-11-30 16:00:19,972] torch._inductor.graph.__output_code: [DEBUG] tmp1 = tmp0 + 262144
[2023-11-30 16:00:19,972] torch._inductor.graph.__output_code: [DEBUG] tmp2 = tmp0 < 0
[2023-11-30 16:00:19,972] torch._inductor.graph.__output_code: [DEBUG] tmp3 = tl.where(tmp2, tmp1, tmp0)
[2023-11-30 16:00:19,972] torch._inductor.graph.__output_code: [DEBUG] tl.device_assert(((0 <= tmp3) & (tmp3 < 262144)) | ~xmask, "index out of bounds: 0 <= tmp3 < 262144")
[2023-11-30 16:00:19,972] torch._inductor.graph.__output_code: [DEBUG] tmp4 = tl.load(in_ptr1 + (tmp3), xmask, eviction_policy='evict_last')
[2023-11-30 16:00:19,972] torch._inductor.graph.__output_code: [DEBUG] tl.store(out_ptr0 + (x0), tmp4, xmask)
[2023-11-30 16:00:19,972] torch._inductor.graph.__output_code: [DEBUG] ''')
[2023-11-30 16:00:19,972] torch._inductor.graph.__output_code: [DEBUG]
[2023-11-30 16:00:19,972] torch._inductor.graph.__output_code: [DEBUG] import triton
[2023-11-30 16:00:19,972] torch._inductor.graph.__output_code: [DEBUG] import triton.language as tl
[2023-11-30 16:00:19,972] torch._inductor.graph.__output_code: [DEBUG] from torch._inductor.triton_heuristics import grid, start_graph, end_graph
[2023-11-30 16:00:19,972] torch._inductor.graph.__output_code: [DEBUG] from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
[2023-11-30 16:00:19,972] torch._inductor.graph.__output_code: [DEBUG]
[2023-11-30 16:00:19,972] torch._inductor.graph.__output_code: [DEBUG]
[2023-11-30 16:00:19,972] torch._inductor.graph.__output_code: [DEBUG] async_compile.wait(globals())
[2023-11-30 16:00:19,972] torch._inductor.graph.__output_code: [DEBUG] del async_compile
[2023-11-30 16:00:19,972] torch._inductor.graph.__output_code: [DEBUG]
[2023-11-30 16:00:19,972] torch._inductor.graph.__output_code: [DEBUG] def call(args):
[2023-11-30 16:00:19,972] torch._inductor.graph.__output_code: [DEBUG] args_1, args_2 = args
[2023-11-30 16:00:19,972] torch._inductor.graph.__output_code: [DEBUG] args.clear()
[2023-11-30 16:00:19,972] torch._inductor.graph.__output_code: [DEBUG] assert_size_stride(args_1, (262144, ), (1, ))
[2023-11-30 16:00:19,972] torch._inductor.graph.__output_code: [DEBUG] assert_size_stride(args_2, (512, ), (1, ))
[2023-11-30 16:00:19,972] torch._inductor.graph.__output_code: [DEBUG] with torch.cuda._DeviceGuard(0):
[2023-11-30 16:00:19,972] torch._inductor.graph.__output_code: [DEBUG] torch.cuda.set_device(0) # no-op to ensure context
[2023-11-30 16:00:19,972] torch._inductor.graph.__output_code: [DEBUG] buf0 = empty((512, ), device='cuda', dtype=torch.float32)
[2023-11-30 16:00:19,972] torch._inductor.graph.__output_code: [DEBUG] # Source Nodes: [], Original ATen: []
[2023-11-30 16:00:19,972] torch._inductor.graph.__output_code: [DEBUG] stream0 = get_cuda_stream(0)
[2023-11-30 16:00:19,972] torch._inductor.graph.__output_code: [DEBUG] triton_poi_fused_0.run(args_2, args_1, buf0, 512, grid=grid(512), stream=stream0)
[2023-11-30 16:00:19,972] torch._inductor.graph.__output_code: [DEBUG] del args_1
[2023-11-30 16:00:19,972] torch._inductor.graph.__output_code: [DEBUG] del args_2
[2023-11-30 16:00:19,972] torch._inductor.graph.__output_code: [DEBUG] return (buf0, )
[2023-11-30 16:00:19,972] torch._inductor.graph.__output_code: [DEBUG]
[2023-11-30 16:00:19,972] torch._inductor.graph.__output_code: [DEBUG]
[2023-11-30 16:00:19,972] torch._inductor.graph.__output_code: [DEBUG] def benchmark_compiled_module(times=10, repeat=10):
[2023-11-30 16:00:19,972] torch._inductor.graph.__output_code: [DEBUG] from torch._dynamo.testing import rand_strided
[2023-11-30 16:00:19,972] torch._inductor.graph.__output_code: [DEBUG] from torch._inductor.utils import print_performance
[2023-11-30 16:00:19,972] torch._inductor.graph.__output_code: [DEBUG] args_1 = rand_strided((262144, ), (1, ), device='cuda:0', dtype=torch.float32)
[2023-11-30 16:00:19,972] torch._inductor.graph.__output_code: [DEBUG] args_2 = rand_strided((512, ), (1, ), device='cuda:0', dtype=torch.int64)
[2023-11-30 16:00:19,972] torch._inductor.graph.__output_code: [DEBUG] return print_performance(lambda: call([args_1, args_2]), times=times, repeat=repeat)
[2023-11-30 16:00:19,972] torch._inductor.graph.__output_code: [DEBUG]
[2023-11-30 16:00:19,972] torch._inductor.graph.__output_code: [DEBUG]
[2023-11-30 16:00:19,972] torch._inductor.graph.__output_code: [DEBUG] if __name__ == "__main__":
[2023-11-30 16:00:19,972] torch._inductor.graph.__output_code: [DEBUG] from torch._inductor.wrapper_benchmark import compiled_module_main
[2023-11-30 16:00:19,972] torch._inductor.graph.__output_code: [DEBUG] compiled_module_main('None', benchmark_compiled_module)
[2023-11-30 16:00:19,972] torch._inductor.graph.__output_code: [DEBUG]
[2023-11-30 16:00:19,972] torch._inductor.graph.__output_code: [INFO] Output code written to: /tmp/torchinductor_aaronmeurer/se/cseo4jhcpr2ugwbf5ewapghbxwb6pdonaztarannlhupvwril2lj.py
[2023-11-30 16:00:22,268] torch._inductor.graph.__output_code: [DEBUG] Output code:
[2023-11-30 16:00:22,268] torch._inductor.graph.__output_code: [DEBUG]
[2023-11-30 16:00:22,268] torch._inductor.graph.__output_code: [DEBUG] from ctypes import c_void_p, c_long
[2023-11-30 16:00:22,268] torch._inductor.graph.__output_code: [DEBUG] import torch
[2023-11-30 16:00:22,268] torch._inductor.graph.__output_code: [DEBUG] import math
[2023-11-30 16:00:22,268] torch._inductor.graph.__output_code: [DEBUG] import random
[2023-11-30 16:00:22,268] torch._inductor.graph.__output_code: [DEBUG] import os
[2023-11-30 16:00:22,268] torch._inductor.graph.__output_code: [DEBUG] import tempfile
[2023-11-30 16:00:22,268] torch._inductor.graph.__output_code: [DEBUG] from math import inf, nan
[2023-11-30 16:00:22,268] torch._inductor.graph.__output_code: [DEBUG] from torch._inductor.hooks import run_intermediate_hooks
[2023-11-30 16:00:22,268] torch._inductor.graph.__output_code: [DEBUG] from torch._inductor.utils import maybe_profile
[2023-11-30 16:00:22,268] torch._inductor.graph.__output_code: [DEBUG] from torch._inductor.codegen.memory_planning import _align as align
[2023-11-30 16:00:22,268] torch._inductor.graph.__output_code: [DEBUG]
[2023-11-30 16:00:22,268] torch._inductor.graph.__output_code: [DEBUG] from torch import device, empty, empty_strided
[2023-11-30 16:00:22,268] torch._inductor.graph.__output_code: [DEBUG] from torch._inductor.codecache import AsyncCompile
[2023-11-30 16:00:22,268] torch._inductor.graph.__output_code: [DEBUG] from torch._inductor.select_algorithm import extern_kernels
[2023-11-30 16:00:22,268] torch._inductor.graph.__output_code: [DEBUG]
[2023-11-30 16:00:22,268] torch._inductor.graph.__output_code: [DEBUG] aten = torch.ops.aten
[2023-11-30 16:00:22,268] torch._inductor.graph.__output_code: [DEBUG] inductor_ops = torch.ops.inductor
[2023-11-30 16:00:22,268] torch._inductor.graph.__output_code: [DEBUG] assert_size_stride = torch._C._dynamo.guards.assert_size_stride
[2023-11-30 16:00:22,268] torch._inductor.graph.__output_code: [DEBUG] alloc_from_pool = torch.ops.inductor._alloc_from_pool
[2023-11-30 16:00:22,268] torch._inductor.graph.__output_code: [DEBUG] reinterpret_tensor = torch.ops.inductor._reinterpret_tensor
[2023-11-30 16:00:22,268] torch._inductor.graph.__output_code: [DEBUG] async_compile = AsyncCompile()
[2023-11-30 16:00:22,268] torch._inductor.graph.__output_code: [DEBUG]
[2023-11-30 16:00:22,268] torch._inductor.graph.__output_code: [DEBUG]
[2023-11-30 16:00:22,268] torch._inductor.graph.__output_code: [DEBUG] async_compile.wait(globals())
[2023-11-30 16:00:22,268] torch._inductor.graph.__output_code: [DEBUG] del async_compile
[2023-11-30 16:00:22,268] torch._inductor.graph.__output_code: [DEBUG]
[2023-11-30 16:00:22,268] torch._inductor.graph.__output_code: [DEBUG] def call(args):
[2023-11-30 16:00:22,268] torch._inductor.graph.__output_code: [DEBUG] args_1, args_2 = args
[2023-11-30 16:00:22,268] torch._inductor.graph.__output_code: [DEBUG] args.clear()
[2023-11-30 16:00:22,268] torch._inductor.graph.__output_code: [DEBUG] assert_size_stride(args_1, (262144, ), (1, ))
[2023-11-30 16:00:22,268] torch._inductor.graph.__output_code: [DEBUG] assert_size_stride(args_2, (512, ), (1, ))
[2023-11-30 16:00:22,268] torch._inductor.graph.__output_code: [DEBUG] with torch.cuda._DeviceGuard(0):
[2023-11-30 16:00:22,268] torch._inductor.graph.__output_code: [DEBUG] torch.cuda.set_device(0) # no-op to ensure context
[2023-11-30 16:00:22,268] torch._inductor.graph.__output_code: [DEBUG] # Source Nodes: [], Original ATen: []
[2023-11-30 16:00:22,268] torch._inductor.graph.__output_code: [DEBUG] buf0 = aten.take(args_1, args_2)
[2023-11-30 16:00:22,268] torch._inductor.graph.__output_code: [DEBUG] del args_1
[2023-11-30 16:00:22,268] torch._inductor.graph.__output_code: [DEBUG] del args_2
[2023-11-30 16:00:22,268] torch._inductor.graph.__output_code: [DEBUG] buf1 = buf0
[2023-11-30 16:00:22,268] torch._inductor.graph.__output_code: [DEBUG] return (buf1, )
[2023-11-30 16:00:22,268] torch._inductor.graph.__output_code: [DEBUG]
[2023-11-30 16:00:22,268] torch._inductor.graph.__output_code: [DEBUG]
[2023-11-30 16:00:22,268] torch._inductor.graph.__output_code: [DEBUG] def benchmark_compiled_module(times=10, repeat=10):
[2023-11-30 16:00:22,268] torch._inductor.graph.__output_code: [DEBUG] from torch._dynamo.testing import rand_strided
[2023-11-30 16:00:22,268] torch._inductor.graph.__output_code: [DEBUG] from torch._inductor.utils import print_performance
[2023-11-30 16:00:22,268] torch._inductor.graph.__output_code: [DEBUG] args_1 = rand_strided((262144, ), (1, ), device='cuda:0', dtype=torch.float32)
[2023-11-30 16:00:22,268] torch._inductor.graph.__output_code: [DEBUG] args_2 = rand_strided((512, ), (1, ), device='cuda:0', dtype=torch.int64)
[2023-11-30 16:00:22,268] torch._inductor.graph.__output_code: [DEBUG] return print_performance(lambda: call([args_1, args_2]), times=times, repeat=repeat)
[2023-11-30 16:00:22,268] torch._inductor.graph.__output_code: [DEBUG]
[2023-11-30 16:00:22,268] torch._inductor.graph.__output_code: [DEBUG]
[2023-11-30 16:00:22,268] torch._inductor.graph.__output_code: [DEBUG] if __name__ == "__main__":
[2023-11-30 16:00:22,268] torch._inductor.graph.__output_code: [DEBUG] from torch._inductor.wrapper_benchmark import compiled_module_main
[2023-11-30 16:00:22,268] torch._inductor.graph.__output_code: [DEBUG] compiled_module_main('None', benchmark_compiled_module)
[2023-11-30 16:00:22,268] torch._inductor.graph.__output_code: [DEBUG]
[2023-11-30 16:00:22,268] torch._inductor.graph.__output_code: [INFO] Output code written to: /tmp/torchinductor_aaronmeurer/dc/cdckfb6dlcxhn2bv6jzgsxwlp5ectha6f3dkh23i2e2whg4ztkuf.py
[----------------------- take ----------------------]
             |  Decomposed  |  Lowering  |  Eager
1 threads: ------------------------------------------
  (262144,)  |      18      |     14     |    12

Times are in microseconds (us).