This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# | |
# | |
# | |
# | |
# | |
# llvm-dwarfdump output for hsaco: | |
# | |
# | |
# |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
$ python python/tutorials/01-vector-add.py | |
// -----// IR Dump Before Inliner (inline) ('builtin.module' operation) //----- // | |
#loc = loc("/data/users/plotfi/triton-ziteng/triton/python/tutorials/01-vector-add.py":30:0) | |
module { | |
tt.func public @add_kernel(%arg0: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("/data/users/plotfi/triton-ziteng/triton/python/tutorials/01-vector-add.py":30:0), %arg1: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("/data/users/plotfi/triton-ziteng/triton/python/tutorials/01-vector-add.py":30:0), %arg2: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("/data/users/plotfi/triton-ziteng/triton/python/tutorials/01-vector-add.py":30:0), %arg3: i32 {tt.divisibility = 16 : i32} loc("/data/users/plotfi/triton-ziteng/triton/python/tutorials/01-vector-add.py":30:0)) attributes {noinline = false} { | |
%pid = tt.get_program_id x : i32 loc(#loc14) | |
%block_start = arith.constant 1024 : i32 loc(#loc15) | |
%block_start_0 = arith.constant 1024 : i32 loc(#loc15) | |
%block_start_1 = arith.extsi % |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import triton | |
import triton.language as tl | |
from triton.compiler.compiler import AttrsDescriptor | |
from torch._inductor.runtime import triton_helpers, triton_heuristics | |
from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math | |
from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties | |
triton_helpers.set_driver_to_gpu() |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import torch | |
import torch._inductor.runtime.triton_helpers as triton_helpers | |
import triton | |
import triton.language as tl | |
from torch._inductor.runtime.triton_helpers import math as tl_math | |
@triton.jit | |
def triton_33(in_out_ptr0, in_ptr0, xnumel, rnumel, XBLOCK: tl.constexpr): | |
xnumel = 1016800 |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
H100, ragged-hstu-attention-b512-h4-d64-v64-bTrue-sparsity0.95-fwd-torch.bfloat16-targetsize20-maxattn0 | |
_ragged_hstu_attn_fwd, seq_len: 256, num_stages: 1, Ragged: 0.288960 | |
BLOCK_M, BLOCK_N, enable_tw_preload, enable_pw_preload, num_warps, num_stages, autotune run #1, autotune run #2, autotune run #3 | |
64, 32, False, False, 4, 4, 0.28891199827194214, 0.28832000494003296, 0.2895680069923401 | |
64, 64, True, True, 4, 2, 0.32230401039123535, 0.3219839930534363, 0.3227519989013672 | |
64, 64, True, True, 4, 4, 0.3280639946460724, 0.3275200128555298, 0.32864001393318176 | |
32, 32, True, True, 4, 4, 0.32927998900413513, 0.32859519124031067, 0.3299199938774109 | |
64, 64, False, False, 4, 4, 0.3339039981365204, 0.3333120048046112, 0.3345920145511627 | |
64, 64, |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Printing ALL Multiple Triton autotuning Configs with timings in sorted order for kernel JITFunction(hammer.generative_recommenders.ops.triton.triton_ragged_hstu_attention:_ragged_hstu_attn_fwd): | |
Triton autotune config: [BLOCK_M: 64, BLOCK_N: 32, enable_tw_preload: False, enable_pw_preload: False, num_warps: 4, num_ctas: 1, num_stages: 4, maxnreg: None]; Triton autotune timing: [0.28891199827194214, 0.28832000494003296, 0.2895680069923401] | |
Triton autotune config: [BLOCK_M: 64, BLOCK_N: 64, enable_tw_preload: True, enable_pw_preload: True, num_warps: 4, num_ctas: 1, num_stages: 2, maxnreg: None]; Triton autotune timing: [0.32230401039123535, 0.3219839930534363, 0.3227519989013672] | |
Triton autotune config: [BLOCK_M: 64, BLOCK_N: 64, enable_tw_preload: True, enable_pw_preload: True, num_warps: 4, num_ctas: 1, num_stages: 4, maxnreg: None]; Triton autotune timing: [0.3280639946460724, 0.3275200128555298, 0.32864001393318176] | |
Triton autotune config: [BLOCK_M: 32, BLOCK_N: 32, enable_tw_preload: True, enable_pw_prel |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
A100, ragged-hstu-attention-b512-h4-d64-v64-bTrue-sparsity0.95-fwd-torch.bfloat16-targetsize20-maxattn0 | |
_ragged_hstu_attn_fwd, seq_len: 256, num_stages: 1, Ragged: 0.541485 | |
BLOCK_M, BLOCK_N, enable_tw_preload, enable_pw_preload, num_warps, num_stages, autotune run #1, autotune run #2, autotune run #3 | |
64, 32, True, True, 4, 4, 0.5415679812431335, 0.5409280061721802, 0.5421760082244873 | |
32, 32, False, False, 4, 4, 0.5554879903793335, 0.5499967932701111, 0.5577791929244995 | |
32, 32, False, False, 4, 2, 0.5859839916229248, 0.5849024057388306, 0.5867136120796204 | |
64, 64, False, False, 4, 2, 0.5874879956245422, 0.5867840051651001, 0.588313639163971 | |
64, 64, False, False, 4, 4, 0.5878080129623413, 0.5868480205535889, 0.5890560150146484 | |
32, 32, True, |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
[plotfi@devgpu007.cco1 ~/fbsource/fbcode (cccdc8f42)]$ buck2 run @mode/opt -c fbcode.disable_re_tests=True //hammer/ops/benchmarks:ragged_hstu_attention_bench -- --bench-backward False | |
Buck UI: https://www.internalfb.com/buck2/21bc2db6-730a-4d63-8394-41757203dd20 | |
Network: Up: 0B Down: 0B | |
Jobs completed: 18825. Time elapsed: 0.8s. | |
BUILD SUCCEEDED | |
/data/users/plotfi/fbsource/buck-out/v2/gen/fbcode/009ebbab256a7e75/hammer/ops/benchmarks/__ragged_hstu_attention_bench__/ragged_hstu_attention_bench-inplace#link-tree/caffe2/torch/fb/model_transform/splitting/split_dispatcher.py:22: FutureWarning: `torch.library.impl_abstract` was renamed to `torch.library.register_fake`. Please use that instead; we will remove `torch.library.impl_abstract` in a future version of PyTorch. | |
return torch.library.impl_abstract(qualname) | |
INFO:aitemplate.backend.build_cache_base:Build cache disabled | |
INFO:fx2ait.extension:===Load non-OSS AITModel=== | |
INFO:2024-08-21 10:04:29 3561882:3561882 CuptiCallbackApi.cpp:78] Callback: domain = 3, |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from torch._inductor.async_compile import AsyncCompile | |
async_compile = AsyncCompile() | |
triton_poi_fused__to_copy_add_0 = async_compile.triton('triton_', ''' | |
import triton | |
import triton.language as tl | |
from triton.compiler.compiler import AttrsDescriptor | |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash | |
################################################################################ | |
echo " ______ _ __ ___ _ __ __ ____ _ __ " | |
echo "/_ __/___(_) /____ ___ / _ )__ __(_) /__/ / / __/_______(_)__ / /_ " | |
echo " / / / __/ / __/ _ \/ _ \ / _ / // / / / _ / _\ \/ __/ __/ / _ \/ __/ " | |
echo "/_/ /_/ /_/\__/\___/_//_/ /____/\_,_/_/_/\_,_/ /___/\__/_/ /_/ .__/\__/ " | |
echo " /_/ " | |
################################################################################ |
NewerOlder