plotfi / gist:333fb05045e96a35e2600bf1e7a076d7
Created August 7, 2025 20:19
softmax_kernel AMD Debug Info
# llvm-dwarfdump output for hsaco:
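For reference, a minimal sketch of one way to reproduce that llvm-dwarfdump output; the cache location and file layout below are assumptions about Triton's on-disk cache, not something this gist specifies:

import glob
import os
import subprocess

# Assumed default Triton cache location; compiled AMD kernels are cached as .hsaco code objects.
cache_dir = os.path.expanduser("~/.triton/cache")
for hsaco in glob.glob(os.path.join(cache_dir, "**", "*.hsaco"), recursive=True):
    # Dump the DWARF debug info embedded in the code object.
    subprocess.run(["llvm-dwarfdump", "--debug-info", hsaco], check=False)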
$ python python/tutorials/01-vector-add.py
// -----// IR Dump Before Inliner (inline) ('builtin.module' operation) //----- //
#loc = loc("/data/users/plotfi/triton-ziteng/triton/python/tutorials/01-vector-add.py":30:0)
module {
tt.func public @add_kernel(%arg0: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("/data/users/plotfi/triton-ziteng/triton/python/tutorials/01-vector-add.py":30:0), %arg1: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("/data/users/plotfi/triton-ziteng/triton/python/tutorials/01-vector-add.py":30:0), %arg2: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("/data/users/plotfi/triton-ziteng/triton/python/tutorials/01-vector-add.py":30:0), %arg3: i32 {tt.divisibility = 16 : i32} loc("/data/users/plotfi/triton-ziteng/triton/python/tutorials/01-vector-add.py":30:0)) attributes {noinline = false} {
%pid = tt.get_program_id x : i32 loc(#loc14)
%block_start = arith.constant 1024 : i32 loc(#loc15)
%block_start_0 = arith.constant 1024 : i32 loc(#loc15)
%block_start_1 = arith.extsi %
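The dump above is the front-end IR for the kernel in the upstream Triton tutorial python/tutorials/01-vector-add.py; a minimal sketch of that kernel, with argument names following the tutorial and BLOCK_SIZE=1024 matching the constants in the dump:

import triton
import triton.language as tl

@triton.jit
def add_kernel(x_ptr, y_ptr, output_ptr, n_elements, BLOCK_SIZE: tl.constexpr):
    pid = tl.program_id(axis=0)               # tt.get_program_id x in the dump
    block_start = pid * BLOCK_SIZE            # the arith.constant 1024 / arith.extsi sequence
    offsets = block_start + tl.arange(0, BLOCK_SIZE)
    mask = offsets < n_elements
    x = tl.load(x_ptr + offsets, mask=mask)
    y = tl.load(y_ptr + offsets, mask=mask)
    tl.store(output_ptr + offsets, x + y, mask=mask)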
plotfi / triton_per_fused_clamp_mul_native_layer_norm_sigmoid_2_unified.py
Created March 24, 2025 19:50
triton_per_fused_clamp_mul_native_layer_norm_sigmoid_2_unified.py
import triton
import triton.language as tl
from triton.compiler.compiler import AttrsDescriptor
from torch._inductor.runtime import triton_helpers, triton_heuristics
from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
triton_helpers.set_driver_to_gpu()
import torch
import torch._inductor.runtime.triton_helpers as triton_helpers
import triton
import triton.language as tl
from torch._inductor.runtime.triton_helpers import math as tl_math
@triton.jit
def triton_33(in_out_ptr0, in_ptr0, xnumel, rnumel, XBLOCK: tl.constexpr):
xnumel = 1016800
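The generated body is cut off above. As a rough illustration only, here is a hand-written Triton sketch of the kind of persistent-reduction layer-norm + sigmoid + clamp + mul fusion the kernel name suggests; the op order, epsilon, RBLOCK, and indexing are assumptions, not the actual Inductor output:

import triton
import triton.language as tl

@triton.jit
def per_fused_layer_norm_sigmoid_clamp_mul(in_out_ptr0, in_ptr0, xnumel, rnumel,
                                           XBLOCK: tl.constexpr, RBLOCK: tl.constexpr):
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rindex = tl.arange(0, RBLOCK)[None, :]
    rmask = rindex < rnumel
    # One reduction row of length rnumel per x element.
    x = tl.load(in_ptr0 + (rindex + rnumel * xindex), mask=rmask & xmask, other=0.0).to(tl.float32)
    mean = tl.sum(x, axis=1)[:, None] / rnumel
    var = tl.sum((x - mean) * (x - mean), axis=1)[:, None] / rnumel
    y = (x - mean) / tl.sqrt(var + 1e-5)      # native_layer_norm (no affine params shown)
    y = tl.sigmoid(y)                         # sigmoid
    y = tl.minimum(tl.maximum(y, 0.0), 1.0)   # clamp
    y = y * x                                 # mul
    tl.store(in_out_ptr0 + (rindex + rnumel * xindex), y, mask=rmask & xmask)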
H100, ragged-hstu-attention-b512-h4-d64-v64-bTrue-sparsity0.95-fwd-torch.bfloat16-targetsize20-maxattn0
_ragged_hstu_attn_fwd, seq_len: 256, num_stages: 1, Ragged: 0.288960
BLOCK_M, BLOCK_N, enable_tw_preload, enable_pw_preload, num_warps, num_stages, autotune run #1, autotune run #2, autotune run #3
64, 32, False, False, 4, 4, 0.28891199827194214, 0.28832000494003296, 0.2895680069923401
64, 64, True, True, 4, 2, 0.32230401039123535, 0.3219839930534363, 0.3227519989013672
64, 64, True, True, 4, 4, 0.3280639946460724, 0.3275200128555298, 0.32864001393318176
32, 32, True, True, 4, 4, 0.32927998900413513, 0.32859519124031067, 0.3299199938774109
64, 64, False, False, 4, 4, 0.3339039981365204, 0.3333120048046112, 0.3345920145511627
64, 64,
Printing ALL Multiple Triton autotuning Configs with timings in sorted order for kernel JITFunction(hammer.generative_recommenders.ops.triton.triton_ragged_hstu_attention:_ragged_hstu_attn_fwd):
Triton autotune config: [BLOCK_M: 64, BLOCK_N: 32, enable_tw_preload: False, enable_pw_preload: False, num_warps: 4, num_ctas: 1, num_stages: 4, maxnreg: None]; Triton autotune timing: [0.28891199827194214, 0.28832000494003296, 0.2895680069923401]
Triton autotune config: [BLOCK_M: 64, BLOCK_N: 64, enable_tw_preload: True, enable_pw_preload: True, num_warps: 4, num_ctas: 1, num_stages: 2, maxnreg: None]; Triton autotune timing: [0.32230401039123535, 0.3219839930534363, 0.3227519989013672]
Triton autotune config: [BLOCK_M: 64, BLOCK_N: 64, enable_tw_preload: True, enable_pw_preload: True, num_warps: 4, num_ctas: 1, num_stages: 4, maxnreg: None]; Triton autotune timing: [0.3280639946460724, 0.3275200128555298, 0.32864001393318176]
Triton autotune config: [BLOCK_M: 32, BLOCK_N: 32, enable_tw_preload: True, enable_pw_prel
A100, ragged-hstu-attention-b512-h4-d64-v64-bTrue-sparsity0.95-fwd-torch.bfloat16-targetsize20-maxattn0
_ragged_hstu_attn_fwd, seq_len: 256, num_stages: 1, Ragged: 0.541485
BLOCK_M, BLOCK_N, enable_tw_preload, enable_pw_preload, num_warps, num_stages, autotune run #1, autotune run #2, autotune run #3
64, 32, True, True, 4, 4, 0.5415679812431335, 0.5409280061721802, 0.5421760082244873
32, 32, False, False, 4, 4, 0.5554879903793335, 0.5499967932701111, 0.5577791929244995
32, 32, False, False, 4, 2, 0.5859839916229248, 0.5849024057388306, 0.5867136120796204
64, 64, False, False, 4, 2, 0.5874879956245422, 0.5867840051651001, 0.588313639163971
64, 64, False, False, 4, 4, 0.5878080129623413, 0.5868480205535889, 0.5890560150146484
32, 32, True,
[plotfi@devgpu007.cco1 ~/fbsource/fbcode (cccdc8f42)]$ buck2 run @mode/opt -c fbcode.disable_re_tests=True //hammer/ops/benchmarks:ragged_hstu_attention_bench -- --bench-backward False
Buck UI: https://www.internalfb.com/buck2/21bc2db6-730a-4d63-8394-41757203dd20
Network: Up: 0B Down: 0B
Jobs completed: 18825. Time elapsed: 0.8s.
BUILD SUCCEEDED
/data/users/plotfi/fbsource/buck-out/v2/gen/fbcode/009ebbab256a7e75/hammer/ops/benchmarks/__ragged_hstu_attention_bench__/ragged_hstu_attention_bench-inplace#link-tree/caffe2/torch/fb/model_transform/splitting/split_dispatcher.py:22: FutureWarning: `torch.library.impl_abstract` was renamed to `torch.library.register_fake`. Please use that instead; we will remove `torch.library.impl_abstract` in a future version of PyTorch.
return torch.library.impl_abstract(qualname)
INFO:aitemplate.backend.build_cache_base:Build cache disabled
INFO:fx2ait.extension:===Load non-OSS AITModel===
INFO:2024-08-21 10:04:29 3561882:3561882 CuptiCallbackApi.cpp:78] Callback: domain = 3,
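For reference, a hedged sketch of how a config table like the ones above is typically declared with triton.autotune; the key, kernel name, and body below are placeholders, not the actual _ragged_hstu_attn_fwd implementation:

import triton
import triton.language as tl

configs = [
    triton.Config({"BLOCK_M": 64, "BLOCK_N": 32, "enable_tw_preload": False, "enable_pw_preload": False},
                  num_warps=4, num_stages=4),
    triton.Config({"BLOCK_M": 64, "BLOCK_N": 64, "enable_tw_preload": True, "enable_pw_preload": True},
                  num_warps=4, num_stages=2),
    triton.Config({"BLOCK_M": 32, "BLOCK_N": 32, "enable_tw_preload": True, "enable_pw_preload": True},
                  num_warps=4, num_stages=4),
]

@triton.autotune(configs=configs, key=["seq_len"])
@triton.jit
def _attn_fwd_stub(out_ptr, seq_len,
                   BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr,
                   enable_tw_preload: tl.constexpr, enable_pw_preload: tl.constexpr):
    # Body elided; only the autotune surface matters for reading the tables above.
    pid = tl.program_id(0)
    tl.store(out_ptr + pid, seq_len)

Each config is timed (three runs per config in the logs above) and the fastest one is cached and reused for subsequent launches with the same key.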
from torch._inductor.async_compile import AsyncCompile
async_compile = AsyncCompile()
triton_poi_fused__to_copy_add_0 = async_compile.triton('triton_', '''
import triton
import triton.language as tl
from triton.compiler.compiler import AttrsDescriptor
plotfi / triton-build-script.sh
Created August 1, 2024 19:55
Triton Build Script
#!/bin/bash
################################################################################
echo " ______ _ __ ___ _ __ __ ____ _ __ "
echo "/_ __/___(_) /____ ___ / _ )__ __(_) /__/ / / __/_______(_)__ / /_ "
echo " / / / __/ / __/ _ \/ _ \ / _ / // / / / _ / _\ \/ __/ __/ / _ \/ __/ "
echo "/_/ /_/ /_/\__/\___/_//_/ /____/\_,_/_/_/\_,_/ /___/\__/_/ /_/ .__/\__/ "
echo " /_/ "
################################################################################