Skip to content

Instantly share code, notes, and snippets.

@rsuderman
rsuderman / ir.mlir
Last active September 3, 2025 20:20
module @module {
util.global private @__auto.token_embd.weight = #stream.parameter.named<"model"::"token_embd.weight"> : tensor<14336x4096xf16>
util.global private @__auto.blk.0.ffn_gate.weight = #stream.parameter.named<"model"::"blk.0.ffn_gate.weight"> : tensor<14336x4096xf16>
util.global private @__auto.blk.0.ffn_up.weight = #stream.parameter.named<"model"::"blk.0.ffn_up.weight"> : tensor<14336x4096xf16>
util.func public @prefill_bs4$async(%arg0: !hal.buffer_view, %arg1: !hal.fence, %arg2: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%c1 = arith.constant 1 : index
%c14336 = arith.constant 14336 : index
%c0_i64 = arith.constant 0 : i64
%cst = arith.constant 0.000000e+00 : f32
%c4 = arith.constant 4 : index
module @module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #hal.device.target<"hip", [#hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree_codegen.target_info = #iree_gpu.target<arch = "gfx1100", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<WMMAR3_F32_16x16x16_F16>, <WMMAR3_F16_16x16x16_F16>, <WMMAR3_F32_16x16x16_BF16>, <WMMAR3_BF16_16x16x16_BF16>, <WMMAR3_I32_16x16x16_I8>], subgroup_size_choices = [32, 64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 8192>>, ukernels = "none"}>]> : !hal.device
util.global private @__auto.token_embd.weight : !stream.resource<constant>
util.initializer {
%c117440512 = arith.con
@rsuderman
rsuderman / model.mlir
Created July 7, 2025 21:14
Execution Schedule Failure
#map = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d4)>
#map1 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3, d4)>
module @module {
util.global private @__auto.token_embd.weight {stream.affinity = #hal.device.promise<@__device_0>} = #stream.parameter.named<"model"::"token_embd.weight"> : tensor<256x256xf16>
util.global private @__auto.token_embd.weight$1 {stream.affinity = #hal.device.promise<@__device_1>} = #stream.parameter.named<"model"::"token_embd.weight"> : tensor<256x256xf16>
util.global private @__auto.blk.0.attn_norm.weight {stream.affinity = #hal.device.promise<@__device_0>} = #stream.parameter.named<"model"::"blk.0.attn_norm.weight"> : tensor<256xf32>
util.global private @__auto.blk.0.attn_norm.weight$1 {stream.affinity = #hal.device.promise<@__device_1>} = #stream.parameter.named<"model"::"blk.0.attn_norm.weight"> : tensor<256xf32>
util.global private @__auto.blk.0.attn_q.weight.shard.0 {stream.affinity = #hal.device.promise<@__device_0>} = #stream.parameter.named<"model"::"blk
import numpy

# Paged-cache sizing constants for the test harness.
# NOTE(review): names suggest BS = batch size, BLOCKS = blocks per
# sequence, BLOCK_SIZE = tokens per block — confirm against the model run.
BS = 8
BLOCKS = 33
BLOCK_SIZE = 32
CACHE_SIZE = 2621440

# Total page count: one block per batch entry per block index, plus slack.
PAGES = BS * BLOCKS + 10

# Zero-initialized int64 input of shape (BS, 1) — one value per batch row.
arg0 = numpy.zeros((BS, 1), dtype=numpy.int64)
module @module {
util.global private @__auto.token_embd.weight = #stream.parameter.named<"model"::"token_embd.weight"> : tensor<131072x5120xf16>
util.global private @__auto.blk.0.attn_norm.weight = #stream.parameter.named<"model"::"blk.0.attn_norm.weight"> : tensor<5120xf32>
util.global private @__auto.blk.0.attn_q.weight = #stream.parameter.named<"model"::"blk.0.attn_q.weight"> : tensor<4096x5120xf16>
util.global private @__auto.blk.0.attn_k.weight = #stream.parameter.named<"model"::"blk.0.attn_k.weight"> : tensor<1024x5120xf16>
util.global private @__auto.blk.0.attn_v.weight = #stream.parameter.named<"model"::"blk.0.attn_v.weight"> : tensor<1024x5120xf16>
util.global private @__auto.blk.0.attn_output.weight = #stream.parameter.named<"model"::"blk.0.attn_output.weight"> : tensor<5120x4096xf16>
util.global private @__auto.blk.0.ffn_norm.weight = #stream.parameter.named<"model"::"blk.0.ffn_norm.weight"> : tensor<5120xf32>
util.global private @__auto.blk.0.ffn_gate.weight = #stream.parameter.nam
module @module {
util.global private @__auto.token_embd.weight = #stream.parameter.named<"model"::"token_embd.weight"> : tensor<131072x5120xf16>
util.global private @__auto.blk.0.attn_norm.weight = #stream.parameter.named<"model"::"blk.0.attn_norm.weight"> : tensor<5120xf32>
util.global private @__auto.blk.0.attn_q.weight = #stream.parameter.named<"model"::"blk.0.attn_q.weight"> : tensor<4096x5120xf16>
util.global private @__auto.blk.0.attn_k.weight = #stream.parameter.named<"model"::"blk.0.attn_k.weight"> : tensor<1024x5120xf16>
util.global private @__auto.blk.0.attn_v.weight = #stream.parameter.named<"model"::"blk.0.attn_v.weight"> : tensor<1024x5120xf16>
util.global private @__auto.blk.0.attn_output.weight = #stream.parameter.named<"model"::"blk.0.attn_output.weight"> : tensor<5120x4096xf16>
util.global private @__auto.blk.0.ffn_norm.weight = #stream.parameter.named<"model"::"blk.0.ffn_norm.weight"> : tensor<5120xf32>
util.global private @__auto.blk.0.ffn_gate.weight = #stream.parameter.nam
module @module {
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view, %arg3: !hal.buffer_view, %arg4: !hal.fence, %arg5: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 1.250000e-01 : f32
%cst_0 = arith.constant 0xFF800000 : f32
%cst_1 = arith.constant 0.000000e+00 : f32
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
%1 = hal.tensor.import wait(%arg4) => %arg0 : !hal.buffer_view -> tensor<4x?x64xf32>{%0}
%2 = hal.tensor.import wait(%arg4) => %arg1 : !hal.buffer_view -> tensor<1024x64xf32>
module @module {
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view, %arg3: !hal.fence, %arg4: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 1.250000e-01 : f32
%cst_0 = arith.constant 0xFF800000 : f32
%cst_1 = arith.constant 0.000000e+00 : f32
%c2 = arith.constant 2 : index
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index
%1 = hal.tensor.import wait(%arg3) => %arg0 : !hal.buffer_view -> tensor<4x16x?x64xf32>{%0}
%2 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[2] : index
%3 = hal.tensor.import wait(%arg3) => %arg1 : !hal.buffer_view -> tensor<4x16x?x64xf32>{%2}
This file has been truncated, but you can view the full file.
module @module {
util.global private @__auto.token_embd.weight = #stream.parameter.named<"model"::"token_embd.weight"> : tensor<131072x5120xf16>
util.global private @__auto.blk.0.attn_norm.weight = #stream.parameter.named<"model"::"blk.0.attn_norm.weight"> : tensor<5120xf32>
util.global private @__auto.blk.0.attn_q.weight = #stream.parameter.named<"model"::"blk.0.attn_q.weight"> : tensor<4096x5120xf16>
util.global private @__auto.blk.0.attn_k.weight = #stream.parameter.named<"model"::"blk.0.attn_k.weight"> : tensor<1024x5120xf16>
util.global private @__auto.blk.0.attn_v.weight = #stream.parameter.named<"model"::"blk.0.attn_v.weight"> : tensor<1024x5120xf16>
util.global private @__auto.blk.0.attn_output.weight = #stream.parameter.named<"model"::"blk.0.attn_output.weight"> : tensor<5120x4096xf16>
util.global private @__auto.blk.0.ffn_norm.weight = #stream.parameter.named<"model"::"blk.0.ffn_norm.weight"> : tensor<5120xf32>
util.global private @__auto.blk.0.ffn_gate.weight = #stream.parameter.nam
module @module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #hal.device.target<"hip", {ordinal = 0 : index}, [#hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx1100", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<WMMAR3_F32_16x16x16_F16>, <WMMAR3_F16_16x16x16_F16>, <WMMAR3_F32_16x16x16_BF16>, <WMMAR3_BF16_16x16x16_BF16>, <WMMAR3_I32_16x16x16_I8>], subgroup_size_choices = [32, 64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 8192>>, ukernels = "none"}>]> : !hal.device
stream.executable private @main$async_dispatch_0 {
stream.executable.export public @main$async_dispatch_0