Skip to content

Instantly share code, notes, and snippets.

@rsuderman
rsuderman / ir.mlir
Last active September 3, 2025 20:20
module @module {
util.global private @__auto.token_embd.weight = #stream.parameter.named<"model"::"token_embd.weight"> : tensor<14336x4096xf16>
util.global private @__auto.blk.0.ffn_gate.weight = #stream.parameter.named<"model"::"blk.0.ffn_gate.weight"> : tensor<14336x4096xf16>
util.global private @__auto.blk.0.ffn_up.weight = #stream.parameter.named<"model"::"blk.0.ffn_up.weight"> : tensor<14336x4096xf16>
util.func public @prefill_bs4$async(%arg0: !hal.buffer_view, %arg1: !hal.fence, %arg2: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%c1 = arith.constant 1 : index
%c14336 = arith.constant 14336 : index
%c0_i64 = arith.constant 0 : i64
%cst = arith.constant 0.000000e+00 : f32
%c4 = arith.constant 4 : index
module @module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #hal.device.target<"hip", [#hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree_codegen.target_info = #iree_gpu.target<arch = "gfx1100", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<WMMAR3_F32_16x16x16_F16>, <WMMAR3_F16_16x16x16_F16>, <WMMAR3_F32_16x16x16_BF16>, <WMMAR3_BF16_16x16x16_BF16>, <WMMAR3_I32_16x16x16_I8>], subgroup_size_choices = [32, 64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 8192>>, ukernels = "none"}>]> : !hal.device
util.global private @__auto.token_embd.weight : !stream.resource<constant>
util.initializer {
%c117440512 = arith.con
@rsuderman
rsuderman / model.mlir
Created July 7, 2025 21:14
Execution Schedule Failure
#map = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d4)>
#map1 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3, d4)>
module @module {
util.global private @__auto.token_embd.weight {stream.affinity = #hal.device.promise<@__device_0>} = #stream.parameter.named<"model"::"token_embd.weight"> : tensor<256x256xf16>
util.global private @__auto.token_embd.weight$1 {stream.affinity = #hal.device.promise<@__device_1>} = #stream.parameter.named<"model"::"token_embd.weight"> : tensor<256x256xf16>
util.global private @__auto.blk.0.attn_norm.weight {stream.affinity = #hal.device.promise<@__device_0>} = #stream.parameter.named<"model"::"blk.0.attn_norm.weight"> : tensor<256xf32>
util.global private @__auto.blk.0.attn_norm.weight$1 {stream.affinity = #hal.device.promise<@__device_1>} = #stream.parameter.named<"model"::"blk.0.attn_norm.weight"> : tensor<256xf32>
util.global private @__auto.blk.0.attn_q.weight.shard.0 {stream.affinity = #hal.device.promise<@__device_0>} = #stream.parameter.named<"model"::"blk
import numpy

# Paged-cache sizing constants for the test harness.
# NOTE(review): names suggest BS = batch size, BLOCKS = blocks per
# sequence, BLOCK_SIZE = tokens per block — confirm against the model run.
BS = 8
BLOCKS = 33
BLOCK_SIZE = 32
CACHE_SIZE = 2621440

# Total page count: one block per batch entry per block index, plus slack.
PAGES = BS * BLOCKS + 10

# Zero-initialized int64 input of shape (BS, 1) — one value per batch row.
arg0 = numpy.zeros((BS, 1), dtype=numpy.int64)
module @module {
util.global private @__auto.token_embd.weight = #stream.parameter.named<"model"::"token_embd.weight"> : tensor<131072x5120xf16>
util.global private @__auto.blk.0.attn_norm.weight = #stream.parameter.named<"model"::"blk.0.attn_norm.weight"> : tensor<5120xf32>
util.global private @__auto.blk.0.attn_q.weight = #stream.parameter.named<"model"::"blk.0.attn_q.weight"> : tensor<4096x5120xf16>
util.global private @__auto.blk.0.attn_k.weight = #stream.parameter.named<"model"::"blk.0.attn_k.weight"> : tensor<1024x5120xf16>
util.global private @__auto.blk.0.attn_v.weight = #stream.parameter.named<"model"::"blk.0.attn_v.weight"> : tensor<1024x5120xf16>
util.global private @__auto.blk.0.attn_output.weight = #stream.parameter.named<"model"::"blk.0.attn_output.weight"> : tensor<5120x4096xf16>
util.global private @__auto.blk.0.ffn_norm.weight = #stream.parameter.named<"model"::"blk.0.ffn_norm.weight"> : tensor<5120xf32>
util.global private @__auto.blk.0.ffn_gate.weight = #stream.parameter.nam
module @module {
util.global private @__auto.token_embd.weight = #stream.parameter.named<"model"::"token_embd.weight"> : tensor<131072x5120xf16>
util.global private @__auto.blk.0.attn_norm.weight = #stream.parameter.named<"model"::"blk.0.attn_norm.weight"> : tensor<5120xf32>
util.global private @__auto.blk.0.attn_q.weight = #stream.parameter.named<"model"::"blk.0.attn_q.weight"> : tensor<4096x5120xf16>
util.global private @__auto.blk.0.attn_k.weight = #stream.parameter.named<"model"::"blk.0.attn_k.weight"> : tensor<1024x5120xf16>
util.global private @__auto.blk.0.attn_v.weight = #stream.parameter.named<"model"::"blk.0.attn_v.weight"> : tensor<1024x5120xf16>
util.global private @__auto.blk.0.attn_output.weight = #stream.parameter.named<"model"::"blk.0.attn_output.weight"> : tensor<5120x4096xf16>
util.global private @__auto.blk.0.ffn_norm.weight = #stream.parameter.named<"model"::"blk.0.ffn_norm.weight"> : tensor<5120xf32>
util.global private @__auto.blk.0.ffn_gate.weight = #stream.parameter.nam
module @module {
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view, %arg3: !hal.buffer_view, %arg4: !hal.fence, %arg5: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 1.250000e-01 : f32
%cst_0 = arith.constant 0xFF800000 : f32
%cst_1 = arith.constant 0.000000e+00 : f32
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
%1 = hal.tensor.import wait(%arg4) => %arg0 : !hal.buffer_view -> tensor<4x?x64xf32>{%0}
%2 = hal.tensor.import wait(%arg4) => %arg1 : !hal.buffer_view -> tensor<1024x64xf32>
module @module {
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view, %arg3: !hal.fence, %arg4: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 1.250000e-01 : f32
%cst_0 = arith.constant 0xFF800000 : f32
%cst_1 = arith.constant 0.000000e+00 : f32
%c2 = arith.constant 2 : index
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index
%1 = hal.tensor.import wait(%arg3) => %arg0 : !hal.buffer_view -> tensor<4x16x?x64xf32>{%0}
%2 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[2] : index
%3 = hal.tensor.import wait(%arg3) => %arg1 : !hal.buffer_view -> tensor<4x16x?x64xf32>{%2}
This file has been truncated, but you can view the full file.
module @module {
util.global private @__auto.token_embd.weight = #stream.parameter.named<"model"::"token_embd.weight"> : tensor<131072x5120xf16>
util.global private @__auto.blk.0.attn_norm.weight = #stream.parameter.named<"model"::"blk.0.attn_norm.weight"> : tensor<5120xf32>
util.global private @__auto.blk.0.attn_q.weight = #stream.parameter.named<"model"::"blk.0.attn_q.weight"> : tensor<4096x5120xf16>
util.global private @__auto.blk.0.attn_k.weight = #stream.parameter.named<"model"::"blk.0.attn_k.weight"> : tensor<1024x5120xf16>
util.global private @__auto.blk.0.attn_v.weight = #stream.parameter.named<"model"::"blk.0.attn_v.weight"> : tensor<1024x5120xf16>
util.global private @__auto.blk.0.attn_output.weight = #stream.parameter.named<"model"::"blk.0.attn_output.weight"> : tensor<5120x4096xf16>
util.global private @__auto.blk.0.ffn_norm.weight = #stream.parameter.named<"model"::"blk.0.ffn_norm.weight"> : tensor<5120xf32>
util.global private @__auto.blk.0.ffn_gate.weight = #stream.parameter.nam
module @module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #hal.device.target<"hip", {ordinal = 0 : index}, [#hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx1100", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<WMMAR3_F32_16x16x16_F16>, <WMMAR3_F16_16x16x16_F16>, <WMMAR3_F32_16x16x16_BF16>, <WMMAR3_BF16_16x16x16_BF16>, <WMMAR3_I32_16x16x16_I8>], subgroup_size_choices = [32, 64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 8192>>, ukernels = "none"}>]> : !hal.device
stream.executable private @main$async_dispatch_0 {
stream.executable.export public @main$async_dispatch_0