
// ONNX import (produced by torch_mlir.tools.import_onnx), viewed with:
//   torch-mlir-opt input.mlir
module {
  func.func @test_transpose_default(%arg0: !torch.vtensor<[2,3,4],f32>) -> !torch.vtensor<[4,3,2],f32> attributes {torch.onnx_meta.ir_version = 7 : si64, torch.onnx_meta.opset_version = 13 : si64} {
    %0 = torch.operator "onnx.Transpose"(%arg0) : (!torch.vtensor<[2,3,4],f32>) -> !torch.vtensor<[4,3,2],f32>
    return %0 : !torch.vtensor<[4,3,2],f32>
  }
}
// -----
rsuderman / test.py (created January 31, 2024): Useful ONNX / Torch test script
# test.py (excerpt): imports for exporting torch programs and handling ONNX models.
from pathlib import Path
import io

import onnx
import torch
from torch._export import capture_pre_autograd_graph
from torch.export import export, ExportedProgram
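The rest of test.py is not shown above. As a minimal sketch of how these imports fit together (the Transpose module, shapes, and opset below are illustrative assumptions, not the gist's code), they are enough to round-trip a toy module through ONNX in memory:

# Sketch: export a toy module to ONNX in memory, then reload it with the
# onnx package so it can be handed to the torch-mlir ONNX importer.
class Transpose(torch.nn.Module):
    def forward(self, x):
        return x.permute(2, 1, 0)

example = (torch.randn(2, 3, 4),)
buffer = io.BytesIO()
torch.onnx.export(Transpose(), example, buffer, opset_version=13)

onnx_model = onnx.load_model_from_string(buffer.getvalue())
onnx.checker.check_model(onnx_model)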
// -----
func.func @prefill_bs4$async_dispatch_1_generic_4xDx3200_i64xf32() attributes {translation_info = #iree_codegen.translation_info<CPUDoubleTilingExpert>} {
  %c32_i64 = arith.constant 32 : i64
  %c0 = arith.constant 0 : index
  %c3200 = arith.constant 3200 : index
  %c1 = arith.constant 1 : index
  %c4 = arith.constant 4 : index
  %c2 = arith.constant 2 : index
  %c64 = arith.constant 64 : index
  %cst = arith.constant 0.000000e+00 : f16
  %c0_i64 = arith.constant 0 : i64
// -----
func.func @decode_bs4$async_dispatch_2_elementwise_32000x3200_f16xf32() attributes {translation_info = #iree_codegen.translation_info<CPUDoubleTilingExpert>} {
  %c1664 = arith.constant 1664 : index
  %c0 = arith.constant 0 : index
  %c4 = arith.constant 4 : index
  %c1 = arith.constant 1 : index
  %cst = arith.constant 0.000000e+00 : f16
  %c0_i64 = arith.constant 0 : i64
  %cst_0 = arith.constant dense<[0, 1, 2, 3]> : vector<4xindex>
  %cst_1 = arith.constant dense<3200> : vector<4x1x1xindex>
  %cst_2 = arith.constant dense<0.000000e+00> : vector<1x1x4xf32>
1. Build torch-mlir with Python bindings (set up PYTHONPATH):
https://github.com/llvm/torch-mlir/blob/main/docs/development.md
2. Import the ONNX file to MLIR:
python3 -m torch_mlir.tools.import_onnx model.onnx -o /tmp/onnx.mlir
3. Use `torch-mlir-opt` to convert the ONNX operators to torch operators (see the sketch below).
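Assuming the conversion pass is registered as `convert-torch-onnx-to-torch` (check `torch-mlir-opt --help` for the exact flag name), step 3 looks like:

torch-mlir-opt --convert-torch-onnx-to-torch /tmp/onnx.mlir -o /tmp/torch.mlir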
// Truncated excerpt (the full file is much longer):
#map = affine_map<(d0, d1, d2) -> (d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
module @module {
  util.global private @__auto.constant_2048_50_torch.complex64 = dense_resource<__auto.constant_2048_50_torch.complex64> : tensor<2048x50xcomplex<f32>>
  util.global private @__auto.token_embd.weight = #stream.parameter.named<"model"::"token_embd.weight"> : tensor<32000x3200xf16>
  util.global private @__auto.blk.0.attn_norm.weight = #stream.parameter.named<"model"::"blk.0.attn_norm.weight"> : tensor<3200xf32>
  util.global private @__auto.blk.0.attn_q.weight = #stream.parameter.named<"model"::"blk.0.attn_q.weight"> : tensor<3200x3200xf16>
  util.global private @__auto.blk.0.attn_k.weight = #stream.parameter.named<"model"::"blk.0.attn_k.weight"> : tensor<3200x3200xf16>
  util.global private @__auto.blk.0.attn_v.weight = #stream.parameter.named<"model"::"blk.0.attn_v.weight"> : tensor<3200x3200xf16>
  util.global private @__auto.blk.0.attn_output.weight = #stream.parameter.named<"model"::"blk.0.attn_ou
// -----
#map0 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> ()>
func.func private @broadcast_scale_widen(
    %value : tensor<4x64x96xf16>, %scale : tensor<f32>) -> tensor<4x64x96xf32> {
  %empty_f32 = tensor.empty() : tensor<4x64x96xf32>
  %scaled = linalg.generic {indexing_maps = [#map0, #map1, #map0], iterator_types = ["parallel", "parallel", "parallel"]}
// -----
#map0 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> ()>
func.func private @broadcast_scale_widen(
    %value : tensor<4x64x96xf8E4M3FNUZ>, %scale : tensor<f32>) -> tensor<4x64x96xf32> {
  %empty_f32 = tensor.empty() : tensor<4x64x96xf32>
  %scaled = linalg.generic {indexing_maps = [#map0, #map1, #map0], iterator_types = ["parallel", "parallel", "parallel"]}
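Both variants cut off before the generic body. Judging by the name and the signature, the truncated linalg.generic presumably extends each element to f32 and applies the scalar scale; a rough torch analogue, as an illustration rather than something generated from the MLIR:

# Assumed semantics of @broadcast_scale_widen: widen to f32, multiply by
# a 0-d scale tensor that broadcasts across every element.
import torch

def broadcast_scale_widen(value: torch.Tensor, scale: torch.Tensor) -> torch.Tensor:
    return value.to(torch.float32) * scale

out = broadcast_scale_widen(torch.randn(4, 64, 96, dtype=torch.float16),
                            torch.tensor(2.0))
assert out.shape == (4, 64, 96) and out.dtype == torch.float32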
// -----
#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d2, d1)>
#map2 = affine_map<(d0, d1, d2) -> ()>
#map3 = affine_map<(d0, d1, d2) -> (d0, d1)>
#map4 = affine_map<(d0, d1, d2) -> (d0, d1, 0)>
module @module {
  ml_program.global private mutable @global_seed(dense<0> : tensor<i64>) : tensor<i64>
  func.func @main(%arg0: tensor<4x64x32xf8E4M3FNUZ>, %arg1: tensor<4x64x32xf8E4M3FNUZ>, %arg2: tensor<4x64x32xf8E4M3FNUZ>, %arg3: tensor<f32>, %arg4: tensor<f32>, %arg5: tensor<f32>, %arg6: tensor<f32>) -> tensor<4x64x32xf8E4M3FNUZ> {
    %cst = arith.constant 0.000000e+00 : f32
    %c0_i64 = arith.constant 0 : i64
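The signature (three f8 tensors plus four f32 scales, with one transposed indexing map) suggests a scaled-attention test. A speculative torch analogue, purely for orientation; the scale assignment and the attention recipe are guesses, since the function body is truncated:

# Speculative reading of @main: dequantize q/k/v with per-tensor scales,
# attend in f32, requantize the output. Not derived from the MLIR body.
import torch

def f8_attention(q, k, v, q_scale, k_scale, v_scale, out_scale):
    q32 = q.to(torch.float32) * q_scale
    k32 = k.to(torch.float32) * k_scale
    v32 = v.to(torch.float32) * v_scale
    out = torch.nn.functional.scaled_dot_product_attention(q32, k32, v32)
    return (out / out_scale).to(torch.float8_e4m3fnuz)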
rsuderman / gist:ca2dbf8d998e34c4880a51fb94fceb85 (last active June 10, 2024): Matmul per-channel quant
# Excerpt: per-channel scales for the two matmul operands (one scale per row).
import matplotlib.pyplot as plt
import torch

A_SHAPE = (8, 128)
B_SHAPE = (16, 128)

torch.manual_seed(12345)

A_QUANT = torch.rand((A_SHAPE[0], 1), dtype=torch.float)
B_QUANT = torch.rand((B_SHAPE[0], 1), dtype=torch.float)
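The gist is truncated here. A minimal sketch of how a per-channel quantized matmul could continue from these definitions (the quantization recipe below is an assumption, not the gist's code):

# Assumed continuation: treat A_QUANT / B_QUANT as per-row scales, quantize
# to int8, matmul with int32 accumulation, then dequantize and compare
# against the f32 reference.
A = torch.rand(A_SHAPE, dtype=torch.float)
B = torch.rand(B_SHAPE, dtype=torch.float)

A_int = torch.clamp((A / A_QUANT).round(), -128, 127).to(torch.int8)
B_int = torch.clamp((B / B_QUANT).round(), -128, 127).to(torch.int8)

# Rescale: result[i, j] was scaled by A_QUANT[i] * B_QUANT[j].
acc = A_int.to(torch.int32) @ B_int.to(torch.int32).T
dequant = acc.to(torch.float) * (A_QUANT @ B_QUANT.T)

reference = A @ B.T
plt.imshow((dequant - reference).abs())
plt.colorbar()
plt.show()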