Skip to content

Instantly share code, notes, and snippets.

@bjacob
Created September 14, 2023 19:01
Show Gist options
  • Save bjacob/0242746ba4b90c6c8a0314326b789f80 to your computer and use it in GitHub Desktop.
FuseDequantizationMatmul
// -----// IR Dump After GeneralizeLinalgNamedOps (iree-flow-generalize-linalg-named-ops) //----- //
// Pre-fusion form: the grouped i4 weights are dequantized to f32 by one
// linalg.generic, then a second f32 linalg.generic performs the matmul-like
// contraction. NOTE(review): this materializes the entire 11008x32x128 f32
// dequantized weight tensor (%7) as an intermediate.
func.func @quantized_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view, %arg3: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
// Quantized weights: 11008 rows, 32 groups per row, 128 i4 values per group.
%0 = hal.tensor.import %arg0 "input 0" : !hal.buffer_view -> tensor<11008x32x128xi4>
// Per-group f32 parameters. In the dequantize body below, input 1 (%1) is the
// multiplied factor (scale-like) and input 2 (%2) is the subtracted term
// (zero-point-like) — presumed naming, inferred only from how they are used.
%1 = hal.tensor.import %arg1 "input 1" : !hal.buffer_view -> tensor<11008x32x1xf32>
%2 = hal.tensor.import %arg2 "input 2" : !hal.buffer_view -> tensor<11008x32x1xf32>
// f32 activations, grouped along the trailing 32x128 dims to match the weights.
%3 = hal.tensor.import %arg3 "input 3" : !hal.buffer_view -> tensor<1x1x32x128xf32>
%4 = tensor.empty() : tensor<1x1x11008xf32>
%5 = tensor.empty() : tensor<11008x32x128xf32>
// Zero-initialized f32 accumulator for the contraction result.
%6 = linalg.fill ins(%cst : f32) outs(%4 : tensor<1x1x11008xf32>) -> tensor<1x1x11008xf32>
// Dequantize (all-parallel, element-wise): out = (uitofp(extui(w)) - %2) * %1.
// The 11008x32x1 params broadcast over the 128-wide group dimension via the
// "(d0, d1, 0)" indexing maps.
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, 0)>, affine_map<(d0, d1, d2) -> (d0, d1, 0)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%0, %1, %2 : tensor<11008x32x128xi4>, tensor<11008x32x1xf32>, tensor<11008x32x1xf32>) outs(%5 : tensor<11008x32x128xf32>) {
^bb0(%in: i4, %in_0: f32, %in_1: f32, %out: f32):
%10 = arith.extui %in : i4 to i32
%11 = arith.uitofp %10 : i32 to f32
%12 = arith.subf %11, %in_1 : f32
%13 = arith.mulf %12, %in_0 : f32
linalg.yield %13 : f32
} -> tensor<11008x32x128xf32>
// f32 contraction: multiply-accumulate reducing over d3 (group) and d4
// (element within group), yielding the 1x1x11008 output.
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d3, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d2, d3, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel", "reduction", "reduction"]} ins(%3, %7 : tensor<1x1x32x128xf32>, tensor<11008x32x128xf32>) outs(%6 : tensor<1x1x11008xf32>) {
^bb0(%in: f32, %in_0: f32, %out: f32):
%10 = arith.mulf %in, %in_0 : f32
%11 = arith.addf %10, %out : f32
linalg.yield %11 : f32
} -> tensor<1x1x11008xf32>
%9 = hal.tensor.export %8 "output 0" : tensor<1x1x11008xf32> -> !hal.buffer_view
return %9 : !hal.buffer_view
}
// -----// IR Dump After FuseDequantizationMatmul (iree-flow-fuse-dequantization-matmul) //----- //
// Post-fusion form: instead of dequantizing the weights to f32, the pass
// dynamically quantizes the f32 activations (%3) to i8 per 128-wide group,
// performs the inner contraction in integer arithmetic (i8 x i4 accumulated
// in i16), and folds all scales/offsets back in during a final f32 reduction.
func.func @quantized_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view, %arg3: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
// 127.0: divisor used to derive the per-group activation scale from the
// group's absolute maximum (i.e. map absmax to the top of the i8 range).
%cst = arith.constant 1.270000e+02 : f32
%cst_0 = arith.constant 0.000000e+00 : f32
%c0_i16 = arith.constant 0 : i16
%0 = hal.tensor.import %arg0 "input 0" : !hal.buffer_view -> tensor<11008x32x128xi4>
%1 = hal.tensor.import %arg1 "input 1" : !hal.buffer_view -> tensor<11008x32x1xf32>
%2 = hal.tensor.import %arg2 "input 2" : !hal.buffer_view -> tensor<11008x32x1xf32>
%3 = hal.tensor.import %arg3 "input 3" : !hal.buffer_view -> tensor<1x1x32x128xf32>
%4 = tensor.empty() : tensor<1x1x11008xf32>
%5 = linalg.fill ins(%cst_0 : f32) outs(%4 : tensor<1x1x11008xf32>) -> tensor<1x1x11008xf32>
%6 = tensor.empty() : tensor<1x1x32xf32>
%7 = linalg.fill ins(%cst_0 : f32) outs(%6 : tensor<1x1x32xf32>) -> tensor<1x1x32xf32>
%8 = tensor.empty() : tensor<1x1x32xf32>
%9 = tensor.empty() : tensor<1x1x32xf32>
%10 = linalg.fill ins(%cst_0 : f32) outs(%9 : tensor<1x1x32xf32>) -> tensor<1x1x32xf32>
%11 = tensor.empty() : tensor<1x1x32x128xi8>
// Dispatch 1: dynamic per-group quantization of the activations.
// Returns (quantized i8 activations, per-group sums, per-group scales).
%12:3 = flow.dispatch.region -> (tensor<1x1x32x128xi8>, tensor<1x1x32xf32>, tensor<1x1x32xf32>) {
// %17: per-group absolute maximum of the activations (reduce over d3).
// NOTE(review): arith.maxf against a 0.0-filled init — fine for absf inputs.
%17 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%3 : tensor<1x1x32x128xf32>) outs(%7 : tensor<1x1x32xf32>) {
^bb0(%in: f32, %out: f32):
%21 = math.absf %in : f32
%22 = arith.maxf %21, %out : f32
linalg.yield %22 : f32
} -> tensor<1x1x32xf32>
// %18: per-group scale = absmax / 127.0.
// NOTE(review): a group that is all zeros yields scale 0.0, making the
// divf in %20 below 0/0 — presumably the pass assumes this cannot happen
// or tolerates the result; verify.
%18 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%17 : tensor<1x1x32xf32>) outs(%8 : tensor<1x1x32xf32>) {
^bb0(%in: f32, %out: f32):
%21 = arith.divf %in, %cst : f32
linalg.yield %21 : f32
} -> tensor<1x1x32xf32>
// %19: per-group sum of the f32 activations (used in the final correction
// term for the weight zero-point-like parameter).
%19 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%3 : tensor<1x1x32x128xf32>) outs(%10 : tensor<1x1x32xf32>) {
^bb0(%in: f32, %out: f32):
%21 = arith.addf %in, %out : f32
linalg.yield %21 : f32
} -> tensor<1x1x32xf32>
// %20: quantize activations to i8: round(x / scale) via fptoui.
// NOTE(review): fptoui of a negative quotient (negative activation) is
// poison per the arith dialect — this looks like it assumes non-negative
// activations; confirm against the pass's preconditions.
%20 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3, %18 : tensor<1x1x32x128xf32>, tensor<1x1x32xf32>) outs(%11 : tensor<1x1x32x128xi8>) {
^bb0(%in: f32, %in_1: f32, %out: i8):
%21 = arith.divf %in, %in_1 : f32
%22 = arith.fptoui %21 : f32 to i8
linalg.yield %22 : i8
} -> tensor<1x1x32x128xi8>
flow.return %20, %19, %18 : tensor<1x1x32x128xi8>, tensor<1x1x32xf32>, tensor<1x1x32xf32>
}
// Per-(row, group) i16 accumulator for the integer inner product.
%13 = tensor.empty() : tensor<1x1x11008x32xi16>
%14 = linalg.fill ins(%c0_i16 : i16) outs(%13 : tensor<1x1x11008x32xi16>) -> tensor<1x1x11008x32xi16>
// Dispatch 2: integer contraction plus rescale.
%15 = flow.dispatch.region -> (tensor<1x1x11008xf32>) {
// %17: per-(row, group) integer dot product over the 128 elements (d4):
// extsi(q_act) * extui(w) accumulated in i16.
// NOTE(review): 128 products of up to 127 x 15 can exceed the i16 max
// (32767) — presumably the pass relies on a tighter value range; verify.
%17 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d3, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d2, d3, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction"]} ins(%12#0, %0 : tensor<1x1x32x128xi8>, tensor<11008x32x128xi4>) outs(%14 : tensor<1x1x11008x32xi16>) {
^bb0(%in: i8, %in_1: i4, %out: i16):
%19 = arith.extsi %in : i8 to i16
%20 = arith.extui %in_1 : i4 to i16
%21 = arith.muli %19, %20 : i16
%22 = arith.addi %21, %out : i16
linalg.yield %22 : i16
} -> tensor<1x1x11008x32xi16>
// %18: reduce over groups (d3) in f32, re-applying the scales:
//   out += dot * act_scale (%12#2) * %1  -  (%2 * %1) * group_sum (%12#1)
// i.e. the product of the dynamic activation scale with the weight
// "input 1" factor, minus the correction for the subtracted "input 2"
// term times the group's activation sum.
%18 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d2, d3, 0)>, affine_map<(d0, d1, d2, d3) -> (d2, d3, 0)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%17, %12#2, %12#1, %1, %2 : tensor<1x1x11008x32xi16>, tensor<1x1x32xf32>, tensor<1x1x32xf32>, tensor<11008x32x1xf32>, tensor<11008x32x1xf32>) outs(%5 : tensor<1x1x11008xf32>) {
^bb0(%in: i16, %in_1: f32, %in_2: f32, %in_3: f32, %in_4: f32, %out: f32):
%19 = arith.extsi %in : i16 to i32
%20 = arith.sitofp %19 : i32 to f32
%21 = arith.mulf %20, %in_1 : f32
%22 = arith.mulf %21, %in_3 : f32
%23 = arith.mulf %in_4, %in_3 : f32
%24 = arith.mulf %23, %in_2 : f32
%25 = arith.subf %22, %24 : f32
%26 = arith.addf %25, %out : f32
linalg.yield %26 : f32
} -> tensor<1x1x11008xf32>
flow.return %18 : tensor<1x1x11008xf32>
}
%16 = hal.tensor.export %15 "output 0" : tensor<1x1x11008xf32> -> !hal.buffer_view
return %16 : !hal.buffer_view
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment